# Classification Process

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


#Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import time

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [2]:
def CreateDataFrame(types,subtype, level):
    data = pd.read_csv('../Extraction/'+types+"/"+subtype+"/"+level+'.csv')
    data["Class"] = level
    return data

In [3]:
def LoadFullData(types,subtype):
    if(subtype=="All"):
        frames = [CreateDataFrame(types,"Roof","Clean"), 
              CreateDataFrame(types,"Roof","Dirty (0-20)"), 
              CreateDataFrame(types,"Roof","Dirty (20-40)"), 
              CreateDataFrame(types,"Roof","Dirty (40-60)"),
              CreateDataFrame(types,"Roof","Dirty (60-80)"),
              CreateDataFrame(types,"Roof","Dirty (80-100)"),
              CreateDataFrame(types,"Grass","Clean"), 
              CreateDataFrame(types,"Grass","Dirty (0-20)"), 
              CreateDataFrame(types,"Grass","Dirty (20-40)"), 
              CreateDataFrame(types,"Grass","Dirty (40-60)"),
              CreateDataFrame(types,"Grass","Dirty (60-80)"),
              CreateDataFrame(types,"Grass","Dirty (80-100)")]
    else:
        frames = [CreateDataFrame(types,subtype,"Clean"), 
              CreateDataFrame(types,subtype,"Dirty (0-20)"), 
              CreateDataFrame(types,subtype,"Dirty (20-40)"), 
              CreateDataFrame(types,subtype,"Dirty (40-60)"),
              CreateDataFrame(types,subtype,"Dirty (60-80)"),
              CreateDataFrame(types,subtype,"Dirty (80-100)")]
    df = pd.concat(frames)
    #Randomize
    df = shuffle(df)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    X = X['Rmean','Gmean','Bmean']
    return X,y,f

In [4]:
def LoadMinimizedData(types,subtype, count):
    df = pd.read_csv('../Selection/Selection-'+types+"-"+subtype+"-"+count+".csv")
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    X = X['Rmean','Gmean','Bmean']
    return X,y,f

In [5]:
def loader(types,subtype,datasize):
    if(datasize=="Full"):
        X,y,f = LoadFullData(types,subtype)
    else:
        count = datasize.split('-')[-1]
        X,y,f = LoadMinimizedData(types,subtype,count)
    return X,y,f

# Classifications

## Optimize KNN Classfiers

In [16]:
def optimizeKNN(X_train, y_train,cv):
    param_grid = [{'clf__weights': ["distance"], 'clf__n_neighbors': [5]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', KNeighborsClassifier(algorithm='brute', 
                               metric='mahalanobis', 
                               metric_params={'V': np.cov(X_train)}))
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    end = time.time()
    return grid_search

## Optimize SVM Classfiers

In [7]:
def optimizeSVM(X_train, y_train,cv):
    param_grid = [
            {'clf__kernel': ['poly'],'clf__degree':[1,5],'clf__coef0':[5,10]}
            #,{'clf__kernel': ['rbf'],'clf__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
            #,{'clf__kernel': ['sigmoid'],'clf__coef0':[1,5,10,15,20,30]}
            #,{'clf__kernel':['linear'],'clf__C':[1,5,10]}
            ]

    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Optimize Random Forest Classfiers

In [8]:
def optimizeRandomForest(X_train, y_train,cv):
    param_grid = [
            {'clf__n_estimators': [30],'clf__max_features':['auto']},
            ]

    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier())])
        
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Optimize NN Classifiers 

In [9]:
def optimizeNN(X_train, y_train,cv):
    param_grid = [{'clf__hidden_layer_sizes': [(100,50,30,)]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', MLPClassifier())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Cycle Work

In [10]:
def cycleWork(X_train, X_test, y_train, y_test,optimize,cv):
    gridsearch = optimize(X_train, y_train,cv)
    traintime = gridsearch.cv_results_['mean_fit_time'].mean()*1000
    print("Mean Fit Time", traintime)
    print("Mean Score Time", gridsearch.cv_results_['mean_score_time'].mean()*1000)

    #print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    params = estimator.get_params()['clf']
    print(params)
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    start = time.time()
    y_tst_pred = estimator.predict(X_test)
    end = time.time()
    diff = end-start
    clfperms = X_train.shape[0]/((diff*1000)+0.000000001)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1,traintime,clfperms,cm,params,estimator]

In [11]:
def saveresultlist(types,subtype,datasize,clf,iteration,split,classifier,X,y,files):
    data = []
    #print(files.shape[0])
    for i in range(0,files.shape[0]):
        #print(files.iloc[[i]])
        #print(X.iloc[[i]])
        pred = classifier.predict(X.iloc[[i]])
        #print(pred)
        data.append([files.iloc[[i]].values[0][0],y.iloc[[i]].values[0][0],pred[0]])
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["File","Y","Pred"])
    fileName = "Real-"+types+'-'+ subtype+"-"+ datasize+"-"+ str(clf)+"-"+ str(iteration)+"-"+ str(split)+'.csv'
    print(type(fileName))
    df.to_csv(fileName,index = False)

In [12]:
def AllWork(types,subtype,sizeDict, cycles = 5, out=False,cross_validation=5):    
    data = []
    for datasize in sizeDict:
        print(datasize)
        classifiersDict = {
                        "KNN":optimizeKNN,
                        #"SVM":optimizeSVM,
                        #"RF":optimizeRandomForest,
                        #"NN":optimizeNN
                          }
        for clf in classifiersDict:
            for i in range(0,cycles):
                X,y,f = loader(types,subtype,datasize)
                #X,y = LoadMinimizedData(types)
                skf = StratifiedKFold(n_splits=cross_validation,shuffle = True)
                j = 0
                for train_index, test_index in skf.split(X, y.values.ravel()):
                    print("Size ",datasize, " Algorithm ",clf, " Trial ",i, " Split ", j," :")
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                    acc = cycleWork(X_train, X_test, y_train, y_test,classifiersDict[clf],cross_validation)
                    #saveresultlist(types,subtype,datasize,clf,i,j,acc[7],X,y,f)
                    print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
                    data.append([clf,datasize.split('-')[-1],i,j,acc[0],acc[1],acc[2],acc[3],acc[4],acc[5],acc[6]])
                    j = j+1
                    #break
                    print("########################################")
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["Classifier","Count","Cycle","Split","Train Acc", "Test Acc", "F1 Score","Train Time (ms)","clfs per ms","Confusion Matrix","Params"])
    if(out):
        df.to_csv("All-"+types+'-'+subtype+'.csv',index = False)
    df.head(10)

In [18]:
types = "NoBG"
cycles = 1
subtype = "All"
sizeDict = {
                "Full":None,
                #"Min-10":10,
                #"Min-20":20,
                #"Min-30":30,
                #"Min-40":40
                }
AllWork(types,subtype,sizeDict,cycles = cycles, out=True,cross_validation=5)

Full
Size  Full  Algorithm  KNN  Trial  0  Split  0  :
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.2min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished


Mean Fit Time 37.97807693481445
Mean Score Time 15476.899194717407
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[379959.94429, 396156.62153, ..., 466322.49659, 302370.11606],
       [396156.62153, 418836.26653, ..., 485060.6558 , 318227.84359],
       ...,
       [466322.49659, 485060.6558 , ..., 573700.86023, 371096.31053],
       [302370.11606, 318227.84359, ..., 371096.31053, 243154.91495]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.34864864864864864 F1  2.0426253379310912
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  1  :
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.9min finished


Mean Fit Time 53.75947952270508
Mean Score Time 13321.172142028809
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[379959.94429, 323123.18919, ..., 466322.49659, 302370.11606],
       [323123.18919, 283146.8827 , ..., 398778.78974, 254961.24733],
       ...,
       [466322.49659, 398778.78974, ..., 573700.86023, 371096.31053],
       [302370.11606, 254961.24733, ..., 371096.31053, 243154.91495]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')


KeyboardInterrupt: 