# Classification Process

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [17]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


#Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import time

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [18]:
def CreateDataFrame(types,subtype, level):
    data = pd.read_csv('../Extraction/'+types+"/"+subtype+"/"+level+'.csv')
    data["Class"] = level
    return data

In [19]:
def LoadFullData(types,subtype):
    if(subtype=="All"):
        frames = [CreateDataFrame(types,"Roof","Clean"), 
              CreateDataFrame(types,"Roof","Dirty (0-20)"), 
              CreateDataFrame(types,"Roof","Dirty (20-40)"), 
              CreateDataFrame(types,"Roof","Dirty (40-60)"),
              CreateDataFrame(types,"Roof","Dirty (60-80)"),
              CreateDataFrame(types,"Roof","Dirty (80-100)"),
              CreateDataFrame(types,"Grass","Clean"), 
              CreateDataFrame(types,"Grass","Dirty (0-20)"), 
              CreateDataFrame(types,"Grass","Dirty (20-40)"), 
              CreateDataFrame(types,"Grass","Dirty (40-60)"),
              CreateDataFrame(types,"Grass","Dirty (60-80)"),
              CreateDataFrame(types,"Grass","Dirty (80-100)")]
    else:
        frames = [CreateDataFrame(types,subtype,"Clean"), 
              CreateDataFrame(types,subtype,"Dirty (0-20)"), 
              CreateDataFrame(types,subtype,"Dirty (20-40)"), 
              CreateDataFrame(types,subtype,"Dirty (40-60)"),
              CreateDataFrame(types,subtype,"Dirty (60-80)"),
              CreateDataFrame(types,subtype,"Dirty (80-100)")]
    df = pd.concat(frames)
    #Randomize
    df = shuffle(df)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    X = X[['Rmean','Gmean','Bmean']]
    return X,y,f

In [20]:
def LoadMinimizedData(types,subtype, count):
    df = pd.read_csv('../Selection/Selection-'+types+"-"+subtype+"-"+count+".csv")
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    X = X[['Rmean','Gmean','Bmean']]
    return X,y,f

In [21]:
def loader(types,subtype,datasize):
    if(datasize=="Full"):
        X,y,f = LoadFullData(types,subtype)
    else:
        count = datasize.split('-')[-1]
        X,y,f = LoadMinimizedData(types,subtype,count)
    return X,y,f

# Classifications

## Optimize KNN Classfiers

In [22]:
def optimizeKNN(X_train, y_train,cv):
    param_grid = [{'clf__weights': ["distance"]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', KNeighborsClassifier(algorithm='brute', 
                               metric='mahalanobis', 
                               metric_params={'V': np.cov(X_train)}))
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    end = time.time()
    return grid_search

## Optimize SVM Classfiers

In [23]:
def optimizeSVM(X_train, y_train,cv):
    param_grid = [
            {'clf__kernel': ['poly'],'clf__degree':[1,5],'clf__coef0':[5,10]}
            #,{'clf__kernel': ['rbf'],'clf__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
            #,{'clf__kernel': ['sigmoid'],'clf__coef0':[1,5,10,15,20,30]}
            #,{'clf__kernel':['linear'],'clf__C':[1,5,10]}
            ]

    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', SVC())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Optimize Random Forest Classfiers

In [24]:
def optimizeRandomForest(X_train, y_train,cv):
    param_grid = [
            {'clf__n_estimators': [30],'clf__max_features':['auto']},
            ]

    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier())])
        
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Optimize NN Classifiers 

In [25]:
def optimizeNN(X_train, y_train,cv):
    param_grid = [{'clf__hidden_layer_sizes': [(100,50,30,)]}]
    clfpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('clf', MLPClassifier())
    ])
    grid_search = GridSearchCV(clfpipeline, param_grid, cv=cv, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Cycle Work

In [26]:
def cycleWork(X_train, X_test, y_train, y_test,optimize,cv):
    gridsearch = optimize(X_train, y_train,cv)
    traintime = gridsearch.cv_results_['mean_fit_time'].mean()*1000
    print("Mean Fit Time", traintime)
    print("Mean Score Time", gridsearch.cv_results_['mean_score_time'].mean()*1000)

    #print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    params = estimator.get_params()['clf']
    print(params)
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    start = time.time()
    y_tst_pred = estimator.predict(X_test)
    end = time.time()
    diff = end-start
    clfperms = X_train.shape[0]/((diff*1000)+0.000000001)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1,traintime,clfperms,cm,params,estimator]

In [27]:
def saveresultlist(types,subtype,datasize,clf,iteration,split,classifier,X,y,files):
    data = []
    #print(files.shape[0])
    for i in range(0,files.shape[0]):
        #print(files.iloc[[i]])
        #print(X.iloc[[i]])
        pred = classifier.predict(X.iloc[[i]])
        #print(pred)
        data.append([files.iloc[[i]].values[0][0],y.iloc[[i]].values[0][0],pred[0]])
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["File","Y","Pred"])
    fileName = "Real-"+types+'-'+ subtype+"-"+ datasize+"-"+ str(clf)+"-"+ str(iteration)+"-"+ str(split)+'.csv'
    print(type(fileName))
    df.to_csv(fileName,index = False)

In [28]:
def AllWork(types,subtype,sizeDict, cycles = 5, out=False,cross_validation=5):    
    data = []
    for datasize in sizeDict:
        print(datasize)
        classifiersDict = {
                        "KNN":optimizeKNN,
                        #"SVM":optimizeSVM,
                        #"RF":optimizeRandomForest,
                        #"NN":optimizeNN
                          }
        for clf in classifiersDict:
            for i in range(0,cycles):
                X,y,f = loader(types,subtype,datasize)
                #X,y = LoadMinimizedData(types)
                skf = StratifiedKFold(n_splits=cross_validation,shuffle = True)
                j = 0
                for train_index, test_index in skf.split(X, y.values.ravel()):
                    print("Size ",datasize, " Algorithm ",clf, " Trial ",i, " Split ", j," :")
                    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                    acc = cycleWork(X_train, X_test, y_train, y_test,classifiersDict[clf],cross_validation)
                    #saveresultlist(types,subtype,datasize,clf,i,j,acc[7],X,y,f)
                    print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
                    data.append([clf,datasize.split('-')[-1],i,j,acc[0],acc[1],acc[2],acc[3],acc[4],acc[5],acc[6]])
                    j = j+1
                    #break
                    print("########################################")
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["Classifier","Count","Cycle","Split","Train Acc", "Test Acc", "F1 Score","Train Time (ms)","clfs per ms","Confusion Matrix","Params"])
    if(out):
        df.to_csv("All-"+types+'-'+subtype+'.csv',index = False)
    df.head(10)

In [29]:
types = "Solar Data"
cycles = 1
subtype = "All"
sizeDict = {
                "Full":None,
                #"Min-10":10,
                #"Min-20":20,
                #"Min-30":30,
                #"Min-40":40
                }
AllWork(types,subtype,sizeDict,cycles = cycles, out=True,cross_validation=10)

Full
Size  Full  Algorithm  KNN  Trial  0  Split  0  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   24.2s remaining:   10.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.1s finished


Mean Fit Time 23.435473442077637
Mean Score Time 260.8187198638916
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[13.47928,  3.3869 , ..., 26.40426,  3.52751],
       [ 3.3869 ,  1.15006, ...,  6.97027,  1.20066],
       ...,
       [26.40426,  6.97027, ..., 52.09967,  7.26284],
       [ 3.52751,  1.20066, ...,  7.26284,  1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.7935368043087971 F1  4.765979864682178
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  1  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   12.7s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   14.6s finished


Mean Fit Time 29.685544967651367
Mean Score Time 251.53770446777347
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8096947935368043 F1  4.840003012520953
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  2  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.2s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.0s finished


Mean Fit Time 28.122878074645996
Mean Score Time 273.4466791152954
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8061041292639138 F1  4.821743251461538
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  3  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.3s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.4s finished


Mean Fit Time 31.248021125793457
Mean Score Time 248.42040538787842
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 1.15006,  3.20444, ..., -2.26689,  1.20066],
       [ 3.20444, 11.44127, ..., -9.7463 ,  3.33831],
       ...,
       [-2.26689, -9.7463 , ...,  9.15054, -2.35692],
       [ 1.20066,  3.33831, ..., -2.35692,  1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8525179856115108 F1  5.097088400807916
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  4  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.2s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.2s finished


Mean Fit Time 23.91829490661621
Mean Score Time 254.20222282409665
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8432432432432433 F1  5.077178763835903
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  5  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   14.1s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   16.0s finished


Mean Fit Time 28.37855815887451
Mean Score Time 325.1882553100586
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,  12.40461, ..., -10.72699,   3.52751],
       [ 12.40461,  11.44127, ...,  -9.7463 ,   3.33831],
       ...,
       [-10.72699,  -9.7463 , ...,   9.15054,  -2.35692],
       [  3.52751,   3.33831, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8357400722021661 F1  4.978526660731116
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  6  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.8s remaining:    5.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.8s finished


Mean Fit Time 24.996328353881836
Mean Score Time 276.27506256103516
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8393501805054152 F1  5.018115718304611
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  7  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   12.9s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.1s finished


Mean Fit Time 20.30951976776123
Mean Score Time 270.5692529678345
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.7992766726943942 F1  4.783289499046911
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  8  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   14.8s remaining:    6.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   16.8s finished


Mean Fit Time 28.123188018798828
Mean Score Time 286.6070508956909
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ..., -10.72699,   3.52751],
       [  3.3869 ,   1.15006, ...,  -2.26689,   1.20066],
       ...,
       [-10.72699,  -2.26689, ...,   9.15054,  -2.35692],
       [  3.52751,   1.20066, ...,  -2.35692,   1.2535 ]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8297101449275363 F1  4.9672420000803115
########################################
Size  Full  Algorithm  KNN  Trial  0  Split  9  :
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.1s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.0s finished


Mean Fit Time 23.43442440032959
Mean Score Time 253.1073331832886
KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 13.47928,   3.3869 , ...,  26.40426, -10.72699],
       [  3.3869 ,   1.15006, ...,   6.97027,  -2.26689],
       ...,
       [ 26.40426,   6.97027, ...,  52.09967, -20.53184],
       [-10.72699,  -2.26689, ..., -20.53184,   9.15054]])},
           n_jobs=1, n_neighbors=5, p=2, weights='distance')
Train Accuracy  1.0 , Test Accuracy 0.8257713248638838 F1  4.946224211876582
########################################
