# Classification Process

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [96]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances

#Classifiers
from sklearn.neighbors import NearestCentroid

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import time

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [110]:
def CreateTrain():
    df = pd.read_csv('Yap.csv')
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    X  = X[['Amean','Rmean','Gmean','Bmean']]
    return X,y

In [111]:
def CreateDataFrame(types,subtype, level):
    data = pd.read_csv('../Extraction/'+types+"/"+subtype+"/"+level+'.csv')
    data["Class"] = level
    return data

In [112]:
def LoadFullData(types,subtype):
    if(subtype=="All"):
        frames = [CreateDataFrame(types,"Roof","Clean"), 
              CreateDataFrame(types,"Roof","Dirty (0-20)"), 
              CreateDataFrame(types,"Roof","Dirty (20-40)"), 
              CreateDataFrame(types,"Roof","Dirty (40-60)"),
              CreateDataFrame(types,"Roof","Dirty (60-80)"),
              CreateDataFrame(types,"Roof","Dirty (80-100)"),
              CreateDataFrame(types,"Grass","Clean"), 
              CreateDataFrame(types,"Grass","Dirty (0-20)"), 
              CreateDataFrame(types,"Grass","Dirty (20-40)"), 
              CreateDataFrame(types,"Grass","Dirty (40-60)"),
              CreateDataFrame(types,"Grass","Dirty (60-80)"),
              CreateDataFrame(types,"Grass","Dirty (80-100)")]
    else:
        frames = [CreateDataFrame(types,subtype,"Clean"), 
              CreateDataFrame(types,subtype,"Dirty (0-20)"), 
              CreateDataFrame(types,subtype,"Dirty (20-40)"), 
              CreateDataFrame(types,subtype,"Dirty (40-60)"),
              CreateDataFrame(types,subtype,"Dirty (60-80)"),
              CreateDataFrame(types,subtype,"Dirty (80-100)")]
    df = pd.concat(frames)
    #Randomize
    df = shuffle(df)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    X  = X[['Amean','Rmean','Gmean','Bmean']]
    return X,y,f

In [100]:
def LoadMinimizedData(types,subtype, count):
    df = pd.read_csv('../Selection/Selection-'+types+"-"+subtype+"-"+count+".csv")
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    f = df[['FileAddress']].copy()
    del X['FileAddress']
    return X,y,f

In [101]:
def loader(types,subtype,datasize):
    if(datasize=="Full"):
        X,y,f = LoadFullData(types,subtype)
    else:
        count = datasize.split('-')[-1]
        X,y,f = LoadMinimizedData(types,subtype,count)
    return X,y,f

# Classifications

## Optimize NC Classfiers

In [102]:
def optimizeNC(X_train, X_test, y_train, y_test):
    y_test_pred = []
    for i in range(0,X_test.shape[0]) :
        dmin = euclidean_distances(X_test.iloc[i].reshape(1,-1), X_train.iloc[0].reshape(1,-1))
        cmin = y_train.iloc[0].values[0]
        for j in range (1,6):
            d = euclidean_distances(X_test.iloc[i].reshape(1,-1), X_train.iloc[j].reshape(1,-1))
            #print(d)
            if d<dmin :
                dmin = d
                cmin = y_train.iloc[j].values[0]
        #print(cmin)
        #print(y_test.iloc[i].values[0])
        y_test_pred.append(cmin)
    return y_test_pred

## Cycle Work

In [103]:
def cycleWork(X_train, X_test, y_train, y_test,optimize,cv):
    traintime = 0
    print("Mean Fit Time", traintime)
    print("Mean Score Time", 0)

    #print(gridsearch.best_params_)
    estimator = None
    params = None
    print(params)
    #y_train_pred = estimator.predict(X_train)
    trnscore = 0
    start = time.time()
    y_tst_pred = optimize(X_train, X_test, y_train, y_test)
    end = time.time()
    #print(y_tst_pred)
    diff = end-start
    clfperms = X_train.shape[0]/((diff*1000)+0.000000001)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1,traintime,clfperms,cm,params,estimator]

In [104]:
def AllWork(types,subtype,sizeDict, out=False,cross_validation=5):    
    data = []
    for datasize in sizeDict:
        print(datasize)
        classifiersDict = {
                        "NC":optimizeNC,
                          }
        for clf in classifiersDict:
            X_test,y_test,f = loader(types,subtype,datasize)
            #X,y = LoadMinimizedData(types)
            X_train,y_train = CreateTrain()
            acc = cycleWork(X_train, X_test, y_train, y_test,classifiersDict[clf],cross_validation)
            print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
            data.append([clf,datasize.split('-')[-1],0,0,acc[0],acc[1],acc[2],acc[3],acc[4],acc[5],acc[6]])
            print("########################################")
    numpyarray = np.array(data)
    df = pd.DataFrame(numpyarray, columns = ["Classifier","Count","Cycle","Split","Train Acc", "Test Acc", "F1 Score","Train Time (ms)","clfs per ms","Confusion Matrix","Params"])
    if(out):
        df.to_csv("All-"+types+'-'+subtype+'.csv',index = False)
    df.head(10)

In [113]:
types = "NoBG"
cycles = 1
subtype = "Roof"
sizeDict = {
                "Full":None,
                #"Min-10":10,
                #"Min-20":20,
                #"Min-30":30,
                #"Min-40":40
                }
AllWork(types,subtype,sizeDict, out=True,cross_validation=5)

Full
Mean Fit Time 0

  after removing the cwd from sys.path.
  import sys



Mean Score Time 0
None
Train Accuracy  0 , Test Accuracy 0.14439655172413793 F1  nan
########################################


