# Prediction Using Random Forest

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

## Plot Includes

In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [3]:
def CreateDataFrame(types, level):
    data = pd.read_csv('../Extraction/'+types+"/"+level+'.csv')
    data["Class"] = level
    return data

In [4]:
def LoadData(types):
    frames = [CreateDataFrame(types,"Clean"), 
          CreateDataFrame(types,"Dirty (0-20)"), 
          CreateDataFrame(types,"Dirty (20-40)"), 
          CreateDataFrame(types,"Dirty (40-60)"),
          CreateDataFrame(types,"Dirty (60-80)"),
          CreateDataFrame(types,"Dirty (80-100)")]
    df = pd.concat(frames)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    return X,y

# Classification using Random Forest

## Optimize Classfiers

In [56]:
def optimize(X_train, y_train):
    param_grid = [
            {'n_estimators': [25,30],'max_features':['auto']},
            ]

    clf = RandomForestClassifier()
    grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Cycle Work

In [57]:
def cycleWork(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    gridsearch = optimize(X_train, y_train)
    print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    
    for i in range (0,len(estimator.feature_importances_)):
        if(estimator.feature_importances_[i]>0.04):
            print(X.columns[i]," - ",estimator.feature_importances_[i])
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    
    y_tst_pred = estimator.predict(X_test)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1]

In [58]:
def AllWork(types,cycles = 5):
    data = []
    X,y = LoadData(types)
    for i in range(0,cycles):
        print(i)
        acc = cycleWork(X,y)
        print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
        data.append([acc[0],acc[1],acc[2]])
        numpyarray = np.array(data)
        df = pd.DataFrame(numpyarray, columns = ["Train Acc", "Test Acc", "F1 Score"])
        df.to_csv("RandomForest-"+types+'.csv',index = False)

In [59]:
types = "Solar Data"
AllWork(types,5)
types = "NoBG"
AllWork(types,5)

0
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    9.9s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.4s finished


{'max_features': 'auto', 'n_estimators': 30}
GSkew  -  0.0465360007992
BKurt  -  0.0477431320994
GrayKurt  -  0.0407394093326
BlackRatio  -  0.0560891961743
Train Accuracy  1.0 , Test Accuracy 0.967684021544 F1  5.83675044243
1
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   11.8s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.3s finished


{'max_features': 'auto', 'n_estimators': 30}
BKurt  -  0.0433948429789
GraySkew  -  0.0448259545918
BlackRatio  -  0.0443202470931
WhiteRatio  -  0.0542544711641
Train Accuracy  1.0 , Test Accuracy 0.96947935368 F1  5.83316222596
2
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.3s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   13.6s finished


{'max_features': 'auto', 'n_estimators': 25}
WhiteRatio  -  0.0681996469643
Correlation0  -  0.044833339645
Train Accuracy  1.0 , Test Accuracy 0.976660682226 F1  5.87076173726
3
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    7.1s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.6s finished


{'max_features': 'auto', 'n_estimators': 30}
RSkew  -  0.0411137700208
GraySkew  -  0.0429067309867
BlackRatio  -  0.0451562524266
WhiteRatio  -  0.0475998297401
Train Accuracy  1.0 , Test Accuracy 0.978456014363 F1  5.89227274244
4
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   11.5s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.4s finished


{'max_features': 'auto', 'n_estimators': 25}
AKurt  -  0.052201942106
RSkew  -  0.040726269231
BKurt  -  0.0490185825742
BlackRatio  -  0.0445194463735
WhiteRatio  -  0.0455566418415
Train Accuracy  1.0 , Test Accuracy 0.958707360862 F1  5.79047186587
0
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    8.9s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished


{'max_features': 'auto', 'n_estimators': 30}
ASkew  -  0.0539699604957
RSkew  -  0.0667324776317
RKurt  -  0.0526801289078
GSkew  -  0.0803614921376
GKurt  -  0.0463647118235
GraySkew  -  0.059823140936
GrayKurt  -  0.0440304970058
Correlation45  -  0.0476946300423
Train Accuracy  1.0 , Test Accuracy 0.964093357271 F1  5.80567829737
1
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    6.1s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.4s finished


{'max_features': 'auto', 'n_estimators': 25}
RSkew  -  0.13053367637
RKurt  -  0.0417944459587
GSkew  -  0.0660775996046
GKurt  -  0.0532218401228
GraySkew  -  0.0566256281138
Correlation135  -  0.0519910114945
Train Accuracy  1.0 , Test Accuracy 0.967684021544 F1  5.83917607372
2
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    6.3s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.7s finished


{'max_features': 'auto', 'n_estimators': 30}
ASkew  -  0.0406448287751
RSkew  -  0.0928266141029
RKurt  -  0.0594550852969
GSkew  -  0.0805515958995
GKurt  -  0.0426268109068
GraySkew  -  0.0669485194555
GrayKurt  -  0.0414598934088
Correlation90  -  0.0477346947764
Train Accuracy  1.0 , Test Accuracy 0.94973070018 F1  5.68161743825
3
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    6.4s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.9s finished


{'max_features': 'auto', 'n_estimators': 25}
ASkew  -  0.0613536849665
RSkew  -  0.0843634209965
RKurt  -  0.0742612397423
GSkew  -  0.0688713091508
GraySkew  -  0.0615914463952
Train Accuracy  1.0 , Test Accuracy 0.964093357271 F1  5.79260552718
4
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    9.7s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.2s finished


{'max_features': 'auto', 'n_estimators': 30}
RSkew  -  0.0988841489768
RKurt  -  0.0470790441089
GSkew  -  0.0894301085569
GKurt  -  0.0510089064923
GraySkew  -  0.0625525270934
GrayKurt  -  0.0405497640211
Correlation45  -  0.0427414047513
Correlation90  -  0.0400236595002
Train Accuracy  1.0 , Test Accuracy 0.960502692998 F1  5.7825221356
