# Prediction Using SVM

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Importing the Necessary Libraries<br></p>

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import time

## Plot Includes

In [6]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

<p style="font-family: Arial; font-size:1.75em;color:purple; font-style:bold"><br>

Creating a Pandas DataFrame from a CSV file<br></p>

In [7]:
def CreateDataFrame(types, level):
    data = pd.read_csv('../Extraction/'+types+"/"+level+'.csv')
    data["Class"] = level
    return data

In [8]:
def LoadData(types):
    frames = [CreateDataFrame(types,"Clean"), 
          CreateDataFrame(types,"Dirty (0-20)"), 
          CreateDataFrame(types,"Dirty (20-40)"), 
          CreateDataFrame(types,"Dirty (40-60)"),
          CreateDataFrame(types,"Dirty (60-80)"),
          CreateDataFrame(types,"Dirty (80-100)")]
    df = pd.concat(frames)
    y=df[['Class']].copy()
    X = df.copy()
    del X['Class']
    return X,y

# Classification using Support Vector Machine

## Optimize Classfiers

In [9]:
def optimize(X_train, y_train):
    param_grid = [
            {'svc__kernel': ['poly'],'svc__degree':[1,2,3,4,5],'svc__coef0':[1,5,10,15,20,30]},
            {'svc__kernel': ['rbf'],'svc__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]}
            #,{'svc__kernel': ['sigmoid'],'svc__coef0':[1,5,10,15,20,30]}
            ]

    svcpipeline = Pipeline([
        ('scale', StandardScaler()),
        ('svc', SVC())
    ])
    grid_search = GridSearchCV(svcpipeline, param_grid, cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train.values.ravel())
    return grid_search

## Cycle Work

In [10]:
def cycleWork(X_train, X_test, y_train, y_test):
    gridsearch = optimize(X_train, y_train)
    print("Mean Fit Time", gridsearch.cv_results_['mean_fit_time'].mean()*1000)
    print("Mean Score Time", gridsearch.cv_results_['mean_score_time'].mean()*1000)

    print(gridsearch.best_params_)
    estimator = gridsearch.best_estimator_
    
    y_train_pred = estimator.predict(X_train)
    trnscore = accuracy_score(y_train, y_train_pred)
    start = time.time()
    y_tst_pred = estimator.predict(X_test)
    end = time.time()
    diff = end-start
    print(diff*1000)
    tstscore = accuracy_score(y_test, y_tst_pred)
    cm = confusion_matrix(y_test, y_tst_pred)
    true_pos = np.diag(cm) 
    precision = np.sum(true_pos / np.sum(cm, axis=0))
    recall = np.sum(true_pos / np.sum(cm, axis=1))
    f1 = 2 * ((precision* recall)/(precision + recall))
    #f1None = f1_score(y_test, y_tst_pred, average=None)
    #f1micro = f1_score(y_test, y_tst_pred, average='micro')
    return [trnscore, tstscore, f1]

In [11]:
def AllWork(types,cycles = 5):
    print(types)
    data = []
    X,y = LoadData(types)
    #X,y = LoadMinimizedData(types)
    skf = StratifiedKFold(n_splits=5,shuffle = True)
    for train_index, test_index in skf.split(X, y.values.ravel()):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        acc = cycleWork(X_train, X_test, y_train, y_test)
        print("Train Accuracy ",acc[0],", Test Accuracy",acc[1], "F1 ",acc[2])
        data.append([acc[0],acc[1],acc[2]])
        numpyarray = np.array(data)
        df = pd.DataFrame(numpyarray, columns = ["Train Acc", "Test Acc", "F1 Score"])
        df.to_csv("KNN-"+types+'.csv',index = False)
        df.head(10)

In [12]:
types = "Solar Data"
AllWork(types,5)
#types = "NoBG"
#AllWork(types,5)

Solar Data
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   15.8s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   37.2s finished


Mean Fit Time 338.962893259
Mean Score Time 48.6057951337
{'svc__coef0': 15, 'svc__degree': 2, 'svc__kernel': 'poly'}
31.979799270629883
Train Accuracy  0.960431654676 , Test Accuracy 0.957142857143 F1  5.76001559681
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   40.2s finished


Mean Fit Time 369.878742808
Mean Score Time 52.9223476137
{'svc__coef0': 30, 'svc__degree': 2, 'svc__kernel': 'poly'}
35.97903251647949
Train Accuracy  0.96585804133 , Test Accuracy 0.964157706093 F1  5.79930375409
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   40.6s finished


Mean Fit Time 371.696681068
Mean Score Time 51.2898354303
{'svc__coef0': 1, 'svc__degree': 5, 'svc__kernel': 'poly'}
25.983810424804688
Train Accuracy  0.995507637017 , Test Accuracy 0.978494623656 F1  5.89762747037
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   40.7s finished


Mean Fit Time 370.240362485
Mean Score Time 52.955591111
{'svc__coef0': 5, 'svc__degree': 5, 'svc__kernel': 'poly'}
17.9901123046875
Train Accuracy  1.0 , Test Accuracy 1.0 F1  6.0
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   46.0s finished


Mean Fit Time 432.842599778
Mean Score Time 59.2852501642
{'svc__coef0': 20, 'svc__degree': 2, 'svc__kernel': 'poly'}
26.983261108398438
Train Accuracy  0.964573991031 , Test Accuracy 0.945848375451 F1  5.69968712967
