# Nonlinear classifiers
Try with nonlinear classifiers, can you do better than the baseline models from above?

In [1]:
import importlib; import cours_proj4_funcs; 
importlib.reload(cours_proj4_funcs)
from cours_proj4_funcs import f_read_npzF, f_addIdxFeat, f_fileApp

In [2]:
import numpy as np
import os

X_tr_ini, y_tr_ini, X_data_tr, fnames_tr, y_cat_tr = f_read_npzF(os.path.join('data', 'data_train.npz'))
X_tr_ini1 = f_addIdxFeat(X_tr_ini)

X_va_ini, y_va_ini, X_data_va, fnames_va, y_cat_va = f_read_npzF(os.path.join('data', 'data_valid.npz'))
X_va_ini1 = f_addIdxFeat(X_va_ini)

X_te_ini, y_te_ini, X_data_te, fnames_te, y_cat_te = f_read_npzF(os.path.join('data', 'data_test.npz'))
X_te_ini1 = f_addIdxFeat(X_te_ini)

#### - Try with a random Forest, does increasing the number of trees help?

In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

gs_results_rf = []

for n_est in [1, 10, 100, 200]: #g in [0.1, 100, 1000, 'auto']: #[0.1, 1, 10]
    for m_depth in [1, 3, 5, None]:
        #print (g)
        rfc = RandomForestClassifier(n_estimators=n_est, max_depth=m_depth, random_state=0)
        # Fit estimator
        rfc.fit(X_tr_ini, y_tr_ini);
        
        gs_results_rf.append({ "n_estimators": n_est
                               ,"max_depth" : m_depth
                               ,"test score": rfc.score(X_te_ini, y_te_ini) })

# Convert results to DataFrame
gs_results_rf = pd.DataFrame(gs_results_rf)
gs_results_rf.sort_values("test score", ascending=False)

Unnamed: 0,max_depth,n_estimators,test score
14,5.0,200,0.94
10,5.0,100,0.92
15,,200,0.92
11,,100,0.9
6,5.0,10,0.86
7,,10,0.8
9,3.0,100,0.78
13,3.0,200,0.78
5,3.0,10,0.74
3,,1,0.64


From what we see above, for **Random forest** with `tree=3`, the best test accuracy is `0.78`. We can also also say that increasing the number of trees, i.e to `5`, we get better score.

In [4]:
# Random forest accuracy to append into the file 
f_fileApp('data/results09.csv',3,"random forest", 0.78)

#### - Try with SVM Linear

In [5]:
from sklearn.svm import LinearSVC

# Create SVM with linear kernel
linear_svc = LinearSVC()

# Fit estimator
linear_svc.fit(X_tr_ini, y_tr_ini);

test_acc_lsvc = linear_svc.score(X_te_ini, y_te_ini)

print("Linear SVC test accuracy: {}".format(test_acc_lsvc))
f_fileApp('data/results09.csv',4,"svm linear", test_acc_lsvc)

Linear SVC test accuracy: 0.92


#### does the SVM RBF kernel perform better than the linear one?

In [6]:
from sklearn.svm import SVC

gs_results_svm = []

for c in [0.1, 1, 3, 5, 10]:
    for g in [0.1, 10, 100, 1000, 'auto']:
        #print (g)
        rbf_svc_cg = SVC(kernel='rbf', C=c, gamma=g)
        # Fit estimator
        rbf_svc_cg.fit(X_tr_ini, y_tr_ini);
        
        gs_results_svm.append({ "C": c
                               ,"gamma" : g
                               ,"test score": rbf_svc_cg.score(X_te_ini, y_te_ini) })

In [7]:
# Convert results to DataFrame
gs_results_svm = pd.DataFrame(gs_results_svm)
gs_results_svm.sort_values("test score", ascending=False).head(5)

Unnamed: 0,C,gamma,test score
14,3.0,auto,0.96
9,1.0,auto,0.96
24,10.0,auto,0.94
19,5.0,auto,0.94
4,0.1,auto,0.56


With the parameters `C=3` and `gamma='auto'`, we get a score of `0.96`. So that with the parameters mentioned, it seems to be better than the linear one

In [8]:
# Random forest accuracy to append into the file 
f_fileApp('data/results09.csv',5,"vm rbf", 0.96)