# Smart Planning : appreciation model (v1) : all models At once with cross validation technic
 
### Details 

- Problem : `Classification`


- Models : 
    1. scikit Logistic Regression model 
    2. Decision tree
    3. SVM
    4. Random forest 
    5. Stochastic Gradient Descent
    6. Ridge_Regression
    7. KNeighbors classification
    8. Gaussian_Process_Classification
    9. ExtraTreesClassifier
- label : numeric in the interval [0,10]


- Values in percentage to avoid the problem related to a specefic number of classrooms, teachers ...


<br>

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model, metrics, tree, svm, model_selection
import math
from sklearn.ensemble import RandomForestClassifier,  AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# Get the dataset

In [4]:
df = pd.read_excel("dataset2.xlsx")
df.head()

Unnamed: 0,SC_simultanite_Perc,salles_utilisees_Perc,salles_surutiliseess_Perc,places_videss_Perc,seances_samedis_Perc,Smidis_Perc,IntvDep8Hs_Perc,InterDepDisp6s_Perc,GrpDep8Hs_Perc,GrpDepDisp6s_Perc,SHDispoIntrv_Nbr,SDepDIntvs_Perc,SHPDIntvs_Perc,Label
0,4,0,0,0,2,4,6,2,5,20,1,1,0,1
1,13,10,20,25,20,2,25,6,2,8,10,30,10,1
2,17,100,100,28,0,6,26,6,3,22,0,20,33,0
3,16,70,30,0,1,2,26,10,3,0,4,5,3,1
4,17,12,20,3,0,3,26,8,2,0,22,1,3,1


<br>

# Insights about the dataset

> To get insight : [click here](./DataSetStudy.ipynb)

<br>

# Splitting the dataset

In [5]:

lab = LabelEncoder()

# few data 
trainning_set = df.sample(frac=0.75, random_state=25) 
test_set = df.drop(trainning_set.index)

# Calculate the Z-scores of each column in the training set:
trainning_set_mean = trainning_set.mean()
trainning_set_std = trainning_set.std()
trainning_set = (trainning_set - trainning_set_mean)/trainning_set_std

# Calculate the Z-scores of each column in the testing set:
test_set_mean = test_set.mean()
test_set_std = test_set.std()
test_set = (test_set - test_set_mean)/test_set_std

print("# DataSet shape : ", df.shape)
print("# trainning_set shape : ", trainning_set.shape)
print("# test_set shape  20% : ", test_set.shape)


X_train = trainning_set.iloc[:,0:13]
Y_train = lab.fit_transform( trainning_set[['Label']].values.ravel() )


X_test = test_set.iloc[:,0:13]
Y_test = lab.fit_transform( test_set[['Label']].values.ravel() )


# DataSet shape :  (70, 14)
# trainning_set shape :  (52, 14)
# test_set shape  20% :  (18, 14)


<br>

# Create all models


In [12]:

dfs=[]

models = [
    ('Logistic classification', linear_model.LogisticRegression() ),
    ('Decision tree', tree.DecisionTreeClassifier() ),
    ('SVM classifier', svm.SVC() ),
    ('RandomForestClassifier', RandomForestClassifier( n_estimators=14, n_jobs = 10 ) ),
    ('stochastic gradient descent', linear_model.SGDClassifier(  ) ),
    ('Ridge classifier', linear_model.RidgeClassifier() ),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=13) ),
    ('GaussianProcessClassifier', GaussianProcessClassifier() ),
    ('AdaBoostClassifier', AdaBoostClassifier() ),
    ('BaggingClassifier', BaggingClassifier() ),
    ('ExtraTreesClassifier', ExtraTreesClassifier( n_estimators=20, n_jobs = 30 ) ),
    ('GradientBoostingClassifier', GradientBoostingClassifier() ),
]


results = []
names = []
target_names = [ i for i in range(10) ] # targets : appreciation 0->10
final = None

for name, model in models:
    kfold = model_selection.KFold(n_splits=2, shuffle=True, random_state=90210)
    cv_results = model_selection.cross_validate(model, X_train, Y_train, cv=kfold)
    clf = model.fit( X_train, Y_train )
    y_pred = clf.predict( X_test)
    results.append(cv_results)
    names.append(name)
    this_df = pd.DataFrame(cv_results)

    this_df['test_m_score'] = round(model.score(X_test, Y_test), 2)
    this_df['test_mae'] = round(metrics.mean_absolute_error(Y_test, y_pred), 4)
    this_df['test_mse']  = round(metrics.mean_squared_error(Y_test, y_pred), 4)
    this_df['model'] = name
    dfs.append(this_df)
final = pd.concat(dfs, ignore_index=True)



In [13]:
final

Unnamed: 0,fit_time,score_time,test_score,test_m_score,test_mae,test_mse,model
0,0.013965,0.000997,0.884615,0.78,0.2222,0.2222,Logistic classification
1,0.011966,0.004987,0.846154,0.78,0.2222,0.2222,Logistic classification
2,0.000997,0.000997,0.730769,0.78,0.2222,0.2222,Decision tree
3,0.000997,0.000997,0.807692,0.78,0.2222,0.2222,Decision tree
4,0.001995,0.000997,0.884615,0.78,0.2222,0.2222,SVM classifier
5,0.001994,0.000998,0.884615,0.78,0.2222,0.2222,SVM classifier
6,1.961354,0.015625,0.769231,0.72,0.2778,0.2778,RandomForestClassifier
7,0.015622,0.015618,0.807692,0.72,0.2778,0.2778,RandomForestClassifier
8,0.0,0.0,0.846154,0.83,0.1667,0.1667,stochastic gradient descent
9,0.0,0.0,0.884615,0.83,0.1667,0.1667,stochastic gradient descent
