In [41]:
from sklearn import datasets
import pandas as pd
import numpy as np

### Tidy up dataset 

In [23]:
# Load the labelled records, discard rows with any missing value and store the
# outcome label as a pandas categorical, all in one chained expression.
df = (
    pd.read_csv('learnt_data_for_classification.csv')
    .dropna()
    .astype({'final_result': 'category'})
)
df.head()

Unnamed: 0,dates,model_RMSE,model_ME,average_satellite,average_rezare,days_since,final_result
0,2018-07-09,195.0,155.0,2346.0,2191.0,2.0,bad
1,2018-07-12,165.0,117.0,2354.0,2237.0,5.0,Sent
2,2018-07-13,123.0,56.0,2301.0,2246.0,1.0,Sent
3,2018-07-14,111.0,11.0,2272.0,2261.0,1.0,Sent
4,2018-07-17,120.0,-27.0,2278.0,2305.0,3.0,Sent


In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,model_RMSE,model_ME,average_satellite,average_rezare,days_since
count,1128.0,1128.0,1128.0,1128.0,1128.0
mean,563.370567,-432.065603,1806.511525,2238.570922,5.91844
std,506.597834,541.880715,718.065423,357.278069,5.035852
min,29.0,-5595.0,-3728.0,1215.0,1.0
25%,272.0,-592.0,1387.25,2017.75,2.0
50%,418.0,-307.5,1981.0,2262.5,4.0
75%,662.25,-105.75,2318.5,2496.5,8.0
max,5912.0,1816.0,4497.0,3242.0,33.0


### Learn model

In [65]:
from sklearn import svm

# Linear-kernel SVM with probability estimates enabled.
# random_state pins the data shuffling used by the internal cross-validation that
# probability=True performs, so predict_proba gives the same numbers on every run.
# NOTE: gamma is not used by the linear kernel; it only takes effect if the
# kernel is switched to 'rbf', 'poly' or 'sigmoid'.
clf = svm.SVC(kernel='linear', probability=True, gamma=0.2, C=1, random_state=109)

In [68]:
# Feature matrix: the three predictor columns, selected by label.
x_train = df.loc[:, ['model_RMSE', 'model_ME', 'days_since']]
# Target vector: the categorical outcome column.
y_train = df['final_result']
x_train.head()
y_train.head()

0     bad
1    Sent
2    Sent
3    Sent
4    Sent
Name: final_result, dtype: category
Categories (2, object): [Sent, bad]

In [66]:
# Disabled hold-out evaluation: splits the three feature columns and the label 70/30
# (random_state=109), fits clf on the training part and prints accuracy on the test part.
# NOTE(review): the recorded "Accuracy" output of this cell presumably dates from a run
# where these lines were active. Re-enabling them as-is would rebind x_train / y_train
# to the 70% split for every later cell.
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(df[['model_RMSE','model_ME','days_since']],df.final_result,test_size=0.3,random_state=109)
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# from sklearn import metrics
# print('Accuracy:', metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.8053097345132744


### Predict model
Note: for model selection we want the prediction probabilities, so we run **clf.predict_proba(x_train)** rather than **clf.predict(x_train)**. I only run **clf.predict(x_train)** so that I can easily check accuracy.

In [76]:
# Train on the full data set and keep the per-class probabilities for every row.
# fit() returns the estimator itself, so both steps are chained here.
# Swap predict_proba for predict to obtain hard class labels instead.
y_pred = clf.fit(x_train, y_train).predict_proba(x_train)

In [74]:
from sklearn import metrics

# y_pred holds class probabilities from predict_proba at this point, and
# accuracy_score cannot compare a probability matrix against class labels,
# so score the hard predictions directly.
# NOTE: this is accuracy on the training rows, not a held-out estimate.
print('Accuracy:', metrics.accuracy_score(y_train, clf.predict(x_train)))

Accuracy: 0.8156028368794326


In [79]:
# predict_proba orders its columns like clf.classes_, so confirm that order matches
# the hard-coded headers below; otherwise the probabilities would be silently mislabelled.
assert [str(label).lower() for label in clf.classes_] == ["sent", "bad"], clf.classes_

# Label the two probability columns and round to three decimals for display.
y_pred = pd.DataFrame(y_pred, columns=["sent", "bad"]).round(3)

In [80]:
# Preview the first five rows of the rounded probability table.
y_pred.head()

Unnamed: 0,sent,bad
0,0.661,0.339
1,0.794,0.206
2,0.705,0.295
3,0.705,0.295
4,0.743,0.257


In [81]:
# pd.concat(axis=1) aligns rows by index label. df keeps its labels from after
# dropna(), while y_pred is numbered 0..n-1, so any dropped row would shift the
# probabilities onto the wrong records and create NaN-filled rows. Re-label the
# probabilities with df's index (row order is identical) before joining.
total = pd.concat(
    [df, pd.DataFrame(y_pred.values, index=df.index, columns=y_pred.columns)],
    axis=1,
)

In [82]:
# Preview the first 30 rows of the combined frame (inputs, label and probabilities).
total.head(30)

Unnamed: 0,dates,model_RMSE,model_ME,average_satellite,average_rezare,days_since,final_result,sent,bad
0,2018-07-09,195.0,155.0,2346.0,2191.0,2.0,bad,0.661,0.339
1,2018-07-12,165.0,117.0,2354.0,2237.0,5.0,Sent,0.794,0.206
2,2018-07-13,123.0,56.0,2301.0,2246.0,1.0,Sent,0.705,0.295
3,2018-07-14,111.0,11.0,2272.0,2261.0,1.0,Sent,0.705,0.295
4,2018-07-17,120.0,-27.0,2278.0,2305.0,3.0,Sent,0.743,0.257
5,2018-07-18,122.0,38.0,2362.0,2324.0,1.0,Sent,0.699,0.301
6,2018-07-19,77.0,-64.0,2282.0,2346.0,1.0,Sent,0.728,0.272
7,2018-07-23,1077.0,-1015.0,988.0,2003.0,4.0,bad,0.0,1.0
8,2018-07-24,1154.0,-1082.0,859.0,1941.0,5.0,bad,0.0,1.0
9,2018-07-25,217.0,-195.0,2179.0,2374.0,6.0,bad,0.616,0.384


### Save/Load model 

In [83]:
from joblib import dump, load 
# Persist the classifier bound to `clf` to disk; dump returns the list of files written.
dump(clf, 'classification_model.joblib')

['classification_model.joblib']

In [86]:
# Restore the persisted classifier, rebinding `clf` to the loaded object.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
clf = load('classification_model.joblib')

In [87]:
# Sanity check on the reloaded model: hard-label accuracy over the training rows.
y_pred = clf.predict(x_train)
print('Accuracy:', metrics.accuracy_score(y_true=y_train, y_pred=y_pred))

Accuracy: 0.8156028368794326
