__Första modellutkast - Random Forest__

In [291]:
# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [292]:
from sklearn.model_selection import GridSearchCV 

In [293]:
# Necessary Sklearn objects used in the analysis
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import preprocessing

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. Alias it so the rest of the
# notebook can keep using the name `Imputer` on both old and new versions.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # older scikit-learn without sklearn.impute
    from sklearn.preprocessing import Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

In [294]:
# Figures are written to the directory the notebook is started from.
PROJECT_ROOT_DIR = os.getcwd()
# No sub-folder is appended, so the image path equals the project root.
IMAGES_PATH = PROJECT_ROOT_DIR

In [295]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``<fig_id>.<fig_extension>`` in IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension; also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [296]:
df1 = pd.read_excel('DataV75TillUffe_2019-02-01_2.xlsx')

In [297]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17994 entries, 0 to 17993
Data columns (total 38 columns):
Datum          17994 non-null datetime64[ns]
Utdelning      17994 non-null int64
Arstid         17994 non-null int64
Distans        17994 non-null int64
Startsatt      17994 non-null int64
Lopp           17994 non-null int64
Plac           17994 non-null int64
Hast           17994 non-null int64
V75PROC        17994 non-null float64
V_ODDS         17994 non-null float64
GRUPP          17994 non-null int64
VLP            17994 non-null float64
VNUM           17994 non-null int64
SVLP           17994 non-null float64
VSVLP          17994 non-null float64
VPN_SUM        17994 non-null float64
VPN_SUM_ORD    17994 non-null int64
VPK_SUM        17994 non-null float64
VPK_SUM_ORD    17994 non-null int64
VLPB           17994 non-null float64
SVLPB          17994 non-null float64
VSVLPB         17994 non-null float64
E_P            17994 non-null float64
E_P_Num        17994 non-null in

In [298]:
df1.Plac.value_counts()

0    16439
1     1555
Name: Plac, dtype: int64

In [299]:
df1.Datum.drop_duplicates().count()

222

In [300]:
222*7

1554

In [301]:
# Build a unique key per race: convert Datum (date) and Lopp (race) to object
# dtype so they can be turned into strings and concatenated in the next cell.

df1['cdate'] = df1.Datum.astype('object')
df1['cLopp'] = df1.Lopp.astype('object')

In [302]:
df1['Key'] = df1['cdate'].astype(str) + df1['cLopp'].astype(str)

In [303]:
len(df1)

17994

In [304]:
df1.Key.drop_duplicates().count()

1554

__Skapar en målvariabel - vinnare__

In [305]:
# Binary target: 1 when Plac equals 1 (the horse won), otherwise 0.
df1['Y'] = np.where(df1['Plac'].eq(1), 1, 0)

__Nu plockar vi ut 10 v75 omgångar för att använda dem som test__

In [306]:
v75 = df1.Datum.drop_duplicates().to_frame()

In [307]:
len(v75)

222

In [308]:
# One uniform draw per V75 round; rounds with a draw <= 0.05 are held out as test.
v75['is_test'] = np.random.uniform(0, 1, len(v75)) <= 0.05

test = v75.loc[v75['is_test']]
basedf = v75.loc[~v75['is_test']]

__Skapar en kopia, droppar de variabler som inte ska vara med i analysen samt sätter index__

In [309]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17994 entries, 0 to 17993
Data columns (total 42 columns):
Datum          17994 non-null datetime64[ns]
Utdelning      17994 non-null int64
Arstid         17994 non-null int64
Distans        17994 non-null int64
Startsatt      17994 non-null int64
Lopp           17994 non-null int64
Plac           17994 non-null int64
Hast           17994 non-null int64
V75PROC        17994 non-null float64
V_ODDS         17994 non-null float64
GRUPP          17994 non-null int64
VLP            17994 non-null float64
VNUM           17994 non-null int64
SVLP           17994 non-null float64
VSVLP          17994 non-null float64
VPN_SUM        17994 non-null float64
VPN_SUM_ORD    17994 non-null int64
VPK_SUM        17994 non-null float64
VPK_SUM_ORD    17994 non-null int64
VLPB           17994 non-null float64
SVLPB          17994 non-null float64
VSVLPB         17994 non-null float64
E_P            17994 non-null float64
E_P_Num        17994 non-null in

In [310]:
# Modelling base: rows dated after 2016-05-20 that belong to a non-test round,
# without the columns listed below, indexed by the race key.
df2 = (
    df1[(df1.Datum > '2016-05-20') & df1.Datum.isin(basedf.Datum.tolist())]
    .drop(
        ['Utdelning', 'Datum', 'Arstid', 'Distans', 'Startsatt', 'Lopp', 'Plac',
         'Hast', 'cdate', 'cLopp', 'VNUM', 'V_ODDS', 'S_R', 'V75PROC'],
        axis=1,
    )
    .copy()
    .set_index(['Key'])
)

In [311]:
# Held-out rounds: same filter and column removal as for df2, except that Datum
# is kept so that single rounds can be selected by date later on.
test_df = (
    df1[(df1.Datum > '2016-05-20') & df1.Datum.isin(test.Datum.tolist())]
    .drop(
        ['Utdelning', 'Arstid', 'Distans', 'Startsatt', 'Lopp', 'Plac',
         'Hast', 'cdate', 'cLopp', 'VNUM', 'V_ODDS', 'S_R', 'V75PROC'],
        axis=1,
    )
    .copy()
    .set_index(['Key'])
)

In [312]:
len(df2)

16953

In [313]:
len(test_df)

1041

__Gör om GRUPP till objekt för att kunna använda befintlig dummylogik__

In [314]:
# Cast GRUPP to object dtype so the dtype-based split further down treats it as categorical.
# NOTE(review): the recorded output of this cell is an AttributeError ('DataFrame' object
# has no attribute 'GRUPP') although GRUPP is not in the drop list used to build df2 —
# confirm that the column exists in the loaded data before relying on the cells below.
df2['GRUPP'] = df2.GRUPP.astype('object')

AttributeError: 'DataFrame' object has no attribute 'GRUPP'

In [None]:
df2.info()

__Delar upp i numeriska samt character-attribut. Det är dessa som går in i modellen__

In [None]:
# Split the model inputs into numeric and categorical (object dtype) columns.
# The target 'Y' and the bookkeeping flag 'is_train' are excluded by name rather
# than by position, so re-running this cell after 'is_train' has been appended
# to df2 cannot turn 'Y' into a feature.
NON_FEATURE_COLS = ('Y', 'is_train')
num_attribs = []
cat_attribs = []
for var, typ in zip(df2.columns, df2.dtypes):
    if var in NON_FEATURE_COLS:
        continue
    if typ == 'object':
        cat_attribs.append(var)
    else:
        num_attribs.append(var)

In [None]:
cat_attribs

In [None]:
num_attribs

In [None]:
len(num_attribs)

In [None]:
# Readable names for the GRUPP dummy columns: 'GRUPP' followed by each level found in df1.
tempdf = pd.get_dummies(df1.GRUPP)
name_list = ['GRUPP' + str(val) for val in tempdf.columns]

name_list

In [None]:
features_list = num_attribs + name_list
features_list

__Nu bygger vi upp en pipeline__

In [None]:
# Column selector used to route numeric and categorical columns into separate
# pipeline branches.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of DataFrame columns.

    ``transform`` returns ``X[attribute_names].values``, i.e. the selected
    columns as a NumPy array, so the following steps receive an array.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data.
        return self

    def transform(self, X):
        selected = X[self.attribute_names]
        return selected.values

In [None]:
# Custom transformer that creates dummy (one-hot) variables.

class SetDummyVar(BaseEstimator, TransformerMixin):
    """One-hot encode the given DataFrame columns with a layout fixed at fit time.

    ``pd.get_dummies`` only creates columns for the categories present in the
    frame it is given, so encoding a smaller frame (e.g. a single race day) can
    produce fewer or differently ordered columns than the training data did.
    ``fit`` therefore records the dummy columns it sees, and ``transform``
    aligns its output to them: categories missing from the input become
    all-zero columns and categories unseen at fit time are dropped.

    If ``transform`` is called on an instance that was never fitted, the
    unaligned ``get_dummies`` result is returned.

    Parameters
    ----------
    attribute_names : list
        Names of the DataFrame columns to encode.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def _encode(self, X):
        # Plain one-hot encoding of the configured columns.
        return pd.get_dummies(X[self.attribute_names], columns=self.attribute_names)

    def fit(self, X, y=None):
        # Remember the dummy column layout of the data the pipeline is fitted on.
        self.dummy_columns_ = self._encode(X).columns
        return self

    def transform(self, X):
        tempdf = self._encode(X)
        dummy_columns = getattr(self, 'dummy_columns_', None)
        if dummy_columns is not None and not tempdf.columns.equals(dummy_columns):
            # Keep the dtype get_dummies produced so the added zero columns do
            # not turn the resulting array into object dtype.
            dummy_dtype = tempdf.dtypes.iloc[0] if tempdf.shape[1] else None
            tempdf = tempdf.reindex(columns=dummy_columns, fill_value=0)
            if dummy_dtype is not None:
                tempdf = tempdf.astype(dummy_dtype)
        return tempdf.values

In [None]:
# Numeric branch: select the numeric columns, then fill missing values with the column median.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
])

# Categorical branch: turn the categorical columns into dummy variables.
cat_pipeline = Pipeline(steps=[('dummy_cat', SetDummyVar(cat_attribs))])

In [None]:
# Run both branches on the same input and concatenate their outputs, numeric block first.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

### In order to avoid overfitting data is split into training data and validation data. 75% träningsdata och 25% valideringsdata

In [None]:
# Row-level random split: every row gets its own uniform draw; <= 0.75 means training.
df2['is_train'] = np.random.uniform(0, 1, len(df2)) <= 0.75

train = df2.loc[df2['is_train']]
validate = df2.loc[~df2['is_train']]

In [None]:
type(train)

### Nu skapar vi arrayer som ska användas av modellen - använder det skapade Pipelineobjektet som gör alla nödvändiga datatransformationer 

In [None]:
# Training data.
# Target vector: the values the model should learn to predict.
label_train = train["Y"].copy()
# Feature matrix: all explanatory variables in one 2-D array, with the
# categorical variables expanded into dummy variables. The pipeline is fitted here.
features_train = full_pipeline.fit_transform(train)

In [None]:
# Validation data.
# Feature matrix: all explanatory variables in one 2-D array, with the
# categorical variables expanded into dummy variables.
# Only transform() is called: fit_transform() would re-learn the imputation
# medians from the validation rows and overwrite the state that full_pipeline
# learned from the training data.
features_validate = full_pipeline.transform(validate)
# Target vector for the validation rows.
label_validate = validate["Y"].copy()

# Train the model on the training data and then evaluate on the validation data

- predict: A two-dimensional array that contains the posterior probability for each class, one column for non-winner (0) and one for winner (1)
- fpr : False positive rate, the share of non-winners classified as winners at a specific threshold value
- tpr : True positive rate, the share of winners classified as winners at a specific threshold value
- threshold: Sorted (descending) threshold values for the predicted probability of winning
- roc_auc: Area under the receiver operating characteristic (ROC) curve. A value close to 1 indicates a strong model. A value close to 0.5 means that the model is rather poor

In [None]:
# Build the random forest that the ROC/AUC analysis below is based on.
# Instantiate the model: 1000 trees, each limited to depth 5.
rf = RandomForestClassifier(n_estimators = 1000, max_depth=5)

# Train the model on the training features and labels.

rf.fit(features_train,label_train)

In [None]:
rf.n_features_

In [None]:
len(rf.feature_importances_)

In [None]:
rf.feature_importances_

In [None]:
# Pair every feature name with its importance from the fitted forest.
# zip() stops at the shorter of the two sequences.
weight_list = [
    {'Feature': name, 'Weight': weight}
    for name, weight in zip(features_list, rf.feature_importances_)
]

In [None]:
analys_weight = pd.DataFrame(weight_list).sort_values('Weight', ascending = False)

analys_weight

In [None]:
analys_weight.to_excel('Weight.xlsx', index = False)

In [None]:
# Score the validation data with the trained model.

predict = rf.predict_proba(features_validate)

# ROC points computed from the second probability column (index 1) against the true labels.
fpr, tpr, threshold = roc_curve(label_validate,predict[:,1])

# Area under the ROC curve.
roc_auc = auc(fpr,tpr)

In [None]:
fpr_ser = pd.Series(fpr)
tpr_ser = pd.Series(tpr)
threshold_ser = pd.Series(threshold)

__Vi gör en dataframe av fpr, tpr, threshold för att förstå relationerna__

In [None]:
# Example: one row per ROC point with its false/true positive rate and threshold.
df = pd.concat(
    [fpr_ser, tpr_ser, threshold_ser],
    axis=1,
    keys=['fpr', 'tpr', 'threshold'],
)

In [None]:
# Median värde för den sannolikhet som genereras på varje enskild observation
prob = pd.Series(predict[:,1])
prob.median()

In [None]:
prob.max()

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8), linewidth=5, edgecolor='.5')

# Top: distribution of the predicted probabilities, one value per validation row.
ax1.hist(prob, bins=50)
ax1.set_title('Predicted probability (validation rows)')

# Bottom: distribution of the ROC thresholds. roc_curve prepends an artificial
# first threshold, which is np.inf in recent scikit-learn releases and cannot
# be binned by hist(), so only the finite values are plotted.
ax2.hist(threshold[np.isfinite(threshold)], bins=50)
ax2.set_title('ROC thresholds')

plt.show()

In [None]:
df.threshold.median()

__Varför ger threshold och prob olika medianvärden?__

# A ROC curve for the winner classifier on the validation data

__It traces out two types of error as we vary the threshold value for the predicted probability of winning. The actual thresholds are not shown. The true positive rate is the sensitivity: the fraction of winners that are correctly identified, using a given threshold value. The false positive rate is 1-specificity: the fraction of non-winners that we classify incorrectly as winners, using that same threshold value. The ideal ROC curve hugs the top left corner, indicating a high true positive rate and a low false positive rate. The dotted line represents the “no information” classifier.__ 

In [None]:
# ROC curve for the validation set; the dashed red diagonal is the reference line.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label=' AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim((0.0, 1.0))
plt.ylim((0.0, 1.0))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
save_fig('ROC')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(label_validate, prob)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall against the decision threshold on the current axes.

    The last element of ``precisions`` and ``recalls`` is left out before
    plotting, so both are plotted with one value per entry in ``thresholds``.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=curve_label, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([0, 1])
save_fig("precision_recall_vs_threshold_plot")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [None]:
 
y_truth = label_validate
confusion_matrix(label_validate, y_truth)

In [None]:
prob.median()

In [None]:
df.threshold.median()

In [None]:
# Vi väljer medianvärdet som cut off 
y_pred = np.where( prob > prob.median(),1,0)
 
confusion_matrix(label_validate, y_pred)

In [None]:
# NOTE(review): hard-coded counts, presumably copied from the confusion matrix shown
# above — confirm; they are not recomputed when the data or the model changes.
346 / (1764 + 346)

In [None]:
# Precision - Av de vi predikterar som vinnare, hur bra är vi
precision_score(label_validate, y_pred)

In [None]:
# True positive rate = Den totala andelen av sanna postiva vid en given cut of
recall_score(label_validate, y_pred)
 

In [None]:
label_validate_ser = pd.Series(label_validate.tolist())
prob_ser = pd.Series(prob)

In [None]:
# True label and predicted probability side by side, ordered by ascending probability.
df_valid = (
    pd.concat([label_validate_ser, prob_ser], axis=1, keys=['Y', 'Prob'])
    .sort_values('Prob')
)

__Delar in i deciler på den framräknade scoren__

In [None]:
# Cut the scored validation rows into ten equal-sized probability bins (deciles).
data = df_valid.Prob.values
quartiles = pd.qcut(data, 10)  # name kept for the cells below; the bins are deciles
# Rows per decile. Series.value_counts replaces the top-level pd.value_counts,
# which is deprecated in recent pandas releases.
pd.Series(quartiles).value_counts()

In [None]:
df_valid.Y.sum()

In [None]:
grouped = df_valid.Y.groupby(quartiles)

In [None]:
grouped.count()

__Beräknar den procentuella andelen vinnare i respektive decil__

In [None]:
resp = round((grouped.sum() / grouped.count() * 100))
resp

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16,12))

#df2[['VLP']].hist(bins=50, ax = ax1, label = 'Variansen på V75PROC - VLP')

df_valid.Prob.plot(kind = 'hist', bins = 100, title = 'Probability response', ax = ax1)


resp.plot(kind = 'bar', color=(0.2, 0.4, 0.6, 1),title = 'Andel vinnare per decil', ax = ax2)

save_fig('Probdist')

In [None]:
y_pred = np.where(prob >=0.174,1,0)
 
confusion_matrix(label_validate, y_pred)

In [None]:
 
precision_score(label_validate, y_pred)

In [None]:
# True positive rate = Den totala andelen av sanna postiva vid en given cut of
recall_score(label_validate, y_pred)
 

In [None]:
y_pred = np.where(prob >=0.131,1,0)
 
confusion_matrix(label_validate, y_pred)

In [None]:
precision_score(label_validate, y_pred)

In [None]:
# True positive rate = Den totala andelen av sanna postiva vid en given cut of
recall_score(label_validate, y_pred)

In [None]:
# I df ligger roc statistiken. Plockar ut raden där 0.75 < tpr < 0.8. Vad är threshols
df.head()

In [None]:
f1 = df.tpr > 0.60
f2 = df.tpr < 0.62

In [None]:
df[f1 & f2]

__Nu har vi modellen. Nu vill vi testa på våra undanlagda 9 omgångar för att se hur det lirar. Skapar testdatasetet och kör sedan igenom varje omgång. Flaggar upp alla över score 0.135 som vinnare och räknar sedan ut hur det fallit ut__

In [None]:
test_df.Datum.drop_duplicates()

In [None]:
test_df[test_df.Datum == '2018-11-25']

In [None]:
# Plockar ut en omgång och scorar denna 

df_test1 = test_df[test_df.Datum == '2018-11-25'].drop('Datum', axis = 1)


In [None]:
len(df_test1.columns)

In [None]:
df_test1['GRUPP'] = df_test1.GRUPP.astype('object')

In [None]:
df_test1.info()

In [None]:
df_test1.Y.sum()

In [None]:
# Test data that the model is scored on.
# Feature matrix: all explanatory variables in one 2-D array, with the
# categorical variables expanded into dummy variables.
# Only transform() is called so the pipeline keeps the state it learned from
# the training data instead of being re-fitted on this single round.
features_test = full_pipeline.transform(df_test1)
# Target vector for this round.
label_test = df_test1["Y"].copy()

In [None]:
predict_test = rf.predict_proba(features_test)

In [None]:
y_pred_test = np.where( predict_test[:,1] > 0.135,1,0)
 
confusion_matrix(label_test, y_pred_test)

In [None]:
y_pred_test = np.where( predict_test[:,1] > 0.184,1,0)
 
confusion_matrix(label_test, y_pred_test)

__Nu itererar vi över alla testloppen__

In [None]:
def validering(Date, threshold=0.135):
    """Score one held-out round and return its confusion matrix.

    Parameters
    ----------
    Date : str or datetime-like
        Value compared against ``test_df.Datum`` to select the round.
    threshold : float, default 0.135
        Rows whose predicted probability (second ``predict_proba`` column) is
        strictly greater than this value are flagged as 1.

    Returns
    -------
    The result of ``confusion_matrix(true labels, flagged predictions)``.

    Notes
    -----
    The round is run through ``full_pipeline.transform`` (not ``fit_transform``)
    so the pipeline fitted on the training data is left untouched.
    """
    round_df = test_df[test_df.Datum == Date].drop('Datum', axis=1)
    round_df['GRUPP'] = round_df.GRUPP.astype('object')
    features = full_pipeline.transform(round_df)
    labels = round_df["Y"].copy()
    win_prob = rf.predict_proba(features)[:, 1]
    predicted = np.where(win_prob > threshold, 1, 0)
    # The original computed this matrix and then discarded it.
    return confusion_matrix(labels, predicted)

In [None]:
df_tmp = test_df.Datum.drop_duplicates().to_frame().reset_index()
df_tmp

In [None]:
# Confusion matrix per held-out round at cut-off 0.135.
d_list = ['2017-08-16','2017-08-23','2018-11-11','2018-11-25']
for date in d_list:
    df_test1 = test_df[test_df.Datum == date].drop('Datum', axis = 1)
    df_test1['GRUPP'] = df_test1.GRUPP.astype('object')
    # transform() only: re-fitting on each round would overwrite the state the
    # pipeline learned from the training data.
    features_test = full_pipeline.transform(df_test1)
    label_test = df_test1["Y"].copy()
    predict_test = rf.predict_proba(features_test)
    y_pred_test = np.where( predict_test[:,1] > 0.135,1,0)
    print(confusion_matrix(label_test, y_pred_test),date)

In [None]:
# Confusion matrix per held-out round at cut-off 0.184.
d_list = ['2017-08-16','2017-08-23','2018-11-11','2018-11-25']
for date in d_list:
    df_test1 = test_df[test_df.Datum == date].drop('Datum', axis = 1)
    df_test1['GRUPP'] = df_test1.GRUPP.astype('object')
    # transform() only: re-fitting on each round would overwrite the state the
    # pipeline learned from the training data.
    features_test = full_pipeline.transform(df_test1)
    label_test = df_test1["Y"].copy()
    predict_test = rf.predict_proba(features_test)
    y_pred_test = np.where( predict_test[:,1] > 0.184,1,0)
    print(confusion_matrix(label_test, y_pred_test),date)
    

__Testar Åby 20190202__

In [None]:
df_aby = pd.read_excel('Åby2019-02-02.xlsx')

In [None]:
df_aby.info()

In [None]:
df_aby['Y'] = np.where(df_aby['Plac'].isin([1]), 1,0)

In [None]:
df_aby.Y.sum()

In [None]:
# Score the Åby round with the trained model at cut-off 0.131.
date = '2019-02-02'
df_aby['GRUPP'] = df_aby.GRUPP.astype('object')
# transform() only: the new round must be prepared with the pipeline state
# learned from the training data, not with a pipeline re-fitted on this round.
features_test = full_pipeline.transform(df_aby)
label_test = df_aby["Y"].copy()
predict_test = rf.predict_proba(features_test)
y_pred_test = np.where( predict_test[:,1] > 0.131,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

In [None]:
date = '2019-02-02'
y_pred_test = np.where( predict_test[:,1] > 0.184,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

In [None]:
type(rf)

__Axevalla 2019-02-16__

In [None]:
df_axv = pd.read_excel('AxevallaDataV75TillUffe_2019-02-16.xlsx')

In [None]:
df_axv.Plac.value_counts()

In [None]:
df_axv['Y'] = np.where(df_axv['Plac'].isin([1]), 1,0)

In [None]:
# Score the Axevalla round with the trained model at cut-off 0.089.
date = '2019-02-16'
df_axv['GRUPP'] = df_axv.GRUPP.astype('object')
# transform() only: the new round must be prepared with the pipeline state
# learned from the training data, not with a pipeline re-fitted on this round.
features_test = full_pipeline.transform(df_axv)
label_test = df_axv["Y"].copy()
predict_test = rf.predict_proba(features_test)
y_pred_test = np.where( predict_test[:,1] > 0.089,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

In [None]:
# Gör om scorad array till en lista - multidimensionell
stack = predict_test.tolist()

In [None]:
# Keep the second value (index 1) of every scored row — the probability of
# class 1, i.e. winner — and wrap the values in a one-column DataFrame.
last = [row[1] for row in stack]
scored = {'Score': last}
df_scored = pd.DataFrame.from_dict(scored)

In [None]:
# Gör om till lista och stoppar in i ett dictionary
pred = y_pred_test.tolist()
prediction = {'Predict':pred}
# Konverterar till Dataframe
df_pred = pd.DataFrame.from_dict(prediction)

In [None]:
# Konkatierar mot urspringsdatat

df_axv_predict = pd.concat([df_axv,df_pred, df_scored], axis = 1)

In [None]:
df_axv_out = df_axv_predict[df_axv_predict.Predict == 1][['Lopp','Plac','Hast','Score']]. \
sort_values(['Lopp','Score'], ascending = [True,False])
df_axv_out

In [None]:
date = '2019-02-16'
y_pred_test = np.where( predict_test[:,1] > 0.131,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

In [None]:
date = '2019-02-16'
y_pred_test = np.where( predict_test[:,1] > 0.184,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

__Sparar undan pipelineobjekt och modellobjekt för att kunna återanvända senare__

In [None]:
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is a
# standalone package that scikit-learn itself depends on. Fall back to the old
# location only when the standalone import is unavailable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Pipeline object.
# NOTE(review): this saves full_pipeline in whatever fitted state the most recent
# fit/fit_transform call left it in — verify that it was last fitted on the training data.
joblib.dump(full_pipeline, 'Pipeline_v1.pkl')


In [None]:
# Modellobjekt
joblib.dump(rf, 'Travmodel_v1.pkl')

__Nu testar vi att läsa in modell- och pipelineobjekten__

In [None]:
my_model_loaded = joblib.load('Travmodel_v1.pkl')

In [None]:
my_pipeline_loaded = joblib.load('Pipeline_v1.pkl')

In [None]:
# Score the Åby round again, this time with the objects loaded from disk.
date = '2019-02-02'
df_aby['GRUPP'] = df_aby.GRUPP.astype('object')
# transform() only: the point of loading the saved pipeline is to reuse its
# fitted state; fit_transform() would discard it and re-fit on this round.
features_test = my_pipeline_loaded.transform(df_aby)
label_test = df_aby["Y"].copy()
predict_test = my_model_loaded.predict_proba(features_test)
y_pred_test = np.where( predict_test[:,1] > 0.131,1,0)
print(confusion_matrix(label_test, y_pred_test),date)

__Nedan försöker vi optimera modellen med Gridsearch samt använder sig av hela datasetet med Cross validation__

In [None]:
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

In [None]:
# Allt data
# Alla förklaringsvaribler i en multidimensionell array där kategrisvaribler har gjorts om till
# dummyvariabler
features = full_pipeline.fit_transform(df2)
## En array som håller det vi vill predikter
labels = df2["Y"].copy()

In [None]:
rf_all = RandomForestClassifier()

In [None]:
GridSearchCV?

In [None]:
# Train across 5 folds: (12+6)*5 = 90 rounds of training in total.
# Candidates are ranked by ROC AUC, the measure used to evaluate the forest
# earlier in this notebook. 'neg_mean_squared_error' is a regression score;
# applied to the 0/1 class predictions it only measures the error rate.
grid_search = GridSearchCV(rf_all, param_grid, cv=5,
                           scoring='roc_auc', return_train_score=True)
grid_search.fit(features, labels)