__Adderar nytt data för att uppdatera modellen. Nytt data från 20190820 och framåt__


__Tar fram en modell med logistisk regression. En för data innan 2019 och en för 2019 för att se om något förändrats i data. Det ser vi genom att titta på modellparametrarna.__

In [1]:
# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [2]:
from sklearn.model_selection import GridSearchCV 

In [3]:
# Necessary Sklearn objects used in the analysis
from sklearn import metrics
from sklearn import preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Imputer was superseded by sklearn.impute.SimpleImputer and later removed from
# sklearn.preprocessing. Bind the modern class to the name the notebook uses and
# fall back to the legacy import only on releases that predate sklearn.impute.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

In [4]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split 
import sklearn.preprocessing as preproc 

In [5]:
# Where to save the figures
PROJECT_ROOT_DIR = os.getcwd()
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR)

In [6]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        Used both as the file extension and as the ``format`` passed to savefig.
    resolution : int
        Passed to savefig as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [17]:
# Load the raw outtake workbook; the path is relative to the current working directory
df0 = pd.read_excel('DataV75Outtake20190820.xlsx')

In [18]:
# Build a unique key per race: Datum and Lopp are turned into strings.
# Here both columns are first cast to object dtype; the string conversion and
# concatenation happen when 'Key' is built.
# NOTE(review): the object cast presumably decides how the date is rendered in the
# key (the index shown by df1.info() contains a time part) — confirm before simplifying.

df0['cdate'] = df0.Datum.astype('object')
df0['cLopp'] = df0.Lopp.astype('object')

In [19]:
# Key = string form of the date immediately followed by the string form of the race number
df0['Key'] = df0['cdate'].astype(str) + df0['cLopp'].astype(str)

__Skapar en målvariabel - vinnare__

In [20]:
# Binary target: 1 where 'Plac' equals 1, otherwise 0
df0['Y'] = np.where(df0['Plac'].isin([1]), 1,0)

__Plockar bort de variabler som inte ska med__

In [21]:
# All analysis variables: work on an independent copy of df0, indexed by 'Key'
df1 = df0.copy(deep = True).set_index(['Key'])


In [13]:
# Keep only the rows whose VNUM is not 1, 2 or 3.
# NOTE(review): the original notes on this cell disagree on whether VNUM 1-3 belong in
# the outtake or must be excluded — confirm the intent.
# NOTE(review): this cell's execution count (13) is lower than that of the cell that
# creates df1 (21), so the recorded outputs further down may come from a df1 on which
# this filter was not re-applied — re-run the notebook top to bottom to confirm.
df1 = df1[~df1.VNUM.isin([1,2,3])]

__Plockar bort de variabler som inte ska med__

In [24]:
# Drop the columns that must not be offered to the model. 'Plac' is the column the
# target 'Y' was derived from; 'cdate' and 'cLopp' are the helper columns used to
# build the key. Each label is listed exactly once (the earlier list repeated
# 'cLopp' and 'cdate').
# Re-running this cell on the already reduced frame raises KeyError because the
# columns are gone; rebuild df1 from df0 first.
df1 = df1.drop(['Plac', 'TK_R', 'Arstid', 'Startsatt', 'VNUM', 'GRUPP', 'Distans',
                'cLopp', 'cdate', 'V75PROC', 'V_ODDS', 'SP_R', 'E_R'], axis = 1)

In [25]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21897 entries, 2016-05-21 00:00:001 to 2019-08-14 00:00:007
Data columns (total 26 columns):
Datum          21897 non-null datetime64[ns]
Lopp           21897 non-null int64
Hast           21897 non-null int64
VLP            21897 non-null float64
SVLP           21897 non-null float64
VSVLP          21897 non-null float64
VPN_SUM        21897 non-null float64
VPN_SUM_ORD    21897 non-null int64
VPK_SUM        21897 non-null float64
VPK_SUM_ORD    21601 non-null float64
VLPB           21897 non-null float64
SVLPB          21897 non-null float64
VSVLPB         21897 non-null float64
E_P            21897 non-null float64
E_P_Num        21897 non-null int64
E_N            21897 non-null float64
E_U            20856 non-null float64
G_R            20061 non-null float64
A_R            18580 non-null float64
T_R            20220 non-null float64
ToR            19034 non-null float64
P_R            19501 non-null float64
Ex_R           13490 non-nu

In [26]:
# Split the columns of df1 into categorical (object dtype) and numeric model inputs.
# The target 'Y' is skipped by name instead of assuming it is the last column, so a
# reordered frame cannot put the label into the feature lists.
# Datetime columns and the 'Hast' and 'Lopp' columns are left out of the numeric list.
num_attribs = []
cat_attribs = []

for var, typ in zip(df1.columns, df1.dtypes):
    if var == 'Y':
        continue
    if typ == 'object':
        cat_attribs.append(var)
    elif typ != 'datetime64[ns]' and var not in ('Hast', 'Lopp'):
        num_attribs.append(var)

In [27]:
cat_attribs 

[]

In [28]:
num_attribs 

['VLP',
 'SVLP',
 'VSVLP',
 'VPN_SUM',
 'VPN_SUM_ORD',
 'VPK_SUM',
 'VPK_SUM_ORD',
 'VLPB',
 'SVLPB',
 'VSVLPB',
 'E_P',
 'E_P_Num',
 'E_N',
 'E_U',
 'G_R',
 'A_R',
 'T_R',
 'ToR',
 'P_R',
 'Ex_R',
 'R_R',
 'Ts_R']

__Nu bygger vi upp en pipeline__

In [29]:
# Column selector used as the first pipeline step: it picks the named DataFrame
# columns and passes their values on as an array. Having it lets numeric and
# character variables be routed through separate pipelines.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed list of DataFrame columns and return their values."""

    def __init__(self, attribute_names):
        # Stored as given (no copy is made)
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

# Custom transformer that creates dummy variables

class SetDummyVar(BaseEstimator, TransformerMixin):
    """Dummy-encode the named columns with ``pandas.get_dummies``."""

    def __init__(self, attribute_names):
        # Stored as given (no copy is made)
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return the dummy-encoded values of the configured columns of ``X``."""
        # NOTE(review): the dummy columns are derived from the frame passed to this
        # call, so presumably two frames with different category levels give
        # different output widths — confirm before using this on scoring data.
        subset = X[self.attribute_names]
        dummies = pd.get_dummies(subset, columns = self.attribute_names)
        return dummies.values

# Numeric branch: select the numeric columns, then impute with strategy "median"
numeric_steps = [
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: constructed here but not added to the union below
categorical_steps = [
    ('dummy_cat', SetDummyVar(cat_attribs)),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Only the numeric branch feeds the resulting feature matrix
full_pipeline = FeatureUnion(transformer_list=[("num_pipeline", num_pipeline)])



In [30]:
# Independent copy of df1; this is the frame passed to the pipelines below
df2 = df1.copy(deep = True)

__Bygger modellen__

In [32]:

# Vector holding what we want to predict
label = df2["Y"].copy()

# Feature matrix produced by fitting the preprocessing pipeline on df2
features = full_pipeline.fit_transform(df2)

In [33]:
from sklearn.linear_model import LogisticRegression

In [35]:
# Logistic regression with fixed hyperparameters
log_clf = LogisticRegression(solver="liblinear", C=0.1, random_state=42)

# Assess model strength: ROC AUC over five cross-validation folds
scores_opt_spik = cross_val_score(
    estimator=log_clf,
    X=features,
    y=label,
    scoring="roc_auc",
    cv=5,
)

scores_opt_spik

array([0.77639685, 0.71764316, 0.77316319, 0.7677373 , 0.76020347])

In [36]:
scores_opt_spik.mean()

0.7590287947687103

## Bygger modellen

In [37]:
# Fit once on the full data set. fit() hands back the estimator it was called on, so
# model_2019 and log_clf refer to the same fitted object; the second, identical fit
# that used to follow only repeated the work.
model_2019 = log_clf.fit(features, label)
predict19 = model_2019.predict_proba(features)

# ROC AUC on the same rows the model was fitted on (in-sample, not a hold-out score)
fpr, tpr, threshold = roc_curve(label, predict19[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

0.7610574923115534


In [38]:
import math

# Coefficients as nested Python lists; the first row is the one used below
c_list_19 = model_2019.coef_.tolist()

# Print "name: coefficient: odds ratio" for every numeric attribute
for var, par in zip(num_attribs, c_list_19[0]):
    OddsRatio = math.exp(float(par))
    name_part = "{}:".format(var)
    coef_part = "{}:".format(round(par, 3))
    print(name_part, coef_part, round(OddsRatio, 2))

VLP: 0.004: 1.0
SVLP: -0.005: 0.99
VSVLP: -0.097: 0.91
VPN_SUM: 0.336: 1.4
VPN_SUM_ORD: -0.053: 0.95
VPK_SUM: 0.304: 1.36
VPK_SUM_ORD: -0.051: 0.95
VLPB: 0.007: 1.01
SVLPB: -0.041: 0.96
VSVLPB: 0.021: 1.02
E_P: -0.249: 0.78
E_P_Num: -0.057: 0.94
E_N: -0.006: 0.99
E_U: -0.008: 0.99
G_R: 0.158: 1.17
A_R: 0.138: 1.15
T_R: 0.073: 1.08
ToR: 0.044: 1.05
P_R: 0.122: 1.13
Ex_R: 0.031: 1.03
R_R: 0.068: 1.07
Ts_R: 0.107: 1.11


In [None]:
['VPN_SUM','VPK_SUM','E_P','G_R','A_R','P_R','Ts_R']

In [None]:
# Now we keep only the variables with an odds ratio below 0.9 or above 1.1 and build a model with all interactions

In [68]:
# Reduced list of attribute names used for the second model.
# NOTE(review): this rebinds `features`, which earlier held the feature matrix used to
# fit the full model; re-running those earlier cells after this one would hand a list
# of names to the estimator instead of the matrix.
features = ['VPN_SUM','VPK_SUM','E_P','G_R','A_R','P_R','Ts_R']

In [69]:
len(features)

7

In [70]:
# num_attribs now refers to the same list object as `features` (no copy is made)
num_attribs = features

In [42]:
# Build a new pipeline on the selected variables.
# DataFrameSelector and SetDummyVar are reused from their definitions earlier in this
# notebook; the identical copies that used to be re-declared here only shadowed them.

# Pipeline for the numeric variables: select the columns, then impute with strategy "median"
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median"))
    ])

# Categorical pipeline: constructed but not added to the union below
cat_pipeline = Pipeline([
        ('dummy_cat', SetDummyVar(cat_attribs)),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
    ])



In [71]:

# Vector holding what we want to predict
label_red = df2["Y"].copy()

# Feature matrix for the reduced attribute list, from fitting the rebuilt pipeline on df2
features_red = full_pipeline.fit_transform(df2)

In [44]:
# Expand the reduced features with PolynomialFeatures (no bias column, otherwise library defaults).
# NOTE(review): in the cells visible here X2 is only inspected through X2.shape; the
# reduced model is cross-validated and fitted on features_red, not X2 — confirm
# whether the interaction terms were meant to enter the model.
X2 = preproc.PolynomialFeatures(include_bias = False).fit_transform(features_red)

In [45]:
X2.shape

(21897, 35)

In [72]:
# Fresh logistic regression with the same fixed hyperparameters as the full model
log_clf = LogisticRegression(solver="liblinear", C=0.1, random_state=42)

# Assess model strength: ROC AUC over five cross-validation folds on the reduced features
scores_opt_red = cross_val_score(
    estimator=log_clf,
    X=features_red,
    y=label_red,
    scoring="roc_auc",
    cv=5,
)

scores_opt_red

array([0.77561219, 0.71318904, 0.76680235, 0.76575962, 0.75670566])

In [73]:
scores_opt_red.mean()

0.7556137728971769

In [74]:
# Fit once on the reduced feature matrix. fit() hands back the estimator it was called
# on, so model_2019_red and log_clf refer to the same fitted object; the second,
# identical fit that used to follow only repeated the work.
model_2019_red = log_clf.fit(features_red, label_red)
predict19_red = model_2019_red.predict_proba(features_red)

# ROC AUC on the same rows the model was fitted on (in-sample, not a hold-out score)
fpr, tpr, threshold = roc_curve(label_red, predict19_red[:,1])
roc_auc = auc(fpr, tpr)
print(roc_auc)

0.7560124988000232


In [75]:
# Coefficients of the reduced model as nested Python lists
c_list_19_red = model_2019_red.coef_.tolist()

In [76]:
num_attribs

['VPN_SUM', 'VPK_SUM', 'E_P', 'G_R', 'A_R', 'P_R', 'Ts_R']

In [77]:
# Pair every selected attribute with its fitted coefficient (first coefficient row)
coef_pairs = list(zip(num_attribs, c_list_19_red[0]))

rank = [name for name, coef in coef_pairs]
parlist = [coef for name, coef in coef_pairs]
# Odds ratio = exp(coefficient), rounded to two decimals
oddsr = [round(math.exp(float(coef)), 2) for name, coef in coef_pairs]

dictlist = {'Rank': rank, 'Parameter': parlist, 'Oddskvot': oddsr}
# Convert to a DataFrame and display it
df_val_19 = pd.DataFrame.from_dict(dictlist)
df_val_19


Unnamed: 0,Rank,Parameter,Oddskvot
0,VPN_SUM,0.755332,2.13
1,VPK_SUM,0.958573,2.61
2,E_P,-0.349813,0.7
3,G_R,0.177293,1.19
4,A_R,0.191708,1.21
5,P_R,0.171846,1.19
6,Ts_R,0.186135,1.2


In [79]:
# sklearn.externals.joblib was a vendored copy that has been deprecated and then
# removed from scikit-learn. Prefer the standalone joblib package and fall back to
# the vendored copy only where the standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Pipeline object
# NOTE(review): the pipeline holds a DataFrameSelector instance, a class defined in
# this notebook; presumably whatever loads the pickle must define the same class
# first — confirm against the scoring code.
joblib.dump(full_pipeline, 'Pipeline_v9.pkl')

# Model object
joblib.dump(model_2019_red, 'Travmodel_v9.pkl')

['Travmodel_v9.pkl']