In [1]:
import pandas as pd

# Load the processed match table from disk.
final_input = pd.read_csv("../data/processed/final.csv")

# print(...) with a single argument emits the same text on Python 2 and
# Python 3; the bare `print "..."` statement is a SyntaxError on Python 3.
print("Dropping na for scores")

# Discard every row where score1 or score2 is missing. Assigning the result
# (instead of inplace=True) yields the same frame and keeps the original row
# labels, so the index may contain gaps after this step.
final_input = final_input.dropna(subset=['score1', 'score2'])
print(final_input.head())

Dropping na for scores
   year  month  day  league_id                   league  cntry_att  cntry_def  \
0  2016      8   12       1843           French Ligue 1       2.69       0.54   
1  2016      8   12       1843           French Ligue 1       2.69       0.54   
2  2016      8   13       2411  Barclays Premier League       2.71       0.54   
3  2016      8   13       2411  Barclays Premier League       2.71       0.54   
4  2016      8   13       2411  Barclays Premier League       2.71       0.54   

   cntry_spi  cntry_rank           team1  ... a_gl_dif_5  a_gl_dif_10  \
0      85.77           8          Bastia  ...          0            0   
1      85.77           8       AS Monaco  ...          0            0   
2      86.09           7       Hull City  ...          0            0   
3      86.09           7  Crystal Palace  ...          0            0   
4      86.09           7         Everton  ...          0            0   

   h_gfw_5  h_gfs_5  h_gaw_5  h_gas_5  a_gfw_5  a_g

In [2]:
# One-hot encode the categorical text columns. pd.get_dummies replaces each
# listed column with one indicator column per distinct value, named
# "<column>_<value>" (e.g. "team2_Wolverhampton"); all other columns are
# carried over unchanged.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("one-hot encode categorical columns: league, team1 and team2")
columns_to_dummy = ['league', 'team1', 'team2']
df = pd.get_dummies(final_input, columns=columns_to_dummy)
print(df.head())

one-hot encode categorical columns: league, team1 and team2
   year  month  day  league_id  cntry_att  cntry_def  cntry_spi  cntry_rank  \
0  2016      8   12       1843       2.69       0.54      85.77           8   
1  2016      8   12       1843       2.69       0.54      85.77           8   
2  2016      8   13       2411       2.71       0.54      86.09           7   
3  2016      8   13       2411       2.71       0.54      86.09           7   
4  2016      8   13       2411       2.71       0.54      86.09           7   

    spi1   spi2  ...  team2_Wolverhampton  team2_Wuhan Zall  \
0  51.16  85.68  ...                    0                 0   
1  68.85  56.48  ...                    0                 0   
2  53.57  66.81  ...                    0                 0   
3  55.19  58.66  ...                    0                 0   
4  68.02  73.25  ...                    0                 0   

   team2_Wycombe Wanderers  team2_Yeni Malatyaspor  team2_Yeovil Town  \
0            

In [3]:
# Impute and standardize every column of `df` except the two score columns.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Standardizing numeric columns")
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Step 1 fills missing values with the per-column median; step 2 rescales each
# column to zero mean and unit variance.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Every column except score1/score2 goes through the pipeline. Index.drop
# raises KeyError if either score column is absent; .tolist() keeps
# `df_columns` a plain Python list, as before.
# NOTE(review): this list also contains league_id and the one-hot indicator
# columns, so those are standardized too even though the message above says
# "numeric columns" -- confirm that is intended.
df_columns = df.columns.drop(['score1', 'score2']).tolist()

# NOTE(review): the imputer medians and scaler statistics are fit on the full
# frame, i.e. before any train/test split, so held-out rows influence the
# transform. Consider fitting on the training rows only and calling
# .transform() on the test rows.
df[df_columns] = num_pipeline.fit_transform(df[df_columns])
print(df.head())

Standardizing numeric columns
       year     month       day  league_id  cntry_att  cntry_def  cntry_spi  \
0 -2.393402  0.344813 -0.439434  -0.410628   0.419121  -0.111318   0.469052   
1 -2.393402  0.344813 -0.439434  -0.410628   0.419121  -0.111318   0.469052   
2 -2.393402  0.344813 -0.326878   0.372279   0.443934  -0.111318   0.487456   
3 -2.393402  0.344813 -0.326878   0.372279   0.443934  -0.111318   0.487456   
4 -2.393402  0.344813 -0.326878   0.372279   0.443934  -0.111318   0.487456   

   cntry_rank      spi1      spi2  ...  team2_Wolverhampton  team2_Wuhan Zall  \
0   -0.438110  0.310132  2.161045  ...            -0.042699         -0.021561   
1   -0.438110  1.256416  0.598076  ...            -0.042699         -0.021561   
2   -0.502062  0.439049  1.151003  ...            -0.042699         -0.021561   
3   -0.502062  0.525707  0.714764  ...            -0.042699         -0.021561   
4   -0.502062  1.212017  1.495713  ...            -0.042699         -0.021561   

   team2

In [5]:
# Randomly hold out 20% of the rows as a test set; random_state pins the
# shuffle so the same rows are selected on every run.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("splitting training/test set")
from sklearn.model_selection import train_test_split, cross_val_score

# NOTE(review): `df` has already been imputed and standardized using statistics
# from all rows, so the test set is not fully unseen by the preprocessing.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
print(train_set.head())

splitting training/test set
           year     month       day  league_id  cntry_att  cntry_def  \
18120  1.144440 -1.444267 -0.664547  -0.406493   0.940191   0.040946   
21393  1.144440 -0.549727 -1.339884  -0.395466  -0.089543  -0.202677   
8021  -0.034841 -1.444267  0.235904  -0.381683   0.046928  -0.019960   
12594 -0.034841  0.642993 -1.677553  -0.267279  -0.461736   0.680458   
15410 -0.034841  1.239353 -0.777103  -0.406493   0.940191   0.040946   

       cntry_spi  cntry_rank      spi1      spi2  ...  team2_Wolverhampton  \
18120   0.642167   -0.757870 -0.846378 -0.266909  ...            -0.042699   
21393   0.246476   -0.182302  0.633227  1.943193  ...            -0.042699   
8021    0.256828   -0.246254  0.052833  0.744203  ...            -0.042699   
12594  -0.388470    0.585121  0.264129 -0.076890  ...            -0.042699   
15410   0.642167   -0.757870 -0.592288 -0.480479  ...            -0.042699   

       team2_Wuhan Zall  team2_Wycombe Wanderers  team2_Yeni Malatyasp