In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from meteostat import Point, Daily, Stations

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


## Data Wrangling & Preprocessing

In [2]:
df = pd.read_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\international_matches_FINAL.csv')
df_city_locations = pd.read_excel('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\worldcities_excel.xlsx')

In [3]:
df_city_locations.head(2)

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6839,139.7744,Japan,JP,JPN,Tōkyō,primary,39105000.0,1392685764
1,Jakarta,Jakarta,-6.2146,106.8451,Indonesia,ID,IDN,Jakarta,primary,35362000.0,1360771077


In [4]:
# city, city_ascii, country lowercase
df_city_locations['city'] = df_city_locations['city'].str.lower()
df_city_locations['city_ascii'] = df_city_locations['city_ascii'].str.lower()
df_city_locations['country'] = df_city_locations['country'].str.lower()

In [5]:
# extract city_ascii, lat, lng, country, population from df_city_locations
df_city_locations = df_city_locations[['city', 'lat', 'lng', 'population']]

In [6]:
df_city_locations.head(2)

Unnamed: 0,city,lat,lng,population
0,tokyo,35.6839,139.7744,39105000.0
1,jakarta,-6.2146,106.8451,35362000.0


In [7]:
df.head(2)

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_score,...,shoot_out,home_team_result,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score
0,2004-09-03,Spain,Scotland,Europe,Europe,3,67,788,535,1,...,No,Draw,94.0,84.0,86.5,89.3,89.5,80.2,79.7,81.8
1,2004-09-04,Austria,England,Europe,Europe,90,7,488,732,2,...,No,Draw,83.0,88.0,76.2,73.0,74.0,90.5,88.7,91.2


In [8]:
# variance of shoot_out column
# make shoot_out column binary
df['shoot_out'] = df['shoot_out'].apply(lambda x: 1 if x == 'yes' else 0)
df['shoot_out'].var() # no variance so we can drop this column

0.0

In [9]:
# remove variables that may cause leakage 
# remove variables that are not useful for modelling
df.drop(['home_team_score', # leakage
        'away_team_score', # leakage
        #'date', # not useful for modelling
        'shoot_out', # no variance
        'neutral_location', # not useful for this problem
        #'tournament'  # not useful for modelling
        ],
        inplace=True, axis=1)

In [10]:
# drop friendly matches in tournament column
df = df[df['tournament'] != 'Friendly']

In [11]:
# use country and city to find weather data for each match
df['date'] = pd.to_datetime(df['date'])
df[['city','country','tournament','date']].head(3)

Unnamed: 0,city,country,tournament,date
1,Vienna,Austria,FIFA World Cup qualification,2004-09-04
2,Zagreb,Croatia,FIFA World Cup qualification,2004-09-04
3,Reykjavík,Iceland,FIFA World Cup qualification,2004-09-04


In [12]:
# lower case city and country
df['city'] = df['city'].str.lower()
df['country'] = df['country'].str.lower()

In [13]:
# map lat and lng to each match in df
df = df.merge(df_city_locations, how='left', left_on=['city'], right_on=['city'])

In [14]:
df[['city','lat','lng']].head(3)

Unnamed: 0,city,lat,lng
0,vienna,48.2083,16.3725
1,vienna,38.8996,-77.2597
2,vienna,39.324,-81.5383


In [15]:
df.dropna(axis = 0, inplace = True)

In [16]:
df.shape

(4759, 24)

In [17]:
df.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'tournament', 'city', 'country', 'home_team_result',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score', 'lat',
       'lng', 'population'],
      dtype='object')

In [18]:
# get weather data for matches
df['avg_temp'] = 0
for i in range(len(df)):
    df['avg_temp'][i] = Daily(Point(df['lat'][i],df['lng'][i]), start=df['date'][i], end=df['date'][i]).fetch()['tavg']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'

In [20]:
# Remove any rows with missing values
df.dropna(axis = 0, inplace = True)

In [21]:
# Remove variables that are not eeded for modelling
df.drop(['date','city','country','tournament','lat','lng'], axis=1, inplace=True)

## Modeling (XGB, Random Forest, Logistic Regression, LGBM)

In [32]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression

In [33]:
y = df['home_team_result']
X = df.drop(['home_team_result'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42,
                                                    stratify=y)

In [34]:
# baseline model
y_train.value_counts(normalize=True)

Win     0.498686
Lose    0.269190
Draw    0.232124
Name: home_team_result, dtype: float64

### Label Encoding for Y

In [35]:
# label encode the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

le.classes_, le.transform(le.classes_)

(array(['Draw', 'Lose', 'Win'], dtype=object), array([0, 1, 2]))

### Make Dummies (One Hot Encoding for X's)

In [36]:
# create dummy variables for categorical variables
train_cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train = pd.get_dummies(X_train, columns=train_cat_cols, drop_first=True)
test_cat_cols = [col for col in X_test.columns if X_test[col].dtype == 'object']
X_test = pd.get_dummies(X_test, columns=test_cat_cols, drop_first=True)

In [37]:
X_train.shape,X_test.shape

((3804, 191), (951, 184))

In [38]:
# fill any columns that are missing in X_train or X_test with 0
for col in set(X_test.columns) - set(X_train.columns):
    X_train[col] = 0

for col in set(X_train.columns) - set(X_test.columns):
    X_test[col] = 0

In [39]:
# make sure the columns are in the same order
X_train = X_train[X_test.columns]

In [42]:
# save X_train, X_test, y_train, y_test
X_train.to_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\X_train.csv', index=False)
X_test.to_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\X_test.csv', index=False)
pd.DataFrame(y_train).to_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\y_test.csv', index=False)

### Scaling

In [30]:
# scaling data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [43]:
# build logistic regression model grid search best recall score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42, max_iter=1000)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'penalty': ['l2','l1']}
grid = GridSearchCV(lr, param_grid, cv=5)
grid.fit(X_train_scaled, y_train)


35 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\shann\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shann\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\shann\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.57781658        nan 0.577

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=1000, random_state=42),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l2', 'l1']})

In [44]:
# import roc auc score
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test set score: {:.2f}".format(grid.score(X_test_scaled, y_test)))
# print roc_auc_score with ovr 
print("Test set AUC: {:.2f}".format(roc_auc_score(y_test, grid.predict_proba(X_test_scaled), multi_class='ovr')))

# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, grid.predict(X_test_scaled)))

Best cross-validation score: 0.58
Best parameters:  {'C': 100, 'penalty': 'l2'}
Test set score: 0.59
Test set AUC: 0.73
              precision    recall  f1-score   support

           0       0.50      0.28      0.36       221
           1       0.51      0.48      0.49       256
           2       0.65      0.80      0.71       474

    accuracy                           0.59       951
   macro avg       0.55      0.52      0.52       951
weighted avg       0.57      0.59      0.57       951



In [98]:
# grid search logistic regression for best recall score
from sklearn.model_selection import GridSearchCV

# define model
logreg = LogisticRegression()

# define grid
grid = dict()

# define search
search = GridSearchCV(logreg, grid, scoring='recall_macro', cv=cv, n_jobs=-1)

# perform the search
results = search.fit(X_train_scaled, y_train)

# summarize
print('Best Score: %s' % results.best_score_)

Best Score: 0.5084708503249609


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [99]:
results.best_estimator_

LogisticRegression()

## XGBoost

In [48]:
# grid search logistic regression for best recall score
from sklearn.model_selection import GridSearchCV
# repeated stratified k-fold cross-validation
from sklearn.model_selection import RepeatedStratifiedKFold

# define model
xgb_classifier = XGBClassifier()

# define grid
grid = dict()

# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# define search
search = GridSearchCV(xgb_classifier, grid, scoring='recall_macro', cv=cv, n_jobs=-1)

# perform the search
results = search.fit(X_train_scaled, y_train)

# summarize
print('Best Score: %s' % results.best_score_)

Best Score: 0.7518883031807889


In [50]:
roc_auc_score(y_test, results.best_estimator_.predict_proba(X_test_scaled),multi_class='ovr')

0.9143680157271624

In [31]:
# build xgboost model
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train_scaled, y_train)

# roc auc score - OVR
print("AUC Score:",roc_auc_score(y_test, xgb.predict_proba(X_test_scaled), multi_class='ovr'))

# classification report
print(classification_report(y_test, xgb.predict(X_test_scaled)))

AUC Score: 0.9143680157271624
              precision    recall  f1-score   support

           0       0.80      0.64      0.71       221
           1       0.75      0.74      0.75       256
           2       0.81      0.89      0.85       474

    accuracy                           0.79       951
   macro avg       0.79      0.76      0.77       951
weighted avg       0.79      0.79      0.79       951



In [87]:
# randomized search cv - xgboost
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'min_child_weight': [1, 3, 5, 7]
}

xgb_cv = XGBClassifier()
xgb_cv = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid,
                            n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

xgb_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin=256,...
                                           predictor='auto', random_state=42,
                                           reg_alpha=0, ...)

In [93]:
# save xgb model
xgb_cv.best_estimator_.save_model('xgb_cv_model.json')

In [88]:
# ROC - AUC score
print("AUC Score:",roc_auc_score(y_test, xgb_cv.predict_proba(X_test_scaled), multi_class='ovr'))

# classification report
print(classification_report(y_test, xgb_cv.predict(X_test_scaled)))

AUC Score: 0.9118655883151526
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       377
           1       0.78      0.72      0.75       432
           2       0.81      0.87      0.84       753

    accuracy                           0.78      1562
   macro avg       0.77      0.75      0.76      1562
weighted avg       0.78      0.78      0.78      1562



## Random Forest

In [44]:
# build random forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

# roc auc score - OVR
print("AUC Score:",roc_auc_score(y_test, rf.predict_proba(X_test_scaled), multi_class='ovr'))

# print classification report
print(classification_report(y_test, rf.predict(X_test_scaled)))

AUC Score: 0.9415639837703739
              precision    recall  f1-score   support

           0       0.86      0.66      0.75       221
           1       0.77      0.81      0.79       256
           2       0.83      0.90      0.86       474

    accuracy                           0.82       951
   macro avg       0.82      0.79      0.80       951
weighted avg       0.82      0.82      0.82       951



In [45]:
# save rf model
import pickle
pickle.dump(rf, open('rf_model.pkl', 'wb'))

In [None]:
# randomized search cv - random forest
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_cv = RandomForestClassifier()
rf_cv = RandomizedSearchCV(estimator = rf, param_distributions = param_grid,
                            n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   random_state=42, verbose=2)

In [None]:
# ROC - AUC score
print("AUC Score:",roc_auc_score(y_test, rf_cv.predict_proba(X_test_scaled), multi_class='ovr'))

# classification report
print(classification_report(y_test, rf_cv.predict(X_test_scaled)))


AUC Score: 0.8586917837197444
              precision    recall  f1-score   support

           0       0.97      0.20      0.33       377
           1       0.65      0.64      0.64       432
           2       0.66      0.92      0.77       753

    accuracy                           0.67      1562
   macro avg       0.76      0.59      0.58      1562
weighted avg       0.73      0.67      0.63      1562



## LGBM

In [90]:
from lightgbm import LGBMClassifier

# build lightgbm model
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_scaled, y_train)

# roc auc score - OVR
print("AUC Score:",roc_auc_score(y_test, lgbm.predict_proba(X_test_scaled), multi_class='ovr'))

# classification report
print(classification_report(y_test, lgbm.predict(X_test_scaled)))

AUC Score: 0.8830836292807621
              precision    recall  f1-score   support

           0       0.73      0.54      0.62       377
           1       0.71      0.69      0.70       432
           2       0.76      0.87      0.81       753

    accuracy                           0.74      1562
   macro avg       0.73      0.70      0.71      1562
weighted avg       0.74      0.74      0.73      1562



In [91]:
# randomized search cv - lightgbm
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],  
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7]
}

lgbm_cv = LGBMClassifier()
lgbm_cv = RandomizedSearchCV(estimator = lgbm, param_distributions = param_grid,
                            n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

lgbm_cv.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(random_state=42), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.5, 0.6, 0.7, 0.8,
                                                             0.9, 1.0],
                                        'learning_rate': [0.01, 0.05, 0.1, 0.2,
                                                          0.3],
                                        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
                                        'min_child_weight': [1, 3, 5, 7],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500],
                                        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9,
                                                      1.0]},
                   random_state=42, verbose=2)

In [100]:
# save lgbm model
lgbm_cv.best_estimator_.booster_.save_model('lgbm_cv_model.txt')

<lightgbm.basic.Booster at 0x14638796430>

In [None]:
# ROC - AUC score
print("AUC Score:",roc_auc_score(y_test, lgbm_cv.predict_proba(X_test_scaled), multi_class='ovr'))

# classification report
print(classification_report(y_test, lgbm_cv.predict(X_test_scaled)))

AUC Score: 0.9030019639359516
              precision    recall  f1-score   support

           0       0.71      0.69      0.70       377
           1       0.77      0.73      0.75       432
           2       0.82      0.86      0.84       753

    accuracy                           0.78      1562
   macro avg       0.77      0.76      0.76      1562
weighted avg       0.78      0.78      0.78      1562



## World Cup Predictions

### Importing Group Stage Data and Prepping for Predictions

In [80]:
# load xgb model
import xgboost as xgb
import pandas as pd
import numpy as np
import pickle
import lightgbm as lgb

In [81]:
group_stage = pd.read_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\fifa_worldcup_2022_groupstages.csv')
group_stage_df = pd.read_csv('C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\fifa_worldcup_2022_groupstages.csv')

In [82]:
group_stage_df['city'] = group_stage_df['city'].str.lower()
group_stage_df['date'] = pd.to_datetime(group_stage_df['date'])

In [83]:
# if city == al rayyan, population = 759,000
# if city == doha, population = 2,382,000
# if city == al khor, population = 214,767
# if city == al wakrah, population = 94,272
# if city == lusail, population = 198,600 
# google search: population of qatar cities
group_stage_df['population'] = group_stage_df['city'].map({'al rayyan': 759000, 'doha': 2382000, 
'al khor': 214767, 'al wakrah': 94272, 'lusail': 198600}) 

In [84]:
# import geopy and geolocator
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="GetLoc")

# get latitude and longitude for each city
group_stage_df['lat'] = group_stage_df['city'].apply(lambda x: geolocator.geocode(x).latitude)
group_stage_df['lng'] = group_stage_df['city'].apply(lambda x: geolocator.geocode(x).longitude)

In [85]:
# use lat and long to get weather data for matches
group_stage_df['avg_temp'] = 0
for i in range(len(group_stage_df)):
    group_stage_df['avg_temp'][i] = Daily(Point(group_stage_df['lat'][i],group_stage_df['lng'][i]), 
    start=group_stage_df['date'][i], end=group_stage_df['date'][i]).fetch()['tavg']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [86]:
# impute missing values with mean
group_stage_df['avg_temp'] = group_stage_df['avg_temp'].fillna(group_stage_df['avg_temp'].mean())

In [87]:
# just to check sample rows
group_stage_df.head(2)

Unnamed: 0,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,tournament,...,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,away_team_mean_offense_score,away_team_mean_midfield_score,group,population,lat,lng,avg_temp
0,2022-11-20,Qatar,Ecuador,Asia,South America,50,44,1439.89,1464.39,FIFA World Cup,...,80.3,77.5,73.5,78.333333,76.0,Group A,214767,25.559884,55.56402,24.895745
1,2022-11-21,Senegal,Netherlands,Africa,Europe,18,8,1584.38,1694.51,FIFA World Cup,...,81.333333,79.0,86.5,83.0,84.5,Group A,2382000,25.285633,51.526416,24.4


### Modelling

#### Prepping Dataset

In [88]:
group_stage_df.columns

Index(['date', 'home_team', 'away_team', 'home_team_continent',
       'away_team_continent', 'home_team_fifa_rank', 'away_team_fifa_rank',
       'home_team_total_fifa_points', 'away_team_total_fifa_points',
       'tournament', 'city', 'country', 'neutral_location',
       'home_team_goalkeeper_score', 'away_team_goalkeeper_score',
       'home_team_mean_defense_score', 'home_team_mean_offense_score',
       'home_team_mean_midfield_score', 'away_team_mean_defense_score',
       'away_team_mean_offense_score', 'away_team_mean_midfield_score',
       'group', 'population', 'lat', 'lng', 'avg_temp'],
      dtype='object')

In [94]:
group_stage_df = group_stage_df.sort_values(by=['home_team', 'away_team'])
group_stage_df.reset_index(drop=True, inplace=True)

betting_data = pd.read_csv("C:\\Users\\shann\\Documents\\GitHub\\15095-project\\data\\group_stage_betting_odds_final_cleaned.csv")


# for each row i in group_stage_df, check if the home_team and away_team are the same as in the betting_data
# if not then print row number
for i in range(len(group_stage_df)):
    if group_stage_df['home_team'][i] != betting_data['home_team'][i] or group_stage_df['away_team'][i] != betting_data['away_team'][i]:
        print(i)

In [96]:
# drop columns that are not needed for one hot encoding
group_stage_df.drop(['date','tournament','city','country',
'group','neutral_location','lat','lng'],axis = 1, inplace = True)

In [97]:
# create dummy variables
group_stage_df = pd.get_dummies(group_stage_df)

In [98]:
# find previously removed columns in group_stage_df and set values to 0
for col in set(X_train.columns) - set(group_stage_df.columns):
    group_stage_df[col] = 0

group_stage_df = group_stage_df[X_train.columns]



In [99]:
group_stage_df.reset_index(drop=True, inplace=True)

In [100]:
group_stage_df.head(2)

Unnamed: 0,home_team_fifa_rank,away_team_fifa_rank,home_team_total_fifa_points,away_team_total_fifa_points,home_team_goalkeeper_score,away_team_goalkeeper_score,home_team_mean_defense_score,home_team_mean_offense_score,home_team_mean_midfield_score,away_team_mean_defense_score,...,away_team_continent_Oceania,away_team_continent_South America,home_team_Congo DR,away_team_Belarus,home_team_Cabo Verde,away_team_United Arab Emirates,away_team_Benin,home_team_Togo,home_team_Angola,home_team_India
0,3,13,1773.88,1644.89,84,80,82.25,89.0,84.75,78.5,...,0,0,0,0,0,0,0,0,0,0
1,3,51,1773.88,1437.78,84,70,82.25,89.0,84.75,72.75,...,0,0,0,0,0,0,0,0,0,0


#### Generating Predictions

In [101]:
# scale data
group_stage_df_scaled = scaler.transform(group_stage_df)

In [75]:
# load xgb model
xgb_cv = xgb.Booster()
xgb_cv.load_model('xgb_cv_model.json')
xgb_cv_pred = xgb_cv.predict(xgb.DMatrix(group_stage_df_scaled))
xgb_cv_pred = pd.DataFrame(xgb_cv_pred)
xgb_cv_pred.rename(columns={0:'draw', 1:'home_loss', 2:'home_win'}, inplace=True)
xgb_cv_pred['home_team'] = group_stage['home_team']
xgb_cv_pred['away_team'] = group_stage['away_team']
xgb_cv_pred=xgb_cv_pred[['home_team','away_team','home_win','draw','home_loss']]
# sort new_xgb_pred by home_team and then away_team
xgb_cv_pred = xgb_cv_pred.sort_values(by=['home_team','away_team'])
xgb_cv_pred.to_csv('new_xgb_pred2.csv', index=False)


In [64]:
new_xgb_pred = pd.DataFrame(new_xgb_pred)
new_xgb_pred.columns = ['draw', 'home_loss', 'home_win']
new_xgb_pred['home_team'] = group_stage['home_team']
new_xgb_pred['away_team'] = group_stage['away_team']
new_xgb_pred=new_xgb_pred[['home_team','away_team','home_win','draw','home_loss']]
# sort new_xgb_pred by home_team and then away_team
new_xgb_pred = new_xgb_pred.sort_values(by=['home_team','away_team'])
new_xgb_pred.to_csv('new_xgb_pred.csv', index=False)