In [63]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split,GridSearchCV , StratifiedKFold
from sklearn.metrics import f1_score , confusion_matrix , classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Preprocessing

In [2]:
# Load the match table; the first CSV column becomes the row index.
dataset = pd.read_csv('main_data.csv', index_col=0)

# Every column except the label is a model input.
features = dataset.columns.tolist()
features.remove('target')

# The first 19 inputs form the home block, the remainder the away block
# (assumes the CSV orders its columns home-first -- TODO confirm).
home_features, away_features = features[:19], features[19:]

In [26]:
# Difference-column names: each home column name with its last four characters
# replaced by 'diff' (presumably a 'home' suffix -- TODO confirm against the CSV).
features_19 = [f"{column[:-4]}diff" for column in home_features]
def dataset19(df):
    """Build the home-minus-away difference table from the full feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``home_features`` and
        ``away_features`` plus a ``'target'`` column.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``features_19`` holding the element-wise
        difference (home - away), followed by ``'target'``. Rows keep the
        index of ``df``.
    """
    # Subtract the raw arrays so columns pair up by position, not by label.
    differences = df[home_features].to_numpy() - df[away_features].to_numpy()
    # Carry df's index over: the 'target' assignment below aligns on index
    # labels, so a default 0..n-1 index would yield NaN (or wrong rows) for
    # any df that is not itself indexed 0..n-1.
    dataset_19 = pd.DataFrame(differences, columns=features_19, index=df.index)
    dataset_19['target'] = df['target']
    return dataset_19

In [34]:
# Home-minus-away difference table derived from the full dataset.
data_19_feature = dataset19(df=dataset)

In [31]:
# Seed passed to the train/test split and to every model constructor below.
random_state: int = 1

In [36]:
def split_data(df):
    """Split a feature table into stratified train and test partitions.

    The ``'target'`` column is the label; every other column is a feature.
    20% of the rows go to the test partition. The split is shuffled with the
    module-level ``random_state`` seed and stratified on the label values.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df['target']
    predictors = df.drop(columns='target')
    partitions = train_test_split(
        predictors,
        labels,
        test_size=0.2,
        random_state=random_state,
        shuffle=True,
        stratify=labels.values,
    )
    return tuple(partitions)

In [37]:
# Split the full feature table and the difference table the same way.
X_train_38, X_test_38, y_train_38, y_test_38 = split_data(df=dataset)
X_train_19, X_test_19, y_train_19, y_test_19 = split_data(df=data_19_feature)

# Sanity check: report the shape of every partition.
for caption, part in (
    ("X_train_38 shape is:", X_train_38),
    ("y_train_38 shape is:", y_train_38),
    ("X_test_38 shape is:", X_test_38),
    ("y_test_38 shape is:", y_test_38),
    ("X_train_19 shape is:", X_train_19),
    ("y_train_19 shape is:", y_train_19),
    ("X_test_19 shape is:", X_test_19),
    ("y_test_19 shape is:", y_test_19),
):
    print(caption, part.shape)

X_train_38 shape is: (6968, 38)
y_train_38 shape is: (6968,)
X_test_38 shape is: (1743, 38)
y_test_38 shape is: (1743,)
X_train_19 shape is: (6968, 19)
y_train_19 shape is: (6968,)
X_test_19 shape is: (1743, 19)
y_test_19 shape is: (1743,)


In [50]:
def preprocessing(train,test):
    """Standardise features using statistics learned from the training split only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature tables to scale; ``test`` is transformed with the scaler
        fitted on ``train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_scaled, test_scaled)``, each carrying the column names and
        the row index of its input.
    """
    scaler = StandardScaler()
    # Fit on train only, then apply the same fitted scaler to test.
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    # Re-attach each input's index: a fresh 0..n-1 index would no longer match
    # the label Series produced by the shuffled split, so any later
    # index-aligned pandas operation would pair features with the wrong labels.
    X_preprocessed = pd.DataFrame(train_scaled, columns=train.columns, index=train.index)
    test_preprocessed = pd.DataFrame(test_scaled, columns=test.columns, index=test.index)
    return X_preprocessed, test_preprocessed

In [56]:
# Scale each feature set independently; each scaler is fitted on its own training split.
X_train_38_preprocessed, X_test_38_preprocessed = preprocessing(train=X_train_38, test=X_test_38)
X_train_19_preprocessed, X_test_19_preprocessed = preprocessing(train=X_train_19, test=X_test_19)

# Modeling

In [64]:
# One estimator per model family, all built with the shared seed.
logistic_regression = LogisticRegression(random_state=random_state)
XGC = XGBClassifier(random_state=random_state)
# The instance gets its own lowercase name so the imported `SVC` class is not
# shadowed; rebinding `SVC` to an instance makes this cell fail on a second run
# with "TypeError: 'SVC' object is not callable". The estimator is reachable
# as `svc` or `models['SVC']`.
svc = SVC(random_state=random_state)

# Display name -> estimator, iterated by the evaluation loop.
models = {
    'logistic_regression': logistic_regression,
    'SVC': svc,
    'XGC': XGC,
}

In [65]:
def evaluation(model,x_train,y_train,x_test,y_test):
    """Fit ``model`` on the training split and report its test-split metrics.

    Prints the weighted F1 score, the confusion matrix and the classification
    report, each starting on its own line.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels.

    Returns
    -------
    The weighted F1 score computed by ``f1_score`` on the test split, so
    callers can collect and compare results.
    """
    model.fit(x_train, y_train)
    y_predicted = model.predict(x_test)
    f1 = f1_score(y_test, y_predicted, average='weighted')
    print('The f1 score is ',f1)
    # Printed separately: a single print() joins them with a space, which puts
    # the report's header on the same line as the matrix's last row.
    print(confusion_matrix(y_true=y_test, y_pred=y_predicted))
    print(classification_report(y_true=y_test, y_pred=y_predicted))
    return f1

In [67]:
# Fit and score every registered model on both feature sets.
banner = '-*' * 20
feature_sets = (
    (' Using 38 features ', (X_train_38_preprocessed, y_train_38, X_test_38_preprocessed, y_test_38)),
    (' Using 19 features ', (X_train_19_preprocessed, y_train_19, X_test_19_preprocessed, y_test_19)),
)
for name, model in models.items():
    print(name)
    for heading, splits in feature_sets:
        print(banner + heading + banner)
        # The same estimator object is refitted for each feature set.
        evaluation(model, *splits)

logistic_regression
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* Using 38 features -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
The f1 score is  0.4296153434231748
[[282 250   8]
 [149 588  13]
 [153 295   5]]               precision    recall  f1-score   support

         0.0       0.48      0.52      0.50       540
         1.0       0.52      0.78      0.62       750
         2.0       0.19      0.01      0.02       453

    accuracy                           0.50      1743
   macro avg       0.40      0.44      0.38      1743
weighted avg       0.42      0.50      0.43      1743

-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-* Using 19 features -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
The f1 score is  0.4288095786217167
[[291 249   0]
 [154 595   1]
 [156 297   0]]               precision    recall  f1-score   support

         0.0       0.48      0.54      0.51       540
         1.0       0.52      0.79      0.63       750
         2.0       0.00      0.00      0.00       453

    accuracy     