In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings

# Install a blanket 'ignore' filter so warnings raised in this kernel process
# are not displayed from here on.
# NOTE(review): a blanket filter can also hide useful diagnostics (e.g. the
# warning GridSearchCV emits when candidate fits fail) - consider narrowing it.
warnings.filterwarnings('ignore')

## Read Train Data

In [2]:
# Load the per-team training features (the first CSV column is the row index)
train_home_team_statistics_df = pd.read_csv(
    '../Train_Data/train_home_team_statistics_df.csv', index_col=0
)
train_away_team_statistics_df = pd.read_csv(
    '../Train_Data/train_away_team_statistics_df.csv', index_col=0
)

# Load the training targets
train_labels = pd.read_csv('../Train_Data/Y_train.csv', index_col=0)

# Drop the first two columns (league and team name) so the model generalizes,
# and tag every remaining feature with the side of the fixture it describes
train_home = train_home_team_statistics_df.iloc[:, 2:].add_prefix('HOME_')
train_away = train_away_team_statistics_df.iloc[:, 2:].add_prefix('AWAY_')

# Put home and away features side by side, keeping only rows present in both,
# and treat infinite values as missing
train_data = (
    pd.concat([train_home, train_away], join='inner', axis=1)
    .replace([np.inf, -np.inf], np.nan)
)

# Align the labels with the feature rows and convert them to a list of rows
# (one [HOME_WINS, DRAW, AWAY_WINS]-style row per match; nothing is merged)
train_labels = train_labels.loc[train_data.index].to_numpy().tolist()

In [3]:
# Load the per-team test features (the first CSV column is the row index) and
# tag every column with the side of the fixture it describes.
# NOTE(review): unlike the training cell, no leading columns are dropped here -
# confirm the test CSVs do not carry league / team-name columns.
test_home = pd.read_csv(
    '../Test_Data/test_home_team_statistics_df.csv', index_col=0
).add_prefix('HOME_')
test_away = pd.read_csv(
    '../Test_Data/test_away_team_statistics_df.csv', index_col=0
).add_prefix('AWAY_')

# Load the public test targets
test_labels = pd.read_csv('../Test_Data/Y_test.csv', index_col=0)

# Put home and away features side by side, keeping only rows present in both,
# and treat infinite values as missing
test_data = (
    pd.concat([test_home, test_away], join='inner', axis=1)
    .replace([np.inf, -np.inf], np.nan)
)

# Align the labels with the feature rows and convert them to a list of rows
test_labels = test_labels.loc[test_data.index].to_numpy().tolist()

## Problem Description
The target score is the prediction accuracy for the vector `[HOME_WINS, DRAW, AWAY_WINS]`, which takes one of three possible values: `[1,0,0]`, `[0,1,0]`, or `[0,0,1]`. The train-test split is provided by the original problem data package.

In [4]:
# No splitting happens here: the data package already ships separate train and
# test sets, so the loaded frames and labels are only given the usual X/y names
X_train, y_train = train_data, train_labels
X_test, y_test = test_data, test_labels

## Multi-Layer Perceptron Model

In [5]:
# Prepare data preprocessing pipeline
# Impute NaNs with the column median, as the classifier can't handle missing inputs
# Use standard scaler, as recommended by the MLPClassifier documentation
numeric_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
  ])

# Create a column transformer that applies the previous steps to every
# training column
preprocessor = ColumnTransformer(
    transformers=[
         ('preprocess', numeric_transformer, X_train.columns)
    ])

# Create a pipeline object; the MLPClassifier() given here is only a
# placeholder, the grid's 'estimator' entry swaps in the seeded instance
PIPELINE = Pipeline(steps=[('preprocessor', preprocessor), ('estimator', MLPClassifier())])

# List for hyperparameter search
GRID = [
    {
     'estimator': [MLPClassifier(random_state=1)],
     'estimator__solver': ['adam'],
     'estimator__learning_rate_init': [0.0001],
     'estimator__max_iter': [300],
     'estimator__hidden_layer_sizes': [(5, 3, 2, 1)],
     # Only activation functions belong here. The previous second entry,
     # 'lbfgs', is a solver name; MLPClassifier rejects it as an activation, so
     # every candidate using it failed to fit and was scored as NaN.
     'estimator__activation': ['relu', 'tanh'],
     'estimator__alpha': [0.1, 0.001, 0.0001, 0.00001],
     'estimator__early_stopping': [True]
     }
]

# Run grid search to look for the best parameters:
# 10-fold cross-validation scored by accuracy, all cores, and a final refit of
# the best candidate on the full training set
grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID, 
                            scoring="accuracy", 
                            n_jobs=-1, cv=10, refit=True, verbose=10, 
                            return_train_score=False)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV 5/10; 1/8] START estimator=MLPClassifier(random_state=1), estimator__activation=relu, estimator__alpha=0.1, estimator__early_stopping=True, estimator__hidden_layer_sizes=(5, 3, 2, 1), estimator__learning_rate_init=0.0001, estimator__max_iter=300, estimator__solver=adam
[CV 2/10; 1/8] START estimator=MLPClassifier(random_state=1), estimator__activation=relu, estimator__alpha=0.1, estimator__early_stopping=True, estimator__hidden_layer_sizes=(5, 3, 2, 1), estimator__learning_rate_init=0.0001, estimator__max_iter=300, estimator__solver=adam
[CV 3/10; 1/8] START estimator=MLPClassifier(random_state=1), estimator__activation=relu, estimator__alpha=0.1, estimator__early_stopping=True, estimator__hidden_layer_sizes=(5, 3, 2, 1), estimator__learning_rate_init=0.0001, estimator__max_iter=300, estimator__solver=adam
[CV 7/10; 1/8] START estimator=MLPClassifier(random_state=1), estimator__activation=relu, estimator__alpha=0.1, estim

## Accuracy on Test Set

In [6]:
# Score the refitted best pipeline on the provided test set, then show the
# hyperparameter combination the search selected
test_accuracy = grid_search.score(X_test, y_test)
print(f"The accuracy of the model is {test_accuracy}")
print(grid_search.best_params_)

The accuracy of the model is 0.33660517187007255
{'estimator': MLPClassifier(random_state=1), 'estimator__activation': 'relu', 'estimator__alpha': 0.0001, 'estimator__early_stopping': True, 'estimator__hidden_layer_sizes': (5, 3, 2, 1), 'estimator__learning_rate_init': 0.0001, 'estimator__max_iter': 300, 'estimator__solver': 'adam'}


## Submission

In [7]:
# Make predictions, ordered by row.
# The model is trained on 3-column indicator rows, so `predict` thresholds each
# output independently and can return rows with no outcome or several outcomes
# set. A valid answer has exactly one of HOME_WINS / DRAW / AWAY_WINS, so take
# the highest-scoring outcome per match and one-hot encode it.
# NOTE(review): assumes `predict_proba` returns one column per outcome in the
# label order [HOME_WINS, DRAW, AWAY_WINS] - confirm against Y_train.csv.
outcome_columns = ['HOME_WINS', 'DRAW', 'AWAY_WINS']
outcome_scores = np.asarray(grid_search.predict_proba(X_test))
best_outcome = outcome_scores.argmax(axis=1)

one_hot_predictions = np.zeros(outcome_scores.shape, dtype=int)
one_hot_predictions[np.arange(len(best_outcome)), best_outcome] = 1

# Label the columns and reuse the test row identifiers as the index
submission = pd.DataFrame(one_hot_predictions, columns=outcome_columns, index=X_test.index)

# Save the submission file
submission.to_csv('./submission.csv', index=True)