In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [2]:
players = pd.read_csv('ao_players.csv')
data = pd.read_csv('data_prepared.csv')

In [5]:
data.columns

Index(['surface', 'tourney_level', 'player_hand', 'player_ht', 'player_age',
       'opponent_hand', 'opponent_ht', 'opponent_age', 'best_of', 'player_ace',
       'player_df', 'player_svpt', 'player_1stIn', 'player_1stWon',
       'player_2ndWon', 'player_SvGms', 'player_bpSaved', 'player_bpFaced',
       'opponent_ace', 'opponent_df', 'opponent_svpt', 'opponent_1stIn',
       'opponent_1stWon', 'opponent_2ndWon', 'opponent_SvGms',
       'opponent_bpSaved', 'opponent_bpFaced', 'player_rank', 'opponent_rank',
       'result'],
      dtype='object')

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
y = data.result
x = data.drop(columns=['result'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, shuffle=True, random_state=42)

In [6]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((15310, 29), (2702, 29), (15310,), (2702,))

models

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

In [9]:
cat_cols = ['surface', 'tourney_level', 'player_hand', 'opponent_hand']
num_cols = [col for col in data.columns if col not in cat_cols and col != 'result']

In [10]:
clf_pipe = Pipeline(
    [
        (
            "col_trf",
            ColumnTransformer(
                transformers=[
                    ("num", StandardScaler(), num_cols),
                    ("cat", OneHotEncoder(sparse_output=False), cat_cols),
                ]
            ),
        ),
        ("clf", DummyClassifier()),
    ]
)

In [11]:
clf_param_grid = [
    {
        "clf": [LogisticRegression(max_iter=300)],
        "clf__tol": [0.0001, 0.001, 0.01],
        "clf__C": [0.1, 0.5, 0.75, 1, 1.25],
    },
    {
        "clf": [SGDClassifier(max_iter=300)],
        "clf__penalty": ["l2", "l1", "elasticnet"],
        "clf__tol": [0.0001, 0.001, 0.01],
        "clf__l1_ratio": [0.1, 0.5, 1],
        "clf__alpha": [0.1, 0.5, 1],
    },
    {
        "clf": [SVC(probability=True)],
        "clf__C": [0.5, 1, 2],
        "clf__tol": [0.0001, 0.001, 0.01],
        "clf__kernel": ["linear", "poly", "rbf", "sigmoid"],
    },
    {
        "clf": [DecisionTreeClassifier()],
        'clf__max_depth': [5, 20, 100, 200, 500],
        'clf__min_samples_split': [2, 5, 20] ,
        'clf__min_samples_leaf': [1, 5, 20] ,
        'clf__max_features': ['auto', 'sqrt', 'log2'],
    },
    {"clf": [KNeighborsClassifier()]},
    {
        "clf": [RandomForestClassifier()],
        'clf__n_estimators': [100, 200, 500, 1_000, 2_000],
        'clf__max_depth': [5, 20, 100, 200, 500],
        'clf__min_samples_split': [2, 5, 20] ,
        'clf__min_samples_leaf': [1, 5, 20],
        'clf__max_features': ['auto', 'sqrt', 'log2'],
    },
]

In [12]:
model = GridSearchCV(
    estimator=clf_pipe,
    param_grid=clf_param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    refit=True,
    error_score='raise',
    verbose=1
)

In [13]:
model.fit(x_train, y_train)

Fitting 3 folds for each of 943 candidates, totalling 2829 fits


In [None]:
model.best_estimator_

In [None]:
model.best_score_, model.best_params_

(0.9516005296124496, {'clf': LogisticRegression(max_iter=300)})

In [None]:
for mean_score, params in zip(model.cv_results_['mean_test_score'], model.cv_results_['params']):
    print(params, mean_score)

{'clf': LogisticRegression(max_iter=300)} 0.9516005296124496
{'clf': SGDClassifier(max_iter=300)} 0.9461788310234458
{'clf': SVC(probability=True)} 0.9356632933154857
{'clf': DecisionTreeClassifier()} 0.8232526464752944
{'clf': KNeighborsClassifier()} 0.7840627278686028
{'clf': RandomForestClassifier()} 0.8820381258918935
