In [44]:
import pandas as pd
import seaborn as sns
import sklearn
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import torch
import pytorch_lightning as pl

In [46]:
features = ["Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs", "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS"]

matches_gathered_df = pd.read_csv('../../../data/ncaa/processed/2019/accumulated/matches_gathered.csv')
matches_gathered_X, matches_gathered_y = matches_gathered_df[features], matches_gathered_df["Result"]

sma_df = pd.read_csv('../../../data/ncaa/processed/2019/accumulated/10_sma.csv')
sma_X, sma_y = sma_df[features], sma_df["Result"]

cma_df = pd.read_csv( '../../../data/ncaa/processed/2019/accumulated/cma.csv')
cma_X, cma_y = cma_df[features], cma_df["Result"]

ewm_df = pd.read_csv('../../../data/ncaa/processed/2019/accumulated/0.2_ewm.csv')
ewm_X, ewm_y = ewm_df[features], ewm_df["Result"]

# Evaluating models with default parameters and all featuress

In [47]:
data_dict = {
    "matches_gathered": (matches_gathered_X, matches_gathered_y),
    "sma": (sma_X, sma_y),
    "cma": (cma_X, cma_y),
    "ewm": (ewm_X, ewm_y),
}

for name, (X, y) in data_dict.items():
    print(f"\n{name} -")
    X_scaled = StandardScaler().fit_transform(X)
    print(f"X.shape: {X.shape}, y.shape: {y.shape}")

    models = [
        XGBClassifier(),
        MLPClassifier(),
        RandomForestClassifier(),
        LogisticRegression(),
        SVC(),
        DecisionTreeClassifier(),
    ]
    result_dict = {"Model": [], "Accuracy Mean": [], "Accuracy Dev": []}
    for m in models:
        scores = cross_val_score(m, X_scaled, y, scoring="accuracy", cv=StratifiedKFold(), n_jobs=-1)
        result_dict["Model"].append(m.__class__.__name__)
        result_dict["Accuracy Mean"].append(scores.mean())
        result_dict["Accuracy Dev"].append(scores.std())

    results_df = pd.DataFrame(result_dict)
    print(f"Result on {name} -")
    print(results_df)
    print("-" * 50)



matches_gathered -
X.shape: (9536, 26), y.shape: (9536,)
Result on matches_gathered -
                    Model  Accuracy Mean  Accuracy Dev
0           XGBClassifier       0.916946      0.006083
1           MLPClassifier       0.921035      0.011356
2  RandomForestClassifier       0.913800      0.008220
3      LogisticRegression       0.933934      0.005869
4                     SVC       0.925440      0.007496
5  DecisionTreeClassifier       0.874265      0.010522
--------------------------------------------------

sma -
X.shape: (8891, 26), y.shape: (8891,)
Result on sma -
                    Model  Accuracy Mean  Accuracy Dev
0           XGBClassifier       0.739060      0.011357
1           MLPClassifier       0.736474      0.007186
2  RandomForestClassifier       0.749296      0.011533
3      LogisticRegression       0.761669      0.009273
4                     SVC       0.751433      0.007157
5  DecisionTreeClassifier       0.658756      0.006790
-------------------------------

In [48]:
np.mean(cross_val_score(LogisticRegression() , ewm_X, ewm_y, scoring='accuracy', cv=StratifiedKFold(), n_jobs=-1))

0.8002461538850645

In [52]:
np.mean(cross_val_score(MLPClassifier(hidden_layer_sizes=[128, 32], verbose=True), ewm_X, ewm_y, scoring='accuracy', cv=StratifiedKFold(), n_jobs=-1))

0.7820253286214434