In [1]:
import pandas as pd
import seaborn as sns
import sklearn
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Evaluating models with default parameters and all features

In [2]:
# Relative directory that holds every accumulated 2019 NCAA CSV used here.
_ACCUMULATED_DIR = "../../../data/ncaa/processed/2019/accumulated"

# Dataset label -> CSV path. Insertion order is the order in which the
# datasets are evaluated and reported.
df_paths = {
    label: f"{_ACCUMULATED_DIR}/{filename}"
    for label, filename in (
        ("matches_gathered", "matches_gathered.csv"),
        ("sma", "10_sma.csv"),
        ("cma", "cma.csv"),
        ("ewm", "0.2_ewm.csv"),
    )
}

# Statistic names that appear once per team in the input CSVs.
_TEAM_STATS = (
    "Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr",
    "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS",
)

# Model input columns: every "Team A ..." column first, then every
# "Team B ..." column, each group in the stat order listed above.
features = [f"Team {side} {stat}" for side in ("A", "B") for stat in _TEAM_STATS]

# Fixed seed shared by every estimator so the reported scores are identical on
# each "Restart & Run All". Without it the stochastic models (MLP, random
# forest, decision tree, ...) give slightly different numbers on every run.
SEED = 42

# For each dataset: load the CSV, select the feature columns and the "Result"
# target, cross-validate every classifier, and print a summary table with the
# mean and standard deviation of the per-fold accuracy.
for name, path in df_paths.items():
    print(f"\n{name} -")
    df = pd.read_csv(path)
    X = df[features]
    y = df["Result"]
    print(f"X.shape: {X.shape}, y.shape: {y.shape}")

    # Fresh, unfitted estimators per dataset. All hyperparameters stay at the
    # library defaults; only random_state is pinned for reproducibility.
    models = [
        XGBClassifier(random_state=SEED),
        MLPClassifier(random_state=SEED),
        RandomForestClassifier(random_state=SEED),
        LogisticRegression(random_state=SEED),
        SVC(random_state=SEED),
        DecisionTreeClassifier(random_state=SEED),
    ]
    result_dict = {"Model": [], "Accuracy Mean": [], "Accuracy Dev": []}
    for m in models:
        # No `cv` argument is passed, so scikit-learn's default
        # cross-validation splitter is used; `scores` holds one accuracy per fold.
        scores = cross_val_score(m, X, y, scoring="accuracy")
        result_dict["Model"].append(m.__class__.__name__)
        result_dict["Accuracy Mean"].append(scores.mean())
        result_dict["Accuracy Dev"].append(scores.std())

    results_df = pd.DataFrame(result_dict)
    print(f"Result on {name} -")
    print(results_df)
    print("-" * 30)
    # Drop the reference to the full frame before the next dataset is loaded.
    del df



matches_gathered -
X.shape: (9536, 26), y.shape: (9536,)
Result on matches_gathered -
                    Model  Accuracy Mean  Accuracy Dev
0           XGBClassifier       0.917051      0.005935
1           MLPClassifier       0.929844      0.007670
2  RandomForestClassifier       0.911912      0.008000
3      LogisticRegression       0.932990      0.007158
4                     SVC       0.924077      0.007954
5  DecisionTreeClassifier       0.872168      0.010175
------------------------------

sma -
X.shape: (9536, 26), y.shape: (9536,)
Result on sma -
                    Model  Accuracy Mean  Accuracy Dev
0           XGBClassifier       0.778524      0.006745
1           MLPClassifier       0.781248      0.026446
2  RandomForestClassifier       0.784605      0.007696
3      LogisticRegression       0.802537      0.008363
4                     SVC       0.787121      0.013023
5  DecisionTreeClassifier       0.694944      0.007232
------------------------------

cma -
X.shape: (953