In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

### Load data

In [2]:
df = pd.read_csv("../data/clean/main_df.csv")

In [3]:
# Drop unnecessary/unfit columns

# Convert "playoff" column to binary (Y: 1, N: 0)
df["playoff"] = df["playoff"].map({"Y": 1, "N": 0})

def map_strings_to_int(df, col):
    """
    Maps string values to integers
    """
    values = df[col].unique()
    mapping = {value: i for i, value in enumerate(values)}
    df[col] = df[col].map(mapping)

map_strings_to_int(df, "tmID")
map_strings_to_int(df, "playerID")
map_strings_to_int(df, "coachID")
map_strings_to_int(df, "pos")

df.to_csv("../data/clean/pre.csv", index=False)

print(df['tmID'].to_frame())


      tmID
0        0
1        0
2        0
3        0
4        0
...    ...
2155    19
2156    19
2157    19
2158    19
2159    19

[2160 rows x 1 columns]


### Training and evaluation function

In [4]:
from dataclasses import dataclass

@dataclass
class Result:
    year: int
    accuracy: float
    auc: float

results = {}

def classification(model, df, min_year, max_year):
    for i in range(min_year, max_year + 1):
        df_train = df[df['year'] < i]
        df_test = df[df['year'] == i]

        X_train = df_train.drop("playoff", axis=1)  # Features
        y_train = df_train["playoff"]  # Target variable

        X_test = df_test.drop("playoff", axis=1)  # Features
        y_test = df_test["playoff"]  # Target variable

        print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        model.fit(X_train, y_train)

        # Predict the response for the test dataset
        y_pred = model.predict(X_test)

        print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
        print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print(f"\nyear = {i}, AUC: ", roc_auc_score(y_test, y_pred))
        print(f"\nyear = {i}, Accuracy: ", accuracy_score(y_test, y_pred))

        result = Result(
            year=i,
            accuracy=accuracy_score(y_test, y_pred),
            auc=roc_auc_score(y_test, y_pred),
        )

        if (str(model) not in results):
            results[str(model)] = []
        results[str(model)].append(result)

min_year = 2
max_year = df['year'].max()

### Decision Tree

In [5]:
model = DecisionTreeClassifier(random_state=42)
classification(model, df, min_year, max_year)


Train/Test size for year=2: (222, 92) (235, 92) (222,) (235,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.23      0.29       107
           1       0.52      0.70      0.60       128

    accuracy                           0.49       235
   macro avg       0.46      0.46      0.44       235
weighted avg       0.46      0.49      0.46       235


year = 2, Confusion Matrix:
 [[25 82]
 [39 89]]

year = 2, AUC:  0.464478679906542

year = 2, Accuracy:  0.4851063829787234

Train/Test size for year=3: (457, 92) (261, 92) (457,) (261,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.81      0.81       153
           1       0.73      0.74      0.74       108

    accuracy                           0.78       261
   macro avg       0.77      0.78      0.78       261
weighted avg       0.78      0.78      0.78       261


year = 3, Confusion Matr

### SVC

In [6]:
model = SVC(random_state=42)
classification(model, df, min_year, max_year)


Train/Test size for year=2: (222, 92) (235, 92) (222,) (235,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       107
           1       0.54      1.00      0.71       128

    accuracy                           0.54       235
   macro avg       0.27      0.50      0.35       235
weighted avg       0.30      0.54      0.38       235


year = 2, Confusion Matrix:
 [[  0 107]
 [  0 128]]

year = 2, AUC:  0.5

year = 2, Accuracy:  0.5446808510638298

Train/Test size for year=3: (457, 92) (261, 92) (457,) (261,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       153
           1       0.41      1.00      0.59       108

    accuracy                           0.41       261
   macro avg       0.21      0.50      0.29       261
weighted avg       0.17      0.41      0.24       261


year = 3, Confusion Matrix:
 [[  0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



year = 5, Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.95      0.65        83
           1       0.94      0.42      0.58       140

    accuracy                           0.62       223
   macro avg       0.72      0.69      0.62       223
weighted avg       0.77      0.62      0.61       223


year = 5, Confusion Matrix:
 [[79  4]
 [81 59]]

year = 5, AUC:  0.6866179001721171

year = 5, Accuracy:  0.6188340807174888

Train/Test size for year=6: (1143, 92) (209, 92) (1143,) (209,)

year = 6, Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.62      0.73        87
           1       0.78      0.94      0.85       122

    accuracy                           0.81       209
   macro avg       0.83      0.78      0.79       209
weighted avg       0.82      0.81      0.80       209


year = 6, Confusion Matrix:
 [[ 54  33]
 [  7 115]]

year = 6, AUC:  0.7816563029960

### Logistic Regression

In [7]:
model = LogisticRegression(random_state=42, max_iter=100000, solver='newton-cg')
classification(model, df, min_year, max_year)


Train/Test size for year=2: (222, 92) (235, 92) (222,) (235,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.70      0.82       107
           1       0.80      1.00      0.89       128

    accuracy                           0.86       235
   macro avg       0.90      0.85      0.86       235
weighted avg       0.89      0.86      0.86       235


year = 2, Confusion Matrix:
 [[ 75  32]
 [  0 128]]

year = 2, AUC:  0.8504672897196262

year = 2, Accuracy:  0.8638297872340426

Train/Test size for year=3: (457, 92) (261, 92) (457,) (261,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87       153
           1       0.82      0.80      0.81       108

    accuracy                           0.84       261
   macro avg       0.84      0.84      0.84       261
weighted avg       0.84      0.84      0.84       261


year = 3, Confusion




year = 10, Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.70      0.80        87
           1       0.82      0.97      0.89       121

    accuracy                           0.86       208
   macro avg       0.88      0.83      0.84       208
weighted avg       0.87      0.86      0.85       208


year = 10, Confusion Matrix:
 [[ 61  26]
 [  4 117]]

year = 10, AUC:  0.8340457870238434

year = 10, Accuracy:  0.8557692307692307


### Random Forest

In [8]:
model = RandomForestClassifier(random_state=42)
classification(model, df, min_year, max_year)


Train/Test size for year=2: (222, 92) (235, 92) (222,) (235,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       107
           1       1.00      1.00      1.00       128

    accuracy                           1.00       235
   macro avg       1.00      1.00      1.00       235
weighted avg       1.00      1.00      1.00       235


year = 2, Confusion Matrix:
 [[107   0]
 [  0 128]]

year = 2, AUC:  1.0

year = 2, Accuracy:  1.0

Train/Test size for year=3: (457, 92) (261, 92) (457,) (261,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95       153
           1       0.89      1.00      0.94       108

    accuracy                           0.95       261
   macro avg       0.94      0.95      0.95       261
weighted avg       0.95      0.95      0.95       261


year = 3, Confusion Matrix:
 [[139  14]
 [  0 108

### K Neighbors

In [9]:
model = KNeighborsClassifier()
classification(model, df, min_year, max_year)


Train/Test size for year=2: (222, 92) (235, 92) (222,) (235,)



year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.53      0.64       107
           1       0.70      0.90      0.78       128

    accuracy                           0.73       235
   macro avg       0.76      0.72      0.71       235
weighted avg       0.75      0.73      0.72       235


year = 2, Confusion Matrix:
 [[ 57  50]
 [ 13 115]]

year = 2, AUC:  0.7155738901869159

year = 2, Accuracy:  0.7319148936170212

Train/Test size for year=3: (457, 92) (261, 92) (457,) (261,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77       153
           1       0.68      0.62      0.65       108

    accuracy                           0.72       261
   macro avg       0.71      0.71      0.71       261
weighted avg       0.72      0.72      0.72       261


year = 3, Confusion Matrix:
 [[121  32]
 [ 41  67]]

year = 3, AUC:  0.70561002178

# Results

In [13]:
from pprint import pprint

display(results)

# This is ugly
print("train/test results per model for the last year of data:")
pprint(dict(sorted(dict(map(lambda i: (i[0], i[1].accuracy), dict(map(lambda i: (i[0], i[1][-1]), results.items())).items())).items(), key=lambda i: i[1], reverse=True)))

for model in results:
    for result in results[model]:
        if result.accuracy == 1:
            print(f"\n{model} has 100% accuracy for year {result.year}, possible data leakage")

{'DecisionTreeClassifier(random_state=42)': [Result(year=2, accuracy=0.4851063829787234, auc=0.464478679906542),
  Result(year=3, accuracy=0.7816091954022989, auc=0.775599128540305),
  Result(year=4, accuracy=0.7821782178217822, auc=0.7924528301886793),
  Result(year=5, accuracy=0.9910313901345291, auc=0.9928571428571429),
  Result(year=6, accuracy=0.8660287081339713, auc=0.8852459016393442),
  Result(year=7, accuracy=0.654639175257732, auc=0.6235955056179776),
  Result(year=8, accuracy=1.0, auc=1.0),
  Result(year=9, accuracy=1.0, auc=1.0),
  Result(year=10, accuracy=1.0, auc=1.0)],
 'SVC(random_state=42)': [Result(year=2, accuracy=0.5446808510638298, auc=0.5),
  Result(year=3, accuracy=0.41379310344827586, auc=0.5),
  Result(year=4, accuracy=0.599009900990099, auc=0.6164504716981132),
  Result(year=5, accuracy=0.6188340807174888, auc=0.6866179001721171),
  Result(year=6, accuracy=0.8086124401913876, auc=0.7816563029960429),
  Result(year=7, accuracy=0.6907216494845361, auc=0.66292134

train/test results per model for the last year of data:
{'DecisionTreeClassifier(random_state=42)': 1.0,
 'KNeighborsClassifier()': 0.5432692307692307,
 "LogisticRegression(max_iter=100000, random_state=42, solver='newton-cg')": 0.8557692307692307,
 'RandomForestClassifier(random_state=42)': 0.9711538461538461,
 'SVC(random_state=42)': 0.6826923076923077}

DecisionTreeClassifier(random_state=42) has 100% accuracy for year 8, possible data leakage

DecisionTreeClassifier(random_state=42) has 100% accuracy for year 9, possible data leakage

DecisionTreeClassifier(random_state=42) has 100% accuracy for year 10, possible data leakage

LogisticRegression(max_iter=100000, random_state=42, solver='newton-cg') has 100% accuracy for year 9, possible data leakage

RandomForestClassifier(random_state=42) has 100% accuracy for year 2, possible data leakage

RandomForestClassifier(random_state=42) has 100% accuracy for year 4, possible data leakage

RandomForestClassifier(random_state=42) has 100% 