In [81]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

### Load data

In [82]:
df = pd.read_csv("../data/teams.csv")

In [83]:
# Drop unnecessary/unfit columns
teams_df = df.drop(["rank", "seeded", "lgID", "tmID", "franchID", "confID", "divID", "name", "arena", "firstRound", "semis", "finals"], axis=1)

# Convert "playoff" column to binary (Y: 1, N: 0)
teams_df["playoff"] = teams_df["playoff"].map({"Y": 1, "N": 0})

### Training and evaluation function

In [84]:
from dataclasses import dataclass

@dataclass
class Result:
    year: int
    accuracy: float
    auc: float

results = {}

def classification(model, teams_df, min_year, max_year):
    for i in range(min_year, max_year + 1):
        teams_df_train = teams_df[teams_df['year'] < i]
        teams_df_test = teams_df[teams_df['year'] == i]

        X_train = teams_df_train.drop("playoff", axis=1)  # Features
        y_train = teams_df_train["playoff"]  # Target variable

        X_test = teams_df_test.drop("playoff", axis=1)  # Features
        y_test = teams_df_test["playoff"]  # Target variable

        print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        model.fit(X_train, y_train)

        # Predict the response for the test dataset
        y_pred = model.predict(X_test)

        print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
        print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print(f"\nyear = {i}, AUC: ", roc_auc_score(y_test, y_pred))
        print(f"\nyear = {i}, Accuracy: ", accuracy_score(y_test, y_pred))

        result = Result(
            year=i,
            accuracy=accuracy_score(y_test, y_pred),
            auc=roc_auc_score(y_test, y_pred),
        )

        if (str(model) not in results):
            results[str(model)] = []
        results[str(model)].append(result)

min_year = 2
max_year = teams_df['year'].max()

### Decision Tree

In [85]:
model = DecisionTreeClassifier(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33         8
           1       0.50      0.75      0.60         8

    accuracy                           0.50        16
   macro avg       0.50      0.50      0.47        16
weighted avg       0.50      0.50      0.47        16


year = 2, Confusion Matrix:
 [[2 6]
 [2 6]]

year = 2, AUC:  0.5

year = 2, Accuracy:  0.5

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.78      0.88      0.82         8

    accuracy                           0.81        16
   macro avg       0.82      0.81      0.81        16
weighted avg       0.82      0.81      0.81        16


year = 3, Confusion Matrix:
 [[6 2]
 [1 7]]

year = 3, AUC:  0.81

### SVC

In [86]:
model = SVC(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.88      0.67         8
           1       0.67      0.25      0.36         8

    accuracy                           0.56        16
   macro avg       0.60      0.56      0.52        16
weighted avg       0.60      0.56      0.52        16


year = 2, Confusion Matrix:
 [[7 1]
 [6 2]]

year = 2, AUC:  0.5625

year = 2, Accuracy:  0.5625

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         8
           1       1.00      0.50      0.67         8

    accuracy                           0.75        16
   macro avg       0.83      0.75      0.73        16
weighted avg       0.83      0.75      0.73        16


year = 3, Confusion Matrix:
 [[8 0]
 [4 4]]

year = 3, AUC:

### Logistic Regression

In [87]:
model = LogisticRegression(random_state=42, max_iter=100000, solver='newton-cg')
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.80      1.00      0.89         8

    accuracy                           0.88        16
   macro avg       0.90      0.88      0.87        16
weighted avg       0.90      0.88      0.87        16


year = 2, Confusion Matrix:
 [[6 2]
 [0 8]]

year = 2, AUC:  0.875

year = 2, Accuracy:  0.875

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.78      0.88      0.82         8

    accuracy                           0.81        16
   macro avg       0.82      0.81      0.81        16
weighted avg       0.82      0.81      0.81        16


year = 3, Confusion Matrix:
 [[6 2]
 [1 7]]

year = 3, AUC:  




year = 7, Classification Report:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       1.00      0.75      0.86         8

    accuracy                           0.86        14
   macro avg       0.88      0.88      0.86        14
weighted avg       0.89      0.86      0.86        14


year = 7, Confusion Matrix:
 [[6 0]
 [2 6]]

year = 7, AUC:  0.875

year = 7, Accuracy:  0.8571428571428571

Train/Test size for year=8: (102, 48) (13, 48) (102,) (13,)





year = 8, Classification Report:
               precision    recall  f1-score   support

           0       0.45      1.00      0.62         5
           1       1.00      0.25      0.40         8

    accuracy                           0.54        13
   macro avg       0.73      0.62      0.51        13
weighted avg       0.79      0.54      0.49        13


year = 8, Confusion Matrix:
 [[5 0]
 [6 2]]

year = 8, AUC:  0.625

year = 8, Accuracy:  0.5384615384615384

Train/Test size for year=9: (115, 48) (14, 48) (115,) (14,)





year = 9, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.83      0.91         6
           1       0.89      1.00      0.94         8

    accuracy                           0.93        14
   macro avg       0.94      0.92      0.93        14
weighted avg       0.94      0.93      0.93        14


year = 9, Confusion Matrix:
 [[5 1]
 [0 8]]

year = 9, AUC:  0.9166666666666667

year = 9, Accuracy:  0.9285714285714286

Train/Test size for year=10: (129, 48) (13, 48) (129,) (13,)





year = 10, Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       1.00      0.75      0.86         8

    accuracy                           0.85        13
   macro avg       0.86      0.88      0.85        13
weighted avg       0.89      0.85      0.85        13


year = 10, Confusion Matrix:
 [[5 0]
 [2 6]]

year = 10, AUC:  0.875

year = 10, Accuracy:  0.8461538461538461


### Random Forest

In [88]:
model = RandomForestClassifier(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88         8
           1       0.88      0.88      0.88         8

    accuracy                           0.88        16
   macro avg       0.88      0.88      0.88        16
weighted avg       0.88      0.88      0.88        16


year = 2, Confusion Matrix:
 [[7 1]
 [1 7]]

year = 2, AUC:  0.875

year = 2, Accuracy:  0.875

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.89      1.00      0.94         8

    accuracy                           0.94        16
   macro avg       0.94      0.94      0.94        16
weighted avg       0.94      0.94      0.94        16


year = 3, Confusion Matrix:
 [[7 1]
 [0 8]]

year = 3, AUC:  

### K Neighbors

In [89]:
model = KNeighborsClassifier()
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.88      0.67         8
           1       0.67      0.25      0.36         8

    accuracy                           0.56        16
   macro avg       0.60      0.56      0.52        16
weighted avg       0.60      0.56      0.52        16


year = 2, Confusion Matrix:
 [[7 1]
 [6 2]]

year = 2, AUC:  0.5625

year = 2, Accuracy:  0.5625

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.75      0.67         8
           1       0.67      0.50      0.57         8

    accuracy                           0.62        16
   macro avg       0.63      0.62      0.62        16
weighted avg       0.63      0.62      0.62        16


year = 3, Confusion Matrix:
 [[6 2]
 [4 4]]

year = 3, AUC:

# Results

In [108]:
from pprint import pprint

# This is ugly
pprint(dict(sorted(dict(map(lambda i: (i[0], i[1].accuracy), dict(map(lambda i: (i[0], i[1][-1]), results.items())).items())).items(), key=lambda i: i[1], reverse=True)))

{'DecisionTreeClassifier(random_state=42)': 0.6923076923076923,
 'KNeighborsClassifier()': 0.6923076923076923,
 "LogisticRegression(max_iter=100000, random_state=42, solver='newton-cg')": 0.8461538461538461,
 'RandomForestClassifier(random_state=42)': 0.7692307692307693,
 'SVC(random_state=42)': 0.6923076923076923}
