In [101]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

### Load data

In [102]:
df = pd.read_csv("../data/teams.csv")

In [103]:
# Drop unnecessary/unfit columns
teams_df = df.drop(["rank", "seeded", "lgID", "tmID", "franchID", "confID", "divID", "name", "arena", "firstRound", "semis", "finals"], axis=1)

# Convert "playoff" column to binary (Y: 1, N: 0)
teams_df["playoff"] = teams_df["playoff"].map({"Y": 1, "N": 0})

### Training and evaluation function

In [104]:
from dataclasses import dataclass

@dataclass
class Result:
    year: int
    accuracy: float
    auc: float

results = {}

def classification(model, teams_df, min_year, max_year):
    for i in range(min_year, max_year + 1):
        teams_df_train = teams_df[teams_df['year'] < i]
        teams_df_test = teams_df[teams_df['year'] == i]

        X_train = teams_df_train.drop("playoff", axis=1)  # Features
        y_train = teams_df_train["playoff"]  # Target variable

        X_test = teams_df_test.drop("playoff", axis=1)  # Features
        y_test = teams_df_test["playoff"]  # Target variable

        print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        model.fit(X_train, y_train)

        # Predict the response for the test dataset
        y_pred = model.predict(X_test)

        print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
        print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print(f"\nyear = {i}, AUC: ", roc_auc_score(y_test, y_pred))
        print(f"\nyear = {i}, Accuracy: ", accuracy_score(y_test, y_pred))

        result = Result(
            year=i,
            accuracy=accuracy_score(y_test, y_pred),
            auc=roc_auc_score(y_test, y_pred),
        )

        if (str(model) not in results):
            results[str(model)] = []
        results[str(model)].append(result)

min_year = 2
max_year = teams_df['year'].max()

### Decision Tree

In [105]:
model = DecisionTreeClassifier(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.25      0.33         8
           1       0.50      0.75      0.60         8

    accuracy                           0.50        16
   macro avg       0.50      0.50      0.47        16
weighted avg       0.50      0.50      0.47        16


year = 2, Confusion Matrix:
 [[2 6]
 [2 6]]

year = 2, AUC:  0.5

year = 2, Accuracy:  0.5

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.78      0.88      0.82         8

    accuracy                           0.81        16
   macro avg       0.82      0.81      0.81        16
weighted avg       0.82      0.81      0.81        16


year = 3, Confusion Matrix:
 [[6 2]
 [1 7]]

year = 3, AUC:  0.81

### SVC

In [106]:
model = SVC(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.88      0.67         8
           1       0.67      0.25      0.36         8

    accuracy                           0.56        16
   macro avg       0.60      0.56      0.52        16
weighted avg       0.60      0.56      0.52        16


year = 2, Confusion Matrix:
 [[7 1]
 [6 2]]

year = 2, AUC:  0.5625

year = 2, Accuracy:  0.5625

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         8
           1       1.00      0.50      0.67         8

    accuracy                           0.75        16
   macro avg       0.83      0.75      0.73        16
weighted avg       0.83      0.75      0.73        16


year = 3, Confusion Matrix:
 [[8 0]
 [4 4]]

year = 3, AUC:

### Logistic Regression

In [107]:
model = LogisticRegression(random_state=42, max_iter=100000, solver='saga')
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.88      0.78         8
           1       0.83      0.62      0.71         8

    accuracy                           0.75        16
   macro avg       0.77      0.75      0.75        16
weighted avg       0.77      0.75      0.75        16


year = 2, Confusion Matrix:
 [[7 1]
 [3 5]]

year = 2, AUC:  0.75

year = 2, Accuracy:  0.75

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.88      0.74         8
           1       0.80      0.50      0.62         8

    accuracy                           0.69        16
   macro avg       0.72      0.69      0.68        16
weighted avg       0.72      0.69      0.68        16


year = 3, Confusion Matrix:
 [[7 1]
 [4 4]]

year = 3, AUC:  0.


year = 4, Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.50      0.43         6
           1       0.50      0.38      0.43         8

    accuracy                           0.43        14
   macro avg       0.44      0.44      0.43        14
weighted avg       0.45      0.43      0.43        14


year = 4, Confusion Matrix:
 [[3 3]
 [5 3]]

year = 4, AUC:  0.4375

year = 4, Accuracy:  0.42857142857142855

Train/Test size for year=5: (62, 48) (13, 48) (62,) (13,)

year = 5, Classification Report:
               precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       1.00      0.75      0.86         8

    accuracy                           0.85        13
   macro avg       0.86      0.88      0.85        13
weighted avg       0.89      0.85      0.85        13


year = 5, Confusion Matrix:
 [[5 0]
 [2 6]]

year = 5, AUC:  0.875

year = 5, Accuracy:  0.84615384615384

### Random Forest

In [108]:
model = RandomForestClassifier(random_state=42)
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88         8
           1       0.88      0.88      0.88         8

    accuracy                           0.88        16
   macro avg       0.88      0.88      0.88        16
weighted avg       0.88      0.88      0.88        16


year = 2, Confusion Matrix:
 [[7 1]
 [1 7]]

year = 2, AUC:  0.875

year = 2, Accuracy:  0.875

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.89      1.00      0.94         8

    accuracy                           0.94        16
   macro avg       0.94      0.94      0.94        16
weighted avg       0.94      0.94      0.94        16


year = 3, Confusion Matrix:
 [[7 1]
 [0 8]]

year = 3, AUC:  

### K Neighbors

In [109]:
model = KNeighborsClassifier()
classification(model, teams_df, min_year, max_year)


Train/Test size for year=2: (16, 48) (16, 48) (16,) (16,)

year = 2, Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.88      0.67         8
           1       0.67      0.25      0.36         8

    accuracy                           0.56        16
   macro avg       0.60      0.56      0.52        16
weighted avg       0.60      0.56      0.52        16


year = 2, Confusion Matrix:
 [[7 1]
 [6 2]]

year = 2, AUC:  0.5625

year = 2, Accuracy:  0.5625

Train/Test size for year=3: (32, 48) (16, 48) (32,) (16,)

year = 3, Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.75      0.67         8
           1       0.67      0.50      0.57         8

    accuracy                           0.62        16
   macro avg       0.63      0.62      0.62        16
weighted avg       0.63      0.62      0.62        16


year = 3, Confusion Matrix:
 [[6 2]
 [4 4]]

year = 3, AUC:

# Results

In [110]:
from pprint import pprint

pprint(results)

{'DecisionTreeClassifier(random_state=42)': [Result(year=2,
                                                    accuracy=0.5,
                                                    auc=0.5),
                                             Result(year=3,
                                                    accuracy=0.8125,
                                                    auc=0.8125),
                                             Result(year=4,
                                                    accuracy=0.7857142857142857,
                                                    auc=0.8125),
                                             Result(year=5,
                                                    accuracy=0.9230769230769231,
                                                    auc=0.9375),
                                             Result(year=6,
                                                    accuracy=1.0,
                                                    auc=1.0),
                  