In [None]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

### Load data

In [None]:
# Read the teams CSV (path is relative to the notebook's working directory)
# into the raw frame that the preprocessing cell derives `teams_df` from.
df = pd.read_csv(filepath_or_buffer="../data/teams.csv")

In [None]:
# Build the modelling frame in one chain: discard the columns that are not
# used as features, then encode the "playoff" flag as 1/0 (Y -> 1, N -> 0).
# Values other than "Y"/"N" become NaN, as `Series.map` leaves them unmapped.
teams_df = df.drop(
    columns=[
        "rank", "seeded", "lgID", "tmID", "franchID", "confID",
        "divID", "name", "arena", "firstRound", "semis", "finals",
    ]
).assign(playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0}))

### Training and evaluation function

In [None]:
def _positive_class_scores(model, X_test, y_pred):
    """Return continuous positive-class scores for ROC AUC when the model has them.

    Preference order: the second column of ``predict_proba`` (binary case),
    then a 1-D ``decision_function`` output, and finally the hard predictions
    ``y_pred`` when the model exposes neither.
    """
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test)
        # Only the two-column (binary) layout has an unambiguous positive column.
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
        if scores.ndim == 1:
            return scores
    return y_pred


def classification(model, teams_df, min_year, max_year):
    """Fit and evaluate ``model`` with an expanding window, one test year at a time.

    For every year ``i`` in ``[min_year, max_year]`` the model is fitted on the
    rows with ``year < i`` and evaluated on the rows with ``year == i``. The
    split sizes, accuracy, classification report, confusion matrix and ROC AUC
    are printed for each year; nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Re-fitted on every iteration (the same object is reused).
    teams_df : pandas.DataFrame
        Must contain a ``year`` column and a ``playoff`` target column; every
        other column is passed to the model as a feature.
    min_year, max_year : int
        Inclusive bounds of the test years.

    Notes
    -----
    * ROC AUC is computed from probability / decision scores when the model
      provides them, because hard 0/1 predictions collapse the ROC curve to a
      single operating point.
    * A year whose train or test split is empty is skipped with a message
      instead of failing inside scikit-learn.
    * NOTE(review): ``year`` itself stays in the feature matrix (only
      ``playoff`` is dropped) -- confirm this is intended.
    """
    for i in range(min_year, max_year + 1):
        teams_df_train = teams_df[teams_df['year'] < i]
        teams_df_test = teams_df[teams_df['year'] == i]

        X_train = teams_df_train.drop("playoff", axis=1)  # Features
        y_train = teams_df_train["playoff"]  # Target variable

        X_test = teams_df_test.drop("playoff", axis=1)  # Features
        y_test = teams_df_test["playoff"]  # Target variable

        print(f"\nTrain/Test size for year={i}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

        # Nothing to fit on (or nothing to score): skip this year explicitly.
        if teams_df_train.empty or teams_df_test.empty:
            print(f"\nyear = {i}: skipped (empty train or test split)")
            continue

        model.fit(X_train, y_train)

        # Predict the response for the test dataset
        y_pred = model.predict(X_test)

        print(f"\nyear = {i}, Accuracy: ", accuracy_score(y_test, y_pred))
        print(f"\nyear = {i}, Classification Report:\n", classification_report(y_test, y_pred))
        print(f"\nyear = {i}, Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

        # ROC AUC needs both classes in the ground truth to be defined.
        if y_test.nunique() < 2:
            print(f"\nyear = {i}, AUC: ", "undefined (only one class present in y_test)")
        else:
            y_score = _positive_class_scores(model, X_test, y_pred)
            print(f"\nyear = {i}, AUC: ", roc_auc_score(y_test, y_score))

# Inclusive evaluation window: the first test year is fixed at 2, the last is
# the latest year present in the data. Training only uses rows from earlier
# years, so the first test year must have at least one earlier year available
# (assumes the data starts at year 1 -- TODO confirm).
min_year, max_year = 2, teams_df["year"].max()

### Decision Tree

In [None]:
# Decision tree with a fixed seed so repeated runs give the same splits.
model = DecisionTreeClassifier(random_state=42)
classification(
    model=model,
    teams_df=teams_df,
    min_year=min_year,
    max_year=max_year,
)

### SVC

In [None]:
# Support vector classifier with default settings and a fixed seed.
# NOTE(review): features are passed unscaled; SVC is scale-sensitive, so
# consider standardising the inputs -- confirm against the data.
model = SVC(random_state=42)
classification(
    model=model,
    teams_df=teams_df,
    min_year=min_year,
    max_year=max_year,
)

### Logistic Regression

In [None]:
# Logistic regression using the SAGA solver with a very high iteration cap.
# NOTE(review): scikit-learn documents that SAGA converges quickly only when
# features share a similar scale; the inputs here are unscaled, which may be
# why max_iter is so large -- consider standardising the features.
model = LogisticRegression(
    random_state=42,
    max_iter=100000,
    solver='saga',
)
classification(
    model=model,
    teams_df=teams_df,
    min_year=min_year,
    max_year=max_year,
)

### Random Forest

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=42)
classification(
    model=model,
    teams_df=teams_df,
    min_year=min_year,
    max_year=max_year,
)

### K Neighbors

In [None]:
# k-nearest neighbours with scikit-learn's default settings.
# NOTE(review): distances are computed on unscaled features -- consider
# standardising the inputs; confirm against the data.
model = KNeighborsClassifier()
classification(
    model=model,
    teams_df=teams_df,
    min_year=min_year,
    max_year=max_year,
)