# Training the competition model

Firstly load all data.

In [1]:
original_json_file_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
year_11_json_file_path = "./datasets/competition_final/generated/final_year_11_data.json"

In [2]:
import pandas as pd

original_json_file_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
year_11_json_file_path = "./datasets/competition_final/generated/final_year_11_data.json"

df_original = pd.read_json(original_json_file_path)
df_year_11 = pd.read_json(year_11_json_file_path)

df_year_11.head(20)

Unnamed: 0,tmID,year,playoff,averageWinRate,averagePoints,averageRebounds,averageAssists,averageSteals,averageBlocks,averageTurnovers,averageFGRatio,averageFTRatio,averageThreeRatio,coachWinRate,numberOfAwardedPlayers
0,WAS,11,,0.470588,244.629371,122.927739,47.384615,29.587413,12.310023,50.428904,0.418658,0.749509,0.280573,0.5,2
1,TUL,11,,0.5,196.2625,87.358333,39.875,19.675,8.566667,38.229167,0.414552,0.780216,0.327587,0.5,10
2,SEA,11,,0.588235,274.577622,106.67972,55.923077,26.044755,11.078322,49.041958,0.412312,0.802031,0.310246,0.588235,8
3,SAS,11,,0.441176,318.134545,111.044848,67.1,34.910909,14.202424,50.506061,0.428546,0.80828,0.340362,0.441176,2
4,PHO,11,,0.676471,259.051748,113.709557,49.538462,21.234965,13.69324,47.2331,0.444417,0.801936,0.335072,0.676471,6
5,NYL,11,,0.382353,239.084416,93.4329,46.714286,19.831169,8.073593,40.969697,0.396254,0.808497,0.311745,0.428571,2
6,MIN,11,,0.411765,285.395041,127.13168,61.545455,30.919008,10.638567,50.823691,0.446331,0.781802,0.320708,0.5,3
7,LAS,11,,0.529412,249.046281,122.257851,66.0,26.052893,14.456198,54.231405,0.4205,0.807764,0.311575,0.5,6
8,IND,11,,0.647059,284.894215,111.692011,51.090909,33.229752,16.182369,50.046832,0.409415,0.817711,0.328738,0.647059,6
9,CON,11,,0.470588,239.584848,93.972727,49.5,22.59697,9.50303,45.674242,0.430382,0.791903,0.28632,0.470588,1


Train the model

## Logistic Regression

In [18]:
# Model training...
from itertools import combinations, product
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Assume df is your DataFrame
# Define features and target

features = ['averageWinRate', 'averagePoints', 'averageRebounds', 'averageAssists',
            'averageSteals', 'averageBlocks', 'averageTurnovers', 'averageFGRatio',
            'averageFTRatio', 'averageThreeRatio', 'coachWinRate', 'numberOfAwardedPlayers']
target = 'playoff'

# Split data into training and test sets
X_train = df_original[df_original["year"] < 7][features + ['year']]
X_test = df_original[df_original["year"] >= 7][features + ['year']]
y_train = df_original[df_original["year"] < 7]["playoff"]
y_test = df_original[df_original["year"] >= 7]["playoff"]

df_year_only_train = X_train["year"]
df_year_only_test = X_test["year"]

best_accuracy_sum = 0
best_feature_combination = None
best_hyperparameters = None

# Define hyperparameters grid
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'tol': [1e-4, 1e-3, 1e-2],
    'max_iter': [10000]
}

# Generate all combinations of hyperparameters
all_params = [dict(zip(param_grid, v)) for v in product(*param_grid.values())]

# Iterate through all possible feature combinations with 2 to 6 features
results = []
for subset in tqdm(combinations(features, 4), desc="Feature Combinations"):
    selected_features = list(subset) + ['year']

    X_train_subset = X_train[selected_features]
    X_test_subset = X_test[selected_features]

    for params in all_params:
        model = LogisticRegression(**params)
        model.fit(X_train_subset, y_train)

        accuracy_test = accuracy_score(y_test, model.predict(X_test_subset))
        accuracy_train = accuracy_score(y_train, model.predict(X_train_subset))

        if accuracy_test > 0.75:
            results.append({
                'acc_train': accuracy_train,
                'acc_test': accuracy_test,
                'best_feature_combination': str(selected_features),
                'best_hyperparameters': str(params)
            })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

df_cd_json = results_df.to_json(orient="records")
new_file = open("./datasets/generated/logreg_hyper_param_tuning_2.json", "w")
new_file.writelines(df_cd_json)
new_file.close()

Feature Combinations: 37it [00:16,  2.22it/s]


KeyboardInterrupt: 

## SVM

In [15]:
# Model training...
from itertools import combinations, product
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn.svm import SVC

# Assume df is your DataFrame
# Define features and target
features = ['averageWinRate', 'averagePoints', 'averageRebounds', 'averageAssists',
            'averageSteals', 'averageBlocks', 'averageTurnovers', 'averageFGRatio',
            'averageFTRatio', 'averageThreeRatio', 'coachWinRate', 'numberOfAwardedPlayers']
target = 'playoff'

# Split data into training and test sets
X_train = df_original[df_original["year"] < 7][features + ['year']]
X_test = df_original[df_original["year"] >= 7][features + ['year']]
y_train = df_original[df_original["year"] < 7]["playoff"]
y_test = df_original[df_original["year"] >= 7]["playoff"]

df_year_only_train = X_train["year"]
df_year_only_test = X_test["year"]

best_accuracy_sum = 0
best_feature_combination = None
best_hyperparameters = None

# Define hyperparameters grid
param_grid = {
    'C': [0.1, 1, 10, 100],        # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto'],   # Kernel coefficient
}

# Generate all combinations of hyperparameters
all_params = [dict(zip(param_grid, v)) for v in product(*param_grid.values())]

# Iterate through all possible feature combinations with 2 to 6 features
results = pd.DataFrame(columns=['acc_train', 'acc_test', 'best_feature_combination', 'best_hyperparameters'])

# Always include 'year

X_train = pd.concat([X_train, df_year_only_train], axis=1)
X_test = pd.concat([X_test, df_year_only_test], axis=1)

svm = SVC(kernel='linear')  # You can change the kernel as needed (e.g., 'rbf' for radial basis function)

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Make predictions on the test set using the best model
y_pred = grid_search.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
    
results.head()
 

## KNN Evaluation

In [28]:
import pandas as pd

file = pd.read_json("./datasets/generated/model_evaluation_by_accuracy_knn.json")

file["diff"] = abs(file["acc_train"] - file["acc_test"])

df2 = file[(file["acc_test"] > 0.75) & (file["diff"] < 0.05)].sort_values("acc_test", axis=0, ascending=False)

df2.head()


Unnamed: 0,acc_train,acc_test,best_feature_combination,best_hyperparameters,diff
5663,0.805556,0.759259,"['averageAssists', 'averageTurnovers', 'number...","{'n_neighbors': 7, 'weights': 'uniform', 'metr...",0.046296
5664,0.805556,0.759259,"['averageAssists', 'averageTurnovers', 'number...","{'n_neighbors': 7, 'weights': 'uniform', 'metr...",0.046296
22419,0.805556,0.759259,"['averageAssists', 'averageTurnovers', 'averag...","{'n_neighbors': 7, 'weights': 'uniform', 'metr...",0.046296
22394,0.805556,0.759259,"['averageAssists', 'averageTurnovers', 'averag...","{'n_neighbors': 7, 'weights': 'uniform', 'metr...",0.046296
22393,0.805556,0.759259,"['averageAssists', 'averageTurnovers', 'averag...","{'n_neighbors': 7, 'weights': 'uniform', 'metr...",0.046296


Average accuracy:


In [27]:
df2["acc_test"].mean()

0.7592592593

## Logreg evaluation

In [22]:
#load libraries
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

data_file_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
df = pd.read_json(data_file_path)
df.head()

features = ['averageWinRate', 'averageRebounds', 'averageBlocks', 'averageThreeRatio', 'coachWinRate', 'numberOfAwardedPlayers', 'year']

results_df = pd.DataFrame(columns=["year_split", "acc_test", "acc_train", "diff"])

for i in range(5,11):
  #Split dataset in such way that we use latest years for training and older years for testing
  X_train=df[df["year"]<i][features]
  X_test=df[df["year"]>=i][features]
  y_train=df[df["year"]<i]["playoff"]
  y_test=df[df["year"]>=i]["playoff"]

  models = {}

  #Logistic Regression
  models['Logistic Regression'] = LogisticRegression(C=1, penalty= 'l2', solver='newton-cg', tol=0.001, max_iter=10000)

  accuracy, precision, recall = {}, {}, {}

  # Fit the classifier
  models['Logistic Regression'].fit(X_train, y_train)

  # Make predictions
  y_pred = models['Logistic Regression'].predict(X_test)

  y_pred_test = models['Logistic Regression'].predict(X_train)

  temp_df = pd.DataFrame([[i, metrics.accuracy_score(y_test, y_pred), metrics.accuracy_score(y_train, y_pred_test), abs(metrics.accuracy_score(y_test, y_pred) - metrics.accuracy_score(y_train, y_pred_test))]], columns=["year_split", "acc_test", "acc_train", "diff"])
  
  results_df = pd.concat([results_df, temp_df])

  
results_df.head()


Unnamed: 0,year_split,acc_test,acc_train,diff
0,5,0.6875,0.673913,0.013587
0,6,0.731343,0.644068,0.087275
0,7,0.759259,0.708333,0.050926
0,8,0.65,0.744186,0.094186
0,9,0.62963,0.727273,0.097643


Average accuracy:

In [23]:
results_df["acc_test"].mean()

0.6660322851554195

## Conclusion

Logreg is better because it is more consistent.

Best is year 7 split.

After year 8 for some reason in all cases the precision and/or difference in betweeen test and train accuracy.

## Final prediction


In [32]:
from sklearn.linear_model import LogisticRegression
import pandas as pd

y1_10_data_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
y11_data_path = "./datasets/competition_final/generated/final_year_11_data.json"

df_y1_10 = pd.read_json(y1_10_data_path)
df_y11 = pd.read_json(y11_data_path)

features = ['averageWinRate', 'averageRebounds', 'averageBlocks', 'averageThreeRatio', 'coachWinRate', 'numberOfAwardedPlayers', 'year']
target = 'playoff'

X_train = df_y1_10[df_y1_10["year"] < 7][features]
X_test = df_y1_10[df_y1_10["year"] >= 7][features]
y_train = df_y1_10[df_y1_10["year"] < 7][target]
y_test = df_y1_10[df_y1_10["year"] >= 7][target]

model = LogisticRegression(C=0.1, penalty= 'l2', solver= 'newton-cg', tol= 0.001, max_iter= 10000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_train=model.predict(X_train)

pred=model.predict(df_y11[features])

# Step 1: Convert predictions to a DataFrame
pred_df = pd.DataFrame(pred, columns=['label'])

# Step 2: Subset the original DataFrame to get the corresponding 'tmID' values
# for the test set
test_teams = df_y11[df_y11["year"] >= 7][['tmID']].reset_index(drop=True)

# Step 3: Join the DataFrames
result_df = pd.concat([test_teams, pred_df], axis=1)

result_df

Unnamed: 0,tmID,label
0,WAS,Y
1,TUL,Y
2,SEA,Y
3,SAS,Y
4,PHO,Y
5,NYL,Y
6,MIN,Y
7,LAS,Y
8,IND,Y
9,CON,Y
