# Training the competition model

Firstly load all data.

In [2]:
original_json_file_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
year_11_json_file_path = "./datasets/competition_final/generated/final_year_11_data.json"

In [3]:
import pandas as pd

original_json_file_path = "./datasets/competition_final/generated/final_year_1_to_10_data.json"
year_11_json_file_path = "./datasets/competition_final/generated/final_year_11_data.json"

df_original = pd.read_json(original_json_file_path)
df_year_11 = pd.read_json(year_11_json_file_path)

df_year_11.head(20)

Unnamed: 0,tmID,year,playoff,averageWinRate,averagePoints,averageRebounds,averageAssists,averageSteals,averageBlocks,averageTurnovers,averageFGRatio,averageFTRatio,averageThreeRatio,coachWinRate,numberOfAwardedPlayers
0,WAS,11,,0.470588,244.629371,122.927739,47.384615,29.587413,12.310023,50.428904,0.418658,0.749509,0.280573,0.5,2
1,TUL,11,,0.5,196.2625,87.358333,39.875,19.675,8.566667,38.229167,0.414552,0.780216,0.327587,0.5,10
2,SEA,11,,0.588235,274.577622,106.67972,55.923077,26.044755,11.078322,49.041958,0.412312,0.802031,0.310246,0.588235,8
3,SAS,11,,0.441176,318.134545,111.044848,67.1,34.910909,14.202424,50.506061,0.428546,0.80828,0.340362,0.441176,2
4,PHO,11,,0.676471,259.051748,113.709557,49.538462,21.234965,13.69324,47.2331,0.444417,0.801936,0.335072,0.676471,6
5,NYL,11,,0.382353,239.084416,93.4329,46.714286,19.831169,8.073593,40.969697,0.396254,0.808497,0.311745,0.428571,2
6,MIN,11,,0.411765,285.395041,127.13168,61.545455,30.919008,10.638567,50.823691,0.446331,0.781802,0.320708,0.5,3
7,LAS,11,,0.529412,249.046281,122.257851,66.0,26.052893,14.456198,54.231405,0.4205,0.807764,0.311575,0.5,6
8,IND,11,,0.647059,284.894215,111.692011,51.090909,33.229752,16.182369,50.046832,0.409415,0.817711,0.328738,0.647059,6
9,CON,11,,0.470588,239.584848,93.972727,49.5,22.59697,9.50303,45.674242,0.430382,0.791903,0.28632,0.470588,1


Train the model

In [22]:
# Model training...
from itertools import combinations, product
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Assume df is your DataFrame
# Define features and target
features = ['averageWinRate', 'averagePoints', 'averageRebounds', 'averageAssists',
            'averageSteals', 'averageBlocks', 'averageTurnovers', 'averageFGRatio',
            'averageFTRatio', 'averageThreeRatio', 'coachWinRate', 'numberOfAwardedPlayers']
target = 'playoff'

# Split data into training and test sets
X_train = df_original[df_original["year"] < 7][features + ['year']]
X_test = df_original[df_original["year"] >= 7][features + ['year']]
y_train = df_original[df_original["year"] < 7]["playoff"]
y_test = df_original[df_original["year"] >= 7]["playoff"]

df_year_only_train = X_train["year"]
df_year_only_test = X_test["year"]

best_accuracy_sum = 0
best_feature_combination = None
best_hyperparameters = None

# Define hyperparameters grid
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'tol': [1e-4, 1e-3, 1e-2],
    'max_iter': [10000]
}

# Generate all combinations of hyperparameters
all_params = [dict(zip(param_grid, v)) for v in product(*param_grid.values())]

# Iterate through all possible feature combinations with 2 to 6 features
results = pd.DataFrame(columns=['acc_train', 'acc_test', 'best_feature_combination', 'best_hyperparameters'])

for subset in tqdm(combinations(features, 2), desc="Feature Combinations"):
    selected_features = list(subset)
    selected_features.append('year')  # Always include 'year

    X_train_subset = X_train[selected_features]
    X_test_subset = X_test[selected_features]

    for params in all_params:
        # Create a Logistic Regression model with given hyperparameters
        model = LogisticRegression(**params)
        model.fit(X_train_subset, y_train)
        
        X_train = pd.concat([X_train, df_year_only_train], axis=1)
        X_test = pd.concat([X_test, df_year_only_test], axis=1)

        # Evaluate the model on the test set
        y_pred_test = model.predict(X_test_subset)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        y_pred_train = model.predict(X_train_subset)
        accuracy_train = accuracy_score(y_train, y_pred_train)

        # Check if this combination is the best so far
        accuracy_sum = accuracy_train + accuracy_test

        if accuracy_test > 0.65:
            temp_row = pd.DataFrame([{
                'acc_train': accuracy_train,
                'acc_test': accuracy_test,
                'best_feature_combination': str(selected_features),
                'best_hyperparameters': str(params)
            }])
            results = pd.concat([results, temp_row])
    
    break

df_cd_json = results.to_json(orient="records")
new_file = open("./datasets/competition_final/generated/model_evaluation_by_accuracy.json", "w")
new_file.writelines(df_cd_json)
new_file.close()

results.head()
 

Feature Combinations: 0it [00:00, ?it/s]

Feature Combinations: 0it [00:00, ?it/s]


Unnamed: 0,acc_train,acc_test,best_feature_combination,best_hyperparameters
0,0.680556,0.666667,"['averageWinRate', 'averagePoints', 'year']","{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-..."
0,0.680556,0.666667,"['averageWinRate', 'averagePoints', 'year']","{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-..."
0,0.680556,0.666667,"['averageWinRate', 'averagePoints', 'year']","{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-..."
0,0.680556,0.666667,"['averageWinRate', 'averagePoints', 'year']","{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs',..."
0,0.680556,0.666667,"['averageWinRate', 'averagePoints', 'year']","{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs',..."
