In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import yaml
import joblib
import os

seed=456

In [8]:
##  Extracting data
with open("../config.yaml", "r") as f:
    config = yaml.safe_load(f)

model_path = config["model"]["path"]
output_path = config["output"]["path"]

train_df = pd.read_csv(config["data"]["processed_train_path"])
test_df = pd.read_csv(config["data"]["processed_test_path"])

# Define our features (X) and target (y)
features = [col for col in train_df.columns if col not in ['battle_id', 'player_won']]
X_train = train_df[features]
y_train = train_df['player_won']
#print(features)

# Scaling all the data
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test = scaler.transform(test_df[features])



KeyError: 'player_won'

## Logistic Regression

In [9]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression(random_state=seed)

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [5000]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Migliori iperparametri trovati:", grid_search.best_params_)
print("Best accuracy in CV:", grid_search.best_score_)


best_model = grid_search.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Migliori iperparametri trovati: {'C': 100, 'max_iter': 5000, 'penalty': 'l2', 'solver': 'liblinear'}
Best accuracy in CV: 0.8051


In [4]:
grid_search.cv_results_

{'mean_fit_time': array([ 0.33002639,  0.23969507,  0.58567228,  0.39860835,  1.22505851,
         2.03164034,  2.64925656, 10.28029566,  3.64207182, 32.97048173,
         3.80582242, 38.35050764]),
 'std_fit_time': array([0.03074803, 0.03248477, 0.06942841, 0.05420295, 0.31453867,
        0.33875925, 0.28599107, 0.39071159, 0.36603899, 2.64105369,
        0.31037185, 1.73493911]),
 'mean_score_time': array([0.00468583, 0.0037075 , 0.00377154, 0.00455618, 0.00425205,
        0.00765996, 0.00379   , 0.00165033, 0.00300794, 0.00490422,
        0.0023665 , 0.00117726]),
 'std_score_time': array([2.99103806e-03, 7.68562703e-04, 3.88056161e-04, 5.77450192e-04,
        1.50845578e-03, 8.25461984e-03, 9.62557436e-04, 2.70506236e-04,
        4.63455237e-04, 1.24911818e-03, 7.88924569e-04, 9.78466363e-05]),
 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1.0, 1.0, 10.0,
                    10.0, 100.0, 100.0],
              mask=[False, False, False, False, False, False, Fals

In [5]:
# Testing the prediction on the test set

print("Generating predictions on the test set...")
test_predictions = best_model.predict(X_test)

Generating predictions on the test set...


In [6]:

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'battle_id': test_df['battle_id'],
    'player_won': test_predictions
})
# Save the DataFrame to a .csv file
submission_df.to_csv(output_path, index=False)

print("\n'submission.csv' file created successfully!")


'submission.csv' file created successfully!
