In [163]:
import numpy as np
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# załadowanie pliku z przygotowanymi danymi
data = pd.read_csv('..\\data\\merged_season_stats.csv')

# wyodrębnienie wierszy, które wskazują na zwycięzce w danym sezonie
# winners = data[data['championshipWinner'] == 1].copy()
# data = data[data['championshipWinner'] != 1].copy()

# randomizacja kolejności zwycięzców
# winners = winners.sample(frac=1, random_state=42).reset_index(drop=True)

# podział zwycięzców na 2 równe części
# half = len(winners) // 2
# winners_train = winners.iloc[:half]
# winners_test = winners.iloc[half:]

# funkcja skalująca
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(data))
# winners_train_scaled = pd.DataFrame(scaler.transform(winners_train))
# winners_test_scaled = pd.DataFrame(scaler.transform(winners_test))

# podział danych na zbiory uczące i testowe
data_train, data_test = train_test_split(scaled_data, test_size=0.2)
x_train = pd.DataFrame(data_train[data_train.columns[:-1]])
y_train = pd.Series(data_train[data_train.columns[-1]])
x_test = pd.DataFrame(data_test[data_test.columns[:-1]])
y_test = pd.Series(data_test[data_test.columns[-1]])

# złączenie tabel ze zbioru uczącego i testowego 
# z tabelami zawierającymi zwycięzców w danym sezonie
# x_train = pd.concat([x_train, winners_train_scaled[winners_train_scaled.columns[:-1]]])
# y_train = pd.concat([y_train, winners_train_scaled[winners_train_scaled.columns[-1]]])

# x_test = pd.concat([x_test, winners_test_scaled[winners_test_scaled.columns[:-1]]])
# y_test = pd.concat([y_test, winners_test_scaled[winners_test_scaled.columns[-1]]])

# kodowanie wektora y
label_encoder = LabelEncoder()
y_train_encoded = pd.Series(label_encoder.fit_transform(y_train))
y_test_encoded = pd.Series(label_encoder.fit_transform(y_test))

In [164]:
# definicja modelu
model = LogisticRegression()
model_reg = LogisticRegression(penalty="l2", C=0.1, random_state=44)
model.fit(x_train, y_train_encoded)
model_reg.fit(x_train, y_train_encoded)

# predykcja modelu
y_predicted = model.predict(x_test)
y_predicted_reg = model_reg.predict(x_test)

# ewaluacja modelu
accuracy = accuracy_score(y_test_encoded, y_predicted)
precision = precision_score(y_test_encoded, y_predicted)
recall = recall_score(y_test_encoded, y_predicted)
f1 = f1_score(y_test_encoded, y_predicted)
accuracy_reg = accuracy_score(y_test_encoded, y_predicted)
precision_reg = precision_score(y_test_encoded, y_predicted)
recall_reg = recall_score(y_test_encoded, y_predicted)
f1_reg = f1_score(y_test_encoded, y_predicted)
rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
rmse_reg = np.sqrt(mean_squared_error(y_test, y_predicted_reg))

print(f"Accuracy: \t\t\t\t\t\t\t{accuracy}")
print(f"Accuracy with ridge regularization: {accuracy_reg}")
print(f"Precision: \t\t\t\t\t\t\t{precision}")
print(f"Precision with ridge regularization:{precision_reg}")
print(f"Recall: \t\t\t\t\t\t\t{recall}")
print(f"Recall with ridge regularization: \t{recall_reg}")
print(f"F1-Score: \t\t\t\t\t\t\t{f1}")
print(f"F1-Score with ridge regularization: {f1_reg}")
print(f"RMSE: \t\t\t\t\t\t{rmse}")
print(f"RMSE with regularization: \t{rmse_reg}")


Accuracy: 							0.9907407407407407
Accuracy with ridge regularization: 0.9907407407407407
Precision: 							0.6666666666666666
Precision with ridge regularization:0.6666666666666666
Recall: 							1.0
Recall with ridge regularization: 	1.0
F1-Score: 							0.8
F1-Score with ridge regularization: 0.8
RMSE: 						0.574097892857928
RMSE with regularization: 	0.5625946513705676


In [165]:
# Add the predicted values as a new column to the test dataset
data_test_with_predictions = x_test.copy()  # Copy feature data
data_test_with_predictions['actual_value'] = y_test.values  # Add actual values
data_test_with_predictions['predicted_value'] = label_encoder.inverse_transform(y_predicted)  # Add predicted values

# Display the table for comparison
data_test_with_predictions

Unnamed: 0,0,1,2,3,actual_value,predicted_value
106,1.074547,0.374321,-0.373128,-1.277453,-0.206085,-0.206085
404,-0.788601,-0.363671,-0.373128,1.155336,-0.206085,-0.206085
183,-0.221020,-0.103653,-0.373128,0.146044,-0.206085,-0.206085
244,0.827773,0.262642,-0.373128,-1.125756,-0.206085,-0.206085
509,0.099200,0.044486,-0.373128,0.388935,-0.206085,-0.206085
...,...,...,...,...,...,...
121,0.043900,-0.259336,-0.373128,-0.671259,-0.206085,-0.206085
110,0.697127,0.396171,-0.373128,-1.525311,-0.206085,-0.206085
21,-0.251866,-0.126869,-0.373128,-1.063982,-0.206085,-0.206085
354,-1.302282,-0.922318,0.590935,0.549939,-0.206085,-0.206085
