In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# załadowanie pliku z przygotowanymi danymi
data = pd.read_csv('..\\data\\merged_season_stats.csv')

# wyodrębnienie wierszy, które wskazują na zwycięzce w danym sezonie
winners = data[data['championshipWinner'] == 1].copy()
data = data[data['championshipWinner'] != 1].copy()

# randomizacja kolejności zwycięzców
winners = winners.sample(frac=1, random_state=42).reset_index(drop=True)

# podział zwycięzców na 2 równe części
half = len(winners) // 2
winners_train = winners.iloc[:half]
winners_test = winners.iloc[half:]

# funkcja skalująca
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(data))
winners_train_scaled = pd.DataFrame(scaler.transform(winners_train))
winners_test_scaled = pd.DataFrame(scaler.transform(winners_test))

# podział danych na zbiory uczące i testowe
data_train, data_test = train_test_split(scaled_data, test_size=0.2)
x_train = pd.DataFrame(data_train[data_train.columns[:-1]])
y_train = pd.Series(data_train[data_train.columns[-1]])
x_test = pd.DataFrame(data_test[data_test.columns[:-1]])
y_test = pd.Series(data_test[data_test.columns[-1]])

# złączenie tabel ze zbioru uczącego i testowego 
# z tabelami zawierającymi zwycięzców w danym sezonie
x_train = pd.concat([x_train, winners_train_scaled[winners_train_scaled.columns[:-1]]])
y_train = pd.concat([y_train, winners_train_scaled[winners_train_scaled.columns[-1]]])

x_test = pd.concat([x_test, winners_test_scaled[winners_test_scaled.columns[:-1]]])
y_test = pd.concat([y_test, winners_test_scaled[winners_test_scaled.columns[-1]]])

# kodowanie wektora y
label_encoder = LabelEncoder()
y_train_encoded = pd.Series(label_encoder.fit_transform(y_train))
y_test_encoded = pd.Series(label_encoder.fit_transform(y_test))

In [58]:
# definicja modelu
model = LogisticRegression()
model.fit(x_train, y_train_encoded)

# predykcja modelu
y_predicted = model.predict(x_test)

# ewaluacja modelu
accuracy = accuracy_score(y_test_encoded, y_predicted)
precision = precision_score(y_test_encoded, y_predicted)
recall = recall_score(y_test_encoded, y_predicted)
f1 = f1_score(y_test_encoded, y_predicted)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.9478260869565217
Precision: 1.0
Recall: 0.45454545454545453
F1-Score: 0.625


In [59]:
# Add the predicted values as a new column to the test dataset
data_test_with_predictions = x_test.copy()  # Copy feature data
data_test_with_predictions['actual_value'] = y_test.values  # Add actual values
data_test_with_predictions['predicted_value'] = label_encoder.inverse_transform(y_predicted)  # Add predicted values

# Display the table for comparison
data_test_with_predictions

Unnamed: 0,0,1,2,3,actual_value,predicted_value
328,-1.041277,-0.132874,-0.362798,1.284615,0.0,0.0
140,-0.161073,-1.323708,-0.362798,-0.889760,0.0,0.0
486,-1.328796,-1.382856,0.384729,0.735304,0.0,0.0
380,1.749695,1.880838,-0.362798,0.208450,0.0,0.0
444,-1.538679,-1.157573,-0.362798,1.154485,0.0,0.0
...,...,...,...,...,...,...
6,-2.025717,-1.537374,5.523977,1.001727,1.0,0.0
7,-2.208021,-1.531993,5.742006,1.080055,1.0,0.0
8,-1.566312,-1.983450,3.768272,0.811268,1.0,0.0
9,-2.464397,-2.054822,7.899343,1.084376,1.0,1.0
