In [12]:
%pip install pandas matplotlib seaborn statsmodels

Note: you may need to restart the kernel to use updated packages.


In [13]:
import sys
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report

In [14]:
# --- Set up the environment ---
test_filename = os.path.join(os.getcwd(), 'ds04/assets/Test_knight.csv')
train_filename = os.path.join(os.getcwd(), 'ds04/assets/Train_knight.csv')

In [15]:
# --- Load the data to pandas dataframes ---
test_df = pd.read_csv(test_filename)
train_df = pd.read_csv(train_filename)
train_df.set_index('knight')
train_df.head(3)

Unnamed: 0,Sensitivity,Hability,Strength,Power,Agility,Dexterity,Awareness,Prescience,Reactivity,Midi-chlorien,...,Evade,Stims,Sprint,Combo,Delay,Attunement,Empowered,Burst,Grasping,knight
0,11.8,17.26,75.26,431.9,0.09087,0.06232,0.02853,0.01638,0.1847,0.06019,...,24.49,86.0,562.0,0.1244,0.1726,0.1449,0.05356,0.2779,0.08121,Sith
1,19.81,22.15,130.0,1260.0,0.09831,0.1027,0.1479,0.09498,0.1582,0.05395,...,30.88,186.8,2398.0,0.1512,0.315,0.5372,0.2388,0.2768,0.07615,Jedi
2,10.97,17.2,71.73,371.5,0.08915,0.1113,0.09457,0.03613,0.1489,0.0664,...,26.87,90.14,476.4,0.1391,0.4082,0.4779,0.1555,0.254,0.09532,Sith


In [16]:
# --- Separate features and label ---
le = LabelEncoder()
scaler = StandardScaler()
X = train_df.drop(columns=['knight'])
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
y = train_df['knight']
y_encoded = le.fit_transform(y)  # 'Jedi' → 1, 'Sith' → 0 automatically

# --- set up the train validation and test ---
# X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# X_test = test_df.copy()

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)
X_test = pd.DataFrame(scaler.fit_transform(test_df), columns=X.columns)
print(y_encoded[:5])
print(y[:5])

[1 0 1 1 1]
0    Sith
1    Jedi
2    Sith
3    Sith
4    Sith
Name: knight, dtype: object


In [17]:
# --- Define models ---
rf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42)
knn = KNeighborsClassifier(n_neighbors=15)
logreg = LogisticRegression(max_iter=500)

# --- Voting Classifier ---
voting = VotingClassifier(estimators=[
    ('rf', rf),
    ('knn', knn),
    ('logreg', logreg)
], voting='hard')

# --- Train and evaluate ---
voting.fit(X_train, y_train)
y_val_pred = voting.predict(X_val)
f1 = f1_score(y_val, y_val_pred)
print(f"F1-score: {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_val, y_val_pred, target_names=le.classes_))
print("train prediction: ", le.inverse_transform(y_val_pred)[:10])

F1-score: 0.9792

Classification Report:
               precision    recall  f1-score   support

        Jedi       1.00      0.94      0.97        33
        Sith       0.96      1.00      0.98        47

    accuracy                           0.97        80
   macro avg       0.98      0.97      0.97        80
weighted avg       0.98      0.97      0.97        80

train prediction:  ['Sith' 'Sith' 'Sith' 'Sith' 'Sith' 'Sith' 'Sith' 'Sith' 'Jedi' 'Jedi']


In [18]:
# --- Show model classification report ---
for name, model in voting.named_estimators_.items():
    y_pred_detail = model.predict(X_val)
    print(f"\n--- {name.upper()} ---")
    report = classification_report(y_val, y_pred_detail, target_names=le.classes_, output_dict=True)
    print('accuracy: ', report['accuracy'])
    print('f1-score:', report['macro avg']['f1-score'])


--- RF ---
accuracy:  0.975
f1-score: 0.9742101869761444

--- KNN ---
accuracy:  0.975
f1-score: 0.9739583333333333

--- LOGREG ---
accuracy:  0.975
f1-score: 0.9739583333333333


In [19]:
# --- Final training on full data ---
voting.fit(X_scaled, y_encoded)

# --- Predict on test set ---
y_test_pred = voting.predict(test_df)
predicted_labels = le.inverse_transform(y_test_pred)
print(y_test_pred[:10])
print(predicted_labels[:10])

[0 0 0 0 0 0 0 0 0 0]
['Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi' 'Jedi']


In [20]:
# --- Export to Voting.txt ---
dir = os.path.join(os.getcwd(), 'ds04/ex06')
filename = os.path.join(dir, 'Voting.txt')

with open(filename, "w") as f:
    for label in predicted_labels:
        f.write(label + '\n')

print("Voting predictions saved to Voting.txt")

Voting predictions saved to Voting.txt
