In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [10]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
train['Age'].fillna(train['Age'].median(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)
test['Fare'].fillna(test['Fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object

In [12]:
# For training data
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})

# For test data
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1})

In [13]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone']

X = train[features]
y = train['Survived']
X_test = test[features]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

clf = XGBClassifier(
    random_state=42,
    n_estimators=400,
    max_depth=12,
    learning_rate=0.01,
    scale_pos_weight=1.5,
    subsample=0.8,
    colsample_bytree=1,
)

clf.fit(X_train, y_train)

# Predikce na všech sadách
y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)  # ← TOTO TI CHYBĚLO!
y_test_pred = clf.predict(X_test)  # Jen pro submission

# Vyhodnocení (bez % - balanced_accuracy je 0-1)
print("=== Vyhodnocení na trénovací sadě ===")
print(f"Balanced accuracy na trénovací sadě: {balanced_accuracy_score(y_train, y_train_pred):.3f}")
print("Matice záměn:\n", confusion_matrix(y_train, y_train_pred))
print("Report:\n", classification_report(y_train, y_train_pred))

print("\n=== Vyhodnocení na validační sadě ===")
print(f"Balanced accuracy na validační sadě: {balanced_accuracy_score(y_val, y_val_pred):.3f}")
print("Matice záměn:\n", confusion_matrix(y_val, y_val_pred))
print("Report:\n", classification_report(y_val, y_val_pred))

# Testovací sada - jen pro informaci (nemáme y_test)
print(f"\n=== Predikce na testovací sadě ===")
print(f"Predikovaná míra přežití: {y_test_pred.mean():.3f}")
print(f"Celkem predikcí: {len(y_test_pred)}")

=== Vyhodnocení na trénovací sadě ===
Balanced accuracy na trénovací sadě: 0.918
Matice záměn:
 [[424  20]
 [ 32 236]]
Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       444
           1       0.92      0.88      0.90       268

    accuracy                           0.93       712
   macro avg       0.93      0.92      0.92       712
weighted avg       0.93      0.93      0.93       712


=== Vyhodnocení na validační sadě ===
Balanced accuracy na validační sadě: 0.802
Matice záměn:
 [[89 16]
 [18 56]]
Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84       105
           1       0.78      0.76      0.77        74

    accuracy                           0.81       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179


=== Predikce na testovací sadě ===
Predikovaná míra přežití: 0.395
Celkem predikcí: 418


In [15]:
feature_imp = pd.DataFrame({
    'feature': features,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_imp)

      feature  importance
1         Sex    0.679428
0      Pclass    0.127662
3       SibSp    0.057328
6  FamilySize    0.041459
5        Fare    0.032920
2         Age    0.032137
4       Parch    0.029066
7     IsAlone    0.000000


In [16]:
submission = pd.read_csv('submission.csv')

In [17]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [18]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,1,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,,S,2,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,1,1
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,1,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,27.0,0,0,A.5. 3236,8.0500,,S,1,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,C,1,1
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,1,1
416,1308,3,"Ware, Mr. Frederick",0,27.0,0,0,359309,8.0500,,S,1,1
