In [1]:
import pandas as pd
import numpy as np

In [30]:
# Concat split and test df for researching and custom split
train = pd.read_csv('train.csv')

test = pd.read_csv('test.csv')
labels = pd.read_csv('gender_submission.csv')

test = test.merge(labels, on='PassengerId', how='left')

df = pd.concat([train, test]).reset_index(drop=True)

In [31]:
# Some preprocessing and feature eng

In [32]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')

In [33]:
df['age_cat'] = pd.cut(df['Age'], bins=[0., 14., 24., 30., 44., np.inf], labels = [1,2,3,4,5])

In [34]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size = 0.2, random_state = 42)

for train_index, test_index in split.split(df, df['age_cat']):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [35]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop('age_cat', axis=1, inplace=True)

In [36]:
corr_matrix = strat_train_set.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1).corr()
corr_matrix['Survived'].sort_values()

Pclass     -0.291628
Age        -0.046647
SibSp      -0.014743
Parch       0.109316
Fare        0.251050
Survived    1.000000
Name: Survived, dtype: float64

In [37]:
# Тренировочный набор
X = strat_train_set.drop('Survived', axis=1)
y = strat_train_set['Survived'].copy()

In [91]:
# Тестовый набор
X = strat_test_set.drop('Survived', axis=1)
y = strat_test_set['Survived'].copy()

In [38]:
from sklearn.base import BaseEstimator, TransformerMixin


class Transformator(BaseEstimator, TransformerMixin):
    
    def __init__(self, child = True, alone=True):
        self.child = child
        self.alone = alone
        self.age_x = 4
        self.sibsp_x = 5
        self.parch_x = 6

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.values
        if self.child == True:
            child = (X[:, self.age_x] <= 16).astype(int)
            X = np.column_stack((X, child))
        if self.alone == True:
            alone = ((X[:, self.sibsp_x] == 0) & (X[:, self.parch_x] == 0)).astype(int)
            X = np.column_stack((X, alone))
        # if self.rel_d == True:
        #     X = np.delete(X, sibsp_x, axis=1)

        return X

class AgeFare(BaseEstimator, TransformerMixin):
    def __init__(self, fare_im = True):
        self.age_x = 4
        self.fare_x = 8
        self.pclass_x = 1
        self.fare_im = fare_im
        self.age_imputer = SimpleImputer(strategy='mean')
        self.age_scaler = StandardScaler()
        self.fare_scaler = StandardScaler()
        
    def fit(self, X, y=None):
        self.age_imputer.fit(X[:, [self.age_x]])
        self.age_scaler.fit(X[:, [self.age_x]])


        if self.fare_im == True:
            self.fare_scaler.fit(X[:, [self.fare_x]])
        
        return self

    def transform(self, X, y=None):
        X[:, self.age_x] = self.age_imputer.transform(X[:, [self.age_x]]).flatten()
        X[:, self.age_x] = self.age_scaler.transform(X[:, [self.age_x]]).flatten()

        if self.fare_im:
            mask = X[:, self.pclass_x] == 3

            fares = pd.to_numeric(X[mask, self.fare_x], errors='coerce')

            mean_fare = np.nanmean(fares)

            fares[np.isnan(fares)] = mean_fare

            X[mask, self.fare_x] = fares

            X[:, self.fare_x] = self.fare_scaler.transform(X[:, [self.fare_x]]).flatten()

        return X

class Remover(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pass_id = 0
        self.pclass = 1
        self.name = 2
        self.sex = 3
        self.sp = 5
        self.parch = 6
        self.ticket = 7
        self.cabin = 9
        self.emb = 10

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = np.delete(X, [self.pass_id, self.pclass, self.name, self.sex, self.sp, self.parch, self.ticket, self.cabin, self.emb], axis=1)

        return X

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('step_1', Transformator(child = False)),
    ('step_2', AgeFare()),
    ('step_3', Remover())
])

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

att_ss = ['Parch']
att_oe = ['Sex']
att_ohe = ['Pclass', 'Embarked']

pipe = ColumnTransformer([
    ('step_1', pipeline, list(X)),
    ('step_2', StandardScaler(), att_ss),
    ('step_3', OrdinalEncoder(), att_oe),
    ('step_4', OneHotEncoder(), att_ohe),
    
]
)

In [92]:
X_prepared = pipe.fit_transform(X)
X_prepared = np.delete(X_prepared, [5, 6, 8, 9, 10], axis = 1)

In [42]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

#### Логистическая регрессия

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()

model.fit(X_prepared, y)

In [44]:
predictions = model.predict(X_prepared)

In [45]:
accuracy_score(y, predictions)

0.8576886341929322

In [46]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()

param_lin = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga', 'lbfgs'],  # 'liblinear' поддерживает только 'l1' и 'l2'
    'max_iter': [100, 200, 500]
}

grid_model = GridSearchCV(model, param_lin, cv=skf, scoring='accuracy', verbose=1)

In [47]:
grid_model.fit(X_prepared, y)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits


900 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "/home/m-urv/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/m-urv/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/m-urv/anaconda3/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver

In [48]:
print("Лучшие параметры:", grid_model.best_params_)
print("Лучшая точность:", grid_model.best_score_)

Лучшие параметры: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Лучшая точность: 0.8557875457875458


In [119]:
from sklearn.model_selection import cross_val_score

scores_log_reg = cross_val_score(grid_model.best_estimator_,
                          X_prepared,
                          y,
                          scoring='accuracy', cv=skf)

In [120]:
scores_log_reg.mean()

0.8591168091168091

#### Деревья решений (классификация)

In [55]:
from sklearn.ensemble import RandomForestClassifier

ran_for = RandomForestClassifier()
ran_for.fit(X_prepared, y)

In [56]:
from sklearn.metrics import r2_score

y_pred = ran_for.predict(X_prepared)
r2 = r2_score(y, y_pred)
acc = accuracy_score(y, y_pred)

In [57]:
r2

0.9386432361053594

In [58]:
acc

0.9856733524355301

In [59]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(ran_for, parameters, cv=skf, scoring='accuracy', verbose=1)

In [60]:
grid_search.fit(X_prepared, y)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


In [61]:
print("Лучшие параметры:", grid_search.best_params_)
print("Лучшая точность:", grid_search.best_score_)

Лучшие параметры: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Лучшая точность: 0.8653479853479853


In [62]:
importances = ran_for.feature_importances_

In [121]:
from sklearn.model_selection import cross_val_score

scores_ran_for = cross_val_score(grid_search.best_estimator_,
                          X_prepared,
                          y,
                          scoring='accuracy', cv=skf)

In [122]:
scores_ran_for.mean()

0.8591168091168091

#### Метод опорных векторов

In [70]:
from sklearn.svm import SVC

In [71]:
svc = SVC(kernel = 'linear')

svc.fit(X_prepared, y)

In [72]:
pred = svc.predict(X_prepared)

In [73]:
accuracy_score(y, pred)

0.8538681948424068

In [74]:
svc = SVC()
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4]  # только если kernel='poly'
}

grid_svc = GridSearchCV(svc, param_grid, cv=skf, scoring='accuracy', verbose=1)

grid_svc.fit(X_prepared, y)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


In [75]:
print("Лучшие параметры:", grid_svc.best_params_)
print("Лучшая точность:", grid_svc.best_score_)

Лучшие параметры: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Лучшая точность: 0.8691575091575091


In [123]:
from sklearn.model_selection import cross_val_score

scores_svc = cross_val_score(grid_svc.best_estimator_,
                          X_prepared,
                          y,
                          scoring='accuracy', cv=skf)
scores_svc.mean()

0.8246438746438747

#### Градиентный бустинг

In [78]:
from sklearn.ensemble import GradientBoostingClassifier

In [79]:
GBC = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1, max_depth = 3)

GBC.fit(X_prepared, y)

In [80]:
pred_gbc = GBC.predict(X_prepared)
accuracy_score(y, pred_gbc)

0.9035339063992359

In [81]:
gbc = GradientBoostingClassifier()

param_grid_gbc = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

grid_gbc = GridSearchCV(gbc, param_grid_gbc, cv=skf, scoring='accuracy', verbose=1)

In [82]:
grid_gbc.fit(X_prepared, y)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


In [83]:
print("Лучшие параметры:", grid_gbc.best_params_)
print("Лучшая точность:", grid_gbc.best_score_)

Лучшие параметры: {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 100}
Лучшая точность: 0.8691941391941391


In [124]:
from sklearn.model_selection import cross_val_score

scores_gbc = cross_val_score(grid_gbc.best_estimator_,
                          X_prepared,
                          y,
                          scoring='accuracy', cv=skf)
scores_gbc.mean()

0.8437321937321938

##### Выбор финальной модели

In [116]:
print(f'TRAIN: Cross validation accuracy RFC: {round(scores_ran_for.mean(),3)}')
print(f'TRAIN: Cross validation accuracy LogR: {round(scores_log_reg.mean(), 3)}')
print(f'TRAIN: Cross validation accuracy SVC: {round(scores_svc.mean(), 3)}')
print(f'TRAIN: Cross validation accuracy GBC: {round(scores_gbc.mean(), 3)}')

TRAIN: Cross validation accuracy RFC: 0.862
TRAIN: Cross validation accuracy LogR: 0.856
TRAIN: Cross validation accuracy SVC: 0.869
TRAIN: Cross validation accuracy GBC: 0.867


In [125]:
print(f'TEST: Cross validation accuracy RFC: {round(scores_ran_for.mean(),3)}')
print(f'TEST: Cross validation accuracy LogR: {round(scores_log_reg.mean(),3)}')
print(f'TEST: Cross validation accuracy SVC: {round(scores_svc.mean(), 3)}')
print(f'TEST: Cross validation accuracy GBC: {round(scores_gbc.mean(), 3)}')

TEST: Cross validation accuracy RFC: 0.859
TEST: Cross validation accuracy LogR: 0.859
TEST: Cross validation accuracy SVC: 0.825
TEST: Cross validation accuracy GBC: 0.844


In [145]:
final_model_RFC = grid_search.best_estimator_

import joblib

joblib.dump(final_model_RFC, 'RFC.pkl')

['RFC.pkl']

In [144]:
final_model_LG = grid_model.best_estimator_

import joblib

joblib.dump(final_model_LG, 'LG.pkl')

['LG.pkl']

In [106]:
# Тренировочный набор
X = strat_train_set.drop('Survived', axis=1)
y = strat_train_set['Survived'].copy()

In [117]:
# Тестовый набор
X = strat_test_set.drop('Survived', axis=1)
y = strat_test_set['Survived'].copy()

In [118]:
X_prepared = pipe.fit_transform(X)
X_prepared = np.delete(X_prepared, [5, 6, 8, 9, 10], axis = 1)

In [138]:
lg_pred = grid_model.best_estimator_.predict(X_prepared)
lg_pred_proba = grid_model.best_estimator_.predict_proba(X_prepared)

In [139]:
accuracy_score(y, lg_pred)

0.8549618320610687

In [140]:
rf_pred = grid_search.best_estimator_.predict(X_prepared)
rf_pred_proba = grid_search.best_estimator_.predict_proba(X_prepared)

In [141]:
accuracy_score(y, rf_pred)

0.8625954198473282

In [142]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print(classification_report(y, lg_pred))
print("ROC AUC score:", roc_auc_score(y, lg_pred_proba[:, 1]))

# Матрица ошибок
conf_matrix = confusion_matrix(y, lg_pred)
print("Confusion Matrix:\n", conf_matrix)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       157
           1       0.85      0.78      0.81       105

    accuracy                           0.85       262
   macro avg       0.85      0.84      0.85       262
weighted avg       0.85      0.85      0.85       262

ROC AUC score: 0.8584470730967545
Confusion Matrix:
 [[142  15]
 [ 23  82]]


In [143]:
print(classification_report(y, rf_pred))
print("ROC AUC score:", roc_auc_score(y, rf_pred_proba[:, 1]))

# Матрица ошибок
conf_matrix = confusion_matrix(y, rf_pred)
print("Confusion Matrix:\n", conf_matrix)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       157
           1       0.86      0.78      0.82       105

    accuracy                           0.86       262
   macro avg       0.86      0.85      0.85       262
weighted avg       0.86      0.86      0.86       262

ROC AUC score: 0.8751592356687898
Confusion Matrix:
 [[144  13]
 [ 23  82]]
