In [448]:
import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [450]:
from sklearn.linear_model import LogisticRegression

Считаем данные

In [453]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
submission = pd.read_csv('submission_example.csv')

In [455]:
data_train['Relatives'] = data_train['SibSp'] + data_train['Parch']
data_test['Relatives'] = data_test['SibSp'] + data_test['Parch']

In [457]:
data_train = data_train.drop(columns=['SibSp', 'Parch'])
data_test = data_test.drop(columns=['SibSp', 'Parch'])

Удалим из датасетов данные, которые точно никак не могут повлиять на целевую переменную. (Оставим PasssengerId в data_test, так как это понадобится нам при формировании искомого ответа)

In [460]:
data_train = data_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
data_test = data_test.drop(columns=['Name', 'Ticket', 'Cabin'])

Применим one-hot-encoding к категориальным признакам

In [463]:
data_train = pd.get_dummies(data_train, columns=['Embarked', 'Sex', 'Pclass'], dtype=int)
data_test = pd.get_dummies(data_test, columns=['Embarked', 'Sex', 'Pclass'], dtype=int)

In [465]:
data_train['Age'] = data_train['Age'].fillna(data_train[~data_train['Age'].isna()]['Age'].mean())
data_train['Fare'] = data_train['Fare'].fillna(data_train[~data_train['Fare'].isna()]['Fare'].mean())
data_test['Age'] = data_test['Age'].fillna(data_test[~data_test['Age'].isna()]['Age'].mean())
data_test['Fare'] = data_test['Fare'].fillna(data_test[~data_test['Fare'].isna()]['Fare'].mean())

Удалим один из столбцов для каждого категриального признака, ведь по значению остальных можно однозначно определить значение, которое будет в удалённом столбце. Например, если Sex_male = 0, то Sex_female точно = 1.

In [468]:
data_train = data_train.drop(columns=['Sex_female', 'Embarked_C', 'Pclass_1'])
data_test = data_test.drop(columns=['Sex_female', 'Embarked_C', 'Pclass_1'])

Нормализуем числовые данные

In [471]:
mn1, mx1 = data_train['Age'].min(), data_train['Age'].max()
data_train['Age'] = (data_train['Age'] - mn1) / (mx1 - mn1)
data_test['Age'] = (data_test['Age'] - mn1) / (mx1 - mn1)

mn2, mx2 = data_train['Fare'].min(), data_train['Fare'].max()
data_train['Fare'] = (data_train['Fare'] - mn2) / (mx2 - mn2)
data_test['Fare'] = (data_test['Fare'] - mn2) / (mx2 - mn2)

In [473]:
display(data_train, data_test, submission)

Unnamed: 0,Survived,Age,Fare,Relatives,Embarked_Q,Embarked_S,Sex_male,Pclass_2,Pclass_3
0,0,0.271174,0.014151,1,0,1,1,0,1
1,1,0.472229,0.139136,1,0,0,0,0,0
2,1,0.321438,0.015469,0,0,1,0,0,1
3,1,0.434531,0.103644,1,0,1,0,0,0
4,0,0.434531,0.015713,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...
886,0,0.334004,0.025374,0,0,1,1,1,0
887,1,0.233476,0.058556,0,0,1,0,0,0
888,0,0.367921,0.045771,3,0,1,0,0,1
889,1,0.321438,0.058556,0,0,0,1,0,0


Unnamed: 0,PassengerId,Age,Fare,Relatives,Embarked_Q,Embarked_S,Sex_male,Pclass_2,Pclass_3
0,892,0.428248,0.015282,0,1,0,1,0,1
1,893,0.585323,0.013663,1,0,1,0,0,1
2,894,0.773813,0.018909,0,1,0,1,1,0
3,895,0.334004,0.016908,0,0,1,1,0,1
4,896,0.271174,0.023984,2,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
413,1305,0.375127,0.015713,0,0,1,1,0,1
414,1306,0.484795,0.212559,0,0,0,0,0,0
415,1307,0.478512,0.014151,0,0,1,1,0,1
416,1308,0.375127,0.015713,0,0,1,1,0,1


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [475]:
y_train = data_train['Survived']
X_train = data_train.drop(columns=['Survived'])
X_test = data_test.drop(columns=['PassengerId'])

In [477]:
model = LogisticRegression(max_iter=99999999, random_state=228)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

Формируем ответ

In [480]:
submission = pd.DataFrame({
        "PassengerId": data_test["PassengerId"],
        "Survived": y_pred
    })
submission.to_csv('submission.csv', index=False)

In [482]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
