In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test tables, then preview the first training rows.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
    )
)
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Summary statistics restricted to object-dtype ('O') columns.
train.describe(include = ['O'])

In [None]:
# Number of passengers per home planet.
train["HomePlanet"].value_counts()

In [None]:
# Number of passengers per destination.
train["Destination"].value_counts()

In [None]:
# Column dtypes and non-null counts of the training table.
train.info()

In [None]:
# Largest age present in the training table.
train["Age"].max()

In [None]:
# Number of passengers per destination (same query as an earlier cell).
train["Destination"].value_counts()

Посмотрим процентное соотношение между «выжившими» (Transported = True) и «не выжившими» (Transported = False) пассажирами

In [None]:
# Share of each Transported class (normalised counts), drawn as a bar chart.
trainplt = train["Transported"].value_counts(normalize=True)
trainplt.plot.bar()

In [None]:
# Show the normalised Transported class shares as text.
print(trainplt)

Зависимость выживания от HomePlanet

In [None]:
# Mean of Transported for each HomePlanet value (no error bars).
sns.barplot(data=train, x="HomePlanet", y="Transported", errorbar=None)

Transported ~ CryoSleep

In [None]:
# Row-normalised CryoSleep x Transported contingency table as stacked bars.
tab = pd.crosstab(index=train["CryoSleep"], columns=train["Transported"])
row_totals = tab.sum(axis=1)
tab.div(row_totals, axis=0).plot.bar(stacked=True)

Transported ~ Destination

In [None]:
# Mean of Transported for each Destination value (no error bars).
sns.barplot(data=train, x="Destination", y="Transported", errorbar=None)

Transported ~ VIP

In [None]:
# Row-normalised VIP x Transported contingency table as stacked bars.
tab = pd.crosstab(index=train["VIP"], columns=train["Transported"])
row_totals = tab.sum(axis=1)
tab.div(row_totals, axis=0).plot.bar(stacked=True)

Разделим "Cabin" на "Deck", "CabNum" и "Board" 

In [None]:
# Split "Cabin" on "/" into three new columns in both frames:
# first piece -> Deck, second -> CabNum, third -> Board.
trt = [train, test]
for frame in trt:
    cabin_parts = frame["Cabin"].str.split("/", expand=True)
    frame[["Deck", "CabNum", "Board"]] = cabin_parts[[0, 1, 2]]
train.head()

Посмотрим на распределение Board и Deck в тренировочной и тестовой выборках, чтобы определить, чем заполнять пропуски (NaN)

In [None]:
# 2x2 grid of normalised value counts: rows are train / test,
# columns are Board / Deck.
fig, ax = plt.subplots(2, 2, figsize=(12, 6))
for row, frame in enumerate((train, test)):
    for col, feature in enumerate(("Board", "Deck")):
        frame[feature].value_counts(normalize=True).plot(kind="bar", ax=ax[row, col])

посмотрим на зависимость вероятности выживания от Board

In [None]:
# Mean of Transported for each Board value (no error bars).
sns.barplot(data=train, x="Board", y="Transported", errorbar=None)

Заменим P и S в Board на 0 и 1 (признак бинарный, OHE не требуется). Пропуски (NaN) в Board заполним случайными значениями 0/1, т. к. распределение приблизительно равномерное, а пропуски в Deck — случайным выбором из двух самых частых значений (первой и второй моды)

In [None]:
# Encode Board as 0/1 and fill the missing Board / Deck values by random sampling.
board_mapping = {"P": 0, "S": 1}
deck_options = ["F", "G"]  # candidate values used to fill missing Deck entries
rng = np.random.default_rng(42)  # fixed seed so the imputation is reproducible
for t in trt:
    # Only map the raw "P"/"S" labels. Skipping already-numeric data keeps the
    # cell safe to re-run (mapping 0/1 again would turn every value into NaN).
    if not pd.api.types.is_numeric_dtype(t["Board"]):
        t["Board"] = t["Board"].map(board_mapping)
    board_missing = t["Board"].isnull()
    # Missing Board -> random 0 or 1 (upper bound of `integers` is exclusive).
    t.loc[board_missing, "Board"] = rng.integers(0, 2, size=board_missing.sum())
    t["Board"] = t["Board"].astype(int)
    deck_missing = t["Deck"].isnull()
    # Missing Deck -> random pick among deck_options.
    t.loc[deck_missing, "Deck"] = rng.choice(deck_options, size=deck_missing.sum())
train.head()

Заполним пропуски в CabNum случайными значениями между минимумом и максимумом среди пассажиров с теми же Deck и Board, а также добавим признак CNB (CabNumBound) — номер интервала, в который попадает CabNum; смысл такого разбиения виден на гистограмме ниже

In [None]:
# Fill missing CabNum with a random cabin number drawn between the smallest and
# largest known number among passengers with the same Deck and Board, then
# derive CNB: the index of the fixed-width CabNum bucket the cabin falls into.
# NOTE(review): 37.86 is presumably the histogram bin width printed by the
# CabNum histogram cell below - confirm.
CABNUM_BUCKET_WIDTH = 37.86
rng = np.random.default_rng(42)  # fixed seed so the imputation is reproducible
for t in trt:
    # pd.to_numeric handles the string cabin numbers produced by str.split.
    t["CabNum"] = pd.to_numeric(t["CabNum"]).astype("Int64")
    missing = t["CabNum"].isnull()
    if missing.any():  # nothing to fill on a re-run; skipping keeps the cell re-runnable
        # One min/max lookup per (Deck, Board) group instead of re-filtering the
        # whole frame twice for every missing row.
        bounds = t.groupby(["Deck", "Board"])["CabNum"].agg(["min", "max"])
        keys = pd.MultiIndex.from_frame(t.loc[missing, ["Deck", "Board"]])
        # to_numpy(dtype="int64") raises if a group has no known cabin number,
        # instead of silently producing an invalid range.
        low = bounds["min"].reindex(keys).to_numpy(dtype="int64")
        high = bounds["max"].reindex(keys).to_numpy(dtype="int64") + 1  # exclusive upper bound
        t.loc[missing, "CabNum"] = rng.integers(low, high)
    t["CabNum"] = t["CabNum"].astype(int)
    t["CNB"] = np.floor(t["CabNum"] / CABNUM_BUCKET_WIDTH).astype(int)
print(train.CabNum.isnull().sum())

In [None]:
# Overlay CabNum histograms for transported (blue) and not transported (red)
# passengers, add a per-deck split violin plot, and print the width of the
# first histogram bar.
is_transported = train["Transported"]
tr_survived = train[is_transported]
tr_notsur = train[~is_transported]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 16))
hist_ax, violin_ax = ax[0], ax[1]
sns.histplot(tr_survived["CabNum"], bins=50, color="blue", ax=hist_ax)
h = sns.histplot(tr_notsur["CabNum"], bins=50, color="red", ax=hist_ax)
sns.violinplot(data=train, x="Deck", y="CabNum", hue="Transported", split=True, ax=violin_ax)
first_bar = h.patches[0]
print(first_bar.get_width())

«Визуализируем» зону поражения с помощью трёхмерного графика; выводов сделать не удалось.

In [None]:
# 3D scatter of (Board, CNB, Deck level), coloured by Transported:
# blue for True, red for False.
deck_mapping = {deck: level for level, deck in enumerate("ABCDEFGT")}
color_mapping = {True: "blue", False: "red"}
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection="3d")
deck_level = train["Deck"].map(deck_mapping)
point_colors = train["Transported"].map(color_mapping)
ax.scatter(train["Board"], train["CNB"], deck_level, c=point_colors)

посмотрим на зависимость между палубой и вероятностью выживания

In [None]:
# Mean of Transported for each Deck value (no error bars).
sns.barplot(data=train, x="Deck", y="Transported", errorbar=None)

применим OHE к deck

In [None]:
# One-hot encode Deck using a category list shared by both frames, so train and
# test always get the same Deck_* columns even if a deck is absent from one of
# them. The first category is dropped, as before.
deck_levels = sorted(set(train["Deck"].dropna()) | set(test["Deck"].dropna()))
deck_dtype = pd.CategoricalDtype(deck_levels)
train = pd.get_dummies(train.astype({"Deck": deck_dtype}), columns=["Deck"], drop_first=True)
test = pd.get_dummies(test.astype({"Deck": deck_dtype}), columns=["Deck"], drop_first=True)
train.head()

распределение кают

In [None]:
# Distribution of cabin numbers in the training table (50 bins).
sns.histplot(data=train, x="CabNum", bins=50)

посмотрим, сколько человек было в группе с каждым пассажиром, сразу удалим ненужные столбцы:

In [None]:
# PeopleNum = number of passengers in the same frame whose PassengerId starts
# with the same four digits (the group number). CNB is dropped afterwards.
for frame in (train, test):
    # Raw string avoids the invalid "\d" escape; expand=False yields a Series.
    group_num = frame["PassengerId"].str.extract(r"^(\d\d\d\d)", expand=False)
    frame["PeopleNum"] = group_num.map(group_num.value_counts())
    # errors="ignore" keeps the cell re-runnable once CNB has been removed.
    frame.drop(columns=["CNB"], inplace=True, errors="ignore")
with pd.option_context('display.max_columns', None):
    print(train.head())

зависимость выживания от числа пассажиров

In [None]:
# Mean of Transported for each group size (no error bars).
sns.barplot(data=train, x="PeopleNum", y="Transported", errorbar=None)

введем столбец IsAlone

In [None]:
# IsAlone = 1 when the passenger's group has exactly one member, else 0.
# (The flag used to be set for PeopleNum > 1, i.e. the opposite of its name.)
trt = [train, test]
for t in trt:
    t["IsAlone"] = (t["PeopleNum"] == 1).astype(int)
sns.barplot(x="IsAlone", y="Transported", data=train, errorbar=None)
# Percentage of training rows with IsAlone == 1, then with IsAlone == 0.
alone_pct = train["IsAlone"].mean() * 100
print("%2.2f"%(alone_pct))
print("%2.2f"%(100 - alone_pct))

заменим True/False на 1/0

In [None]:
# Map True/False to 1/0 in VIP and CryoSleep, cast the Deck dummies to int,
# and encode the Transported target the same way.
bool_mapping = {True: 1, False: 0}
deck_dummies = ["Deck_B", "Deck_C", "Deck_D", "Deck_E", "Deck_F", "Deck_G", "Deck_T"]
for frame in trt:
    for flag in ("VIP", "CryoSleep"):
        frame[flag] = frame[flag].map(bool_mapping)
    frame[deck_dummies] = frame[deck_dummies].astype(int)
train["Transported"] = train["Transported"].map(bool_mapping).astype(int)
with pd.option_context('display.max_columns', None):
    print(train.head())

Распределение возрастов близко к нормальному, поэтому NaN заменяем случайными числами в пределах одного среднеквадратичного отклонения от среднего

In [None]:
# Age distribution (50 bins) and the number of missing ages in train.
sns.histplot(data=train, x="Age", bins=50)
train["Age"].isnull().sum()

In [None]:
# Fill missing ages with random integers within one standard deviation of the
# mean age of the same frame, then cast Age to int.
rng = np.random.default_rng(42)  # fixed seed so the imputation is reproducible
for t in trt:
    age_mean = t["Age"].mean()
    age_std = t["Age"].std()
    age_missing = t["Age"].isnull()
    if age_missing.any():
        # Truncate the float bounds explicitly instead of relying on the
        # sampler's implicit conversion; the upper bound is exclusive.
        low = int(age_mean - age_std)
        high = int(age_mean + age_std)
        t.loc[age_missing, "Age"] = rng.integers(low, high, size=age_missing.sum())
    t["Age"] = t["Age"].astype(int)

заполним HomePlanet и Destination модами и применим OHE

In [None]:
# Plot the HomePlanet / Destination frequencies, fill their missing values with
# fixed categories ("Earth" and "TRAPPIST-1e"), then one-hot encode both columns.
fig = plt.figure(figsize=(15, 8))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
train.HomePlanet.value_counts(normalize=True).plot(kind="bar", ax=ax1)
train.Destination.value_counts(normalize=True).plot(kind="bar", ax=ax2)
print(train.HomePlanet.isnull().sum())
print(train.Destination.isnull().sum())
ohe_columns = ["HomePlanet", "Destination"]
for t in (train, test):
    t.fillna({"HomePlanet": "Earth", "Destination": "TRAPPIST-1e"}, inplace=True)
# Category lists shared by both frames guarantee identical dummy columns in
# train and test even if a category is missing from one of them.
shared_dtypes = {
    col: pd.CategoricalDtype(sorted(set(train[col]) | set(test[col])))
    for col in ohe_columns
}
train = pd.get_dummies(train.astype(shared_dtypes), columns=ohe_columns, drop_first=True)
test = pd.get_dummies(test.astype(shared_dtypes), columns=ohe_columns, drop_first=True)
trt = [train, test]  # rebuild: get_dummies returned new frames
for t in trt:
    # Select the dummy columns by prefix instead of hard-coding their names.
    dummy_cols = [c for c in t.columns if c.startswith(("HomePlanet_", "Destination_"))]
    t[dummy_cols] = t[dummy_cols].astype(int)
with pd.option_context('display.max_columns', None):
    print(train.head())
    print(test.head())

визуализируем моды VIP и CryoSleep, затем заполним ими nan

In [None]:
# Plot CryoSleep / VIP frequencies, print their missing counts in train,
# then fill the gaps with 0 and cast both columns to int.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
flag_columns = ("CryoSleep", "VIP")

for axis, flag in zip(axes, flag_columns):
    train[flag].value_counts(normalize=True).plot(kind="bar", ax=axis)
for flag in flag_columns:
    print(train[flag].isnull().sum())

for frame in trt:
    frame.fillna({"CryoSleep": 0, "VIP": 0}, inplace=True)
    for flag in flag_columns:
        frame[flag] = frame[flag].astype(int)
train.head()

построим гистограммы частот значений финансовых признаков

In [None]:
# Fill missing spending values with 0 in both frames, then draw one 50-bin
# histogram per spending column of the training table.
fig, ax = plt.subplots(5, 1, figsize=(10, 10))
plt.subplots_adjust(top=2)
spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for frame in trt:
    frame.fillna(dict.fromkeys(spending_columns, 0), inplace=True)
for axis, column in zip(ax, spending_columns):
    sns.histplot(train[column], bins=50, ax=axis)

для этих признаков выведем матрицу корреляции

In [None]:
# Correlation heatmap of the spending columns together with the target.
corr_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Transported"]
corr_matrix = train.loc[:, corr_columns].corr()
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Cast every spending column to float in both frames.
fin_features = ["FoodCourt", "ShoppingMall", "RoomService", "Spa", "VRDeck"]
for frame in trt:
    for feature in fin_features:
        frame[feature] = frame[feature].astype(float)
train.head()

удалим Cabin и Name

In [None]:
# Remove the raw Cabin and Name columns from both frames and show the result.
for t in trt:
    # errors="ignore" keeps the cell re-runnable after the columns are gone.
    t.drop(columns=["Cabin", "Name"], inplace=True, errors="ignore")
# The stray mid-cell `train.head()` was removed: only the last expression of a
# cell is displayed, so it produced no output.
with pd.option_context('display.max_columns', None):
    print(train.head())
    print(test.head())

Построим матрицу корреляции для всех признаков

In [None]:
# Correlation heatmap over all features. PassengerId is a string identifier,
# not a feature: depending on the pandas version, passing it to corr() either
# raises, is silently dropped, or is coerced to a meaningless number, so it is
# excluded explicitly.
corr_matrix = train.drop(columns="PassengerId").corr()
plt.figure(figsize=(20, 8))
sns.heatmap(corr_matrix, annot=True)

Подготовим датасеты

In [None]:
# Separate the target from the features; PassengerId is excluded from both
# feature matrices.
y_train = train["Transported"]
x_train = train.drop(columns=["PassengerId", "Transported"])
x_test = test.drop(columns="PassengerId")

Обучим RandomForestClassifier выбрав для него лучшие гиперпараметры

In [None]:
# Tune a random forest with a 10-fold cross-validated grid search scored on
# accuracy, report the best setting, and predict the test set with the
# refitted best estimator.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)
print("\nЛучшие параметры:", grid_search.best_params_)
print("Лучшая средняя точность на кросс-валидации:", grid_search.best_score_)

best_model = grid_search.best_estimator_
# Despite its name, y_test holds the model's predictions for the test set.
y_test = best_model.predict(x_test)
train_accuracy = best_model.score(x_train, y_train)
print("Точность на тренировочной выборке:", train_accuracy)

Сохраним решение

In [None]:
# Assemble the submission table: passenger ids plus predictions cast to bool.
transported_flags = [bool(label) for label in y_test]
res = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": transported_flags,
})
res.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
res.to_csv('submission.csv', index=False)