In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [None]:
# Read the two CSV splits. The test frame is given a placeholder
# `Transported` value of False before the two are stacked.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test['Transported'] = False

# Stack train on top of test (row labels are not renumbered here) and
# discard the 'Name' and 'PassengerId' columns in the same expression.
df = pd.concat([df_train, df_test], sort=False).drop(columns=['Name', 'PassengerId'])
df.head()

In [None]:
# Sanity check: the stacked frame holds exactly one row per input row.
len(df) == len(df_train) + len(df_test)

In [None]:
# Number of missing entries in each column of the stacked frame.
df.isnull().sum()

In [None]:
# 'Cabin' is a '/'-separated string; break it into its three components,
# store each one in its own column, then discard the combined column.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'Num', 'Side']] = cabin_parts
df = df.drop(columns='Cabin')
df.head()

In [None]:
# Give rows with no cabin information an explicit placeholder in each of the
# three cabin-derived columns ('U' for Deck/Side, -1 for Num).
for column, placeholder in {'Deck': 'U', 'Num': -1, 'Side': 'U'}.items():
    df[column] = df[column].fillna(placeholder)

# Remaining missing-value counts per column.
df.isna().sum()

In [None]:
# Frequency of each distinct 'Side' value (including the 'U' placeholder).
df.loc[:, 'Side'].value_counts()

In [None]:
# Integer codes for the cabin deck letters and ship sides; 'U' is the
# placeholder written for rows that had no cabin value.
deck_codes = {'G': 0, 'F': 1, 'E': 2, 'B': 3, 'C': 4, 'D': 5, 'A': 6, 'U': 7, 'T': 8}
side_codes = {'P': 1, 'S': 2, 'U': -1}

# NOTE: this cell is not idempotent - running it a second time maps the
# already-encoded integers through the letter tables and produces NaN.
df['Deck'] = df['Deck'].map(deck_codes)
df['Side'] = df['Side'].map(side_codes)

In [None]:
# Columns handed to the KNN imputer; every other column is carried through
# unchanged and re-attached afterwards.
impute_list = ['Age', 'VIP', 'Num', 'CryoSleep', 'Side', 'Deck', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Preserve the frame's own column order. A set difference would yield an
# arbitrary order that can differ between kernel restarts, which makes the
# final feature layout (and anything sensitive to it) non-reproducible.
rest = [col for col in df.columns if col not in impute_list]
df_rest = df[rest]

# NOTE(review): the imputer is fitted on the stacked train + test rows.
imp = KNNImputer()
df_imputed = imp.fit_transform(df[impute_list])

# fit_transform returns a bare array, so restore the column names. The new
# frame has a fresh 0..n-1 index; df_rest is renumbered to line up with it.
df_imputed = pd.DataFrame(df_imputed, columns = impute_list)
df = pd.concat([df_rest.reset_index(drop = True), df_imputed], axis = 1)

In [None]:
# Per-column missing-value counts after the imputation step.
df.isnull().sum()

In [None]:
# The two remaining text columns: fill gaps with the 'U' placeholder, then
# append one indicator column per distinct value, prefixed with the source
# column's name. The original text columns are still present afterwards.
category_cols = ['HomePlanet', 'Destination']
df[category_cols] = df[category_cols].fillna('U')

indicator_frames = [pd.get_dummies(df[name], prefix=name) for name in category_cols]
df = pd.concat([df, *indicator_frames], axis=1)

In [None]:
# Per-column missing-value counts once the indicator columns are attached.
df.isnull().sum()

In [None]:
# The text columns listed in category_cols are removed from the frame.
df = df.drop(category_cols, axis=1)

# Feature Engineering

In [None]:
# Per-passenger summary of the five spending columns: row-wise total,
# standard deviation and mean.
bill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
spending = df[bill_cols]
df = df.assign(
    AmtSpent=spending.sum(axis=1),
    StdAmtSpent=spending.std(axis=1),
    MeanAmtSpent=spending.mean(axis=1),
)

In [None]:
# Correlation of every feature with the target, strongest positive first.
# Only the leading df_train.shape[0] rows are used: the rows stacked after
# them came from the test file and hold a placeholder `Transported` value,
# so including them would distort every coefficient.
df.iloc[:df_train.shape[0]].corr()['Transported'].sort_values(ascending = False)

In [None]:
# Two composite features, each the element-wise sum of three existing
# columns (added left to right).
df['3_high_cols'] = (
    df['CryoSleep']
    .add(df['HomePlanet_Europa'])
    .add(df['Destination_55 Cancri e'])
)
df['3_low_cols'] = (
    df['AmtSpent']
    .add(df['MeanAmtSpent'])
    .add(df['HomePlanet_Earth'])
)

In [None]:
# Undo the stacking: the first n_train rows are the training rows, the rest
# are the test rows. The placeholder target column is removed from the test part.
n_train = df_train.shape[0]
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:].drop(columns='Transported')
df_train.shape, df_test.shape

# Model Selection

In [None]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# One seed shared by the hold-out split and every estimator, so that the
# validation accuracies below are repeatable from run to run. (Previously
# only the split was seeded; the tree-based models were not.)
SEED = 42

# Feature matrix and target taken from the processed training rows.
X = df_train.drop(columns = 'Transported')
y = df_train['Transported']

# Hold out 20% of the labelled rows for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)

# Candidate classifiers, otherwise left at their library defaults.
model_1 = XGBClassifier(random_state = SEED)
model_2 = DecisionTreeClassifier(random_state = SEED)
model_3 = RandomForestClassifier(random_state = SEED)
model_4 = LogisticRegression(random_state = SEED)
model_5 = LGBMClassifier(random_state = SEED)

In [None]:
# model_1: fit on the training split, then score accuracy on the hold-out split.
pred = model_1.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# model_2: fit on the training split, then score accuracy on the hold-out split.
pred = model_2.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# model_3: fit on the training split, then score accuracy on the hold-out split.
pred = model_3.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# model_4: fit on the training split, then score accuracy on the hold-out split.
pred = model_4.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# model_5: fit on the training split, then score accuracy on the hold-out split.
pred = model_5.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Re-read the untouched test file; the processed frame no longer carries
# the 'PassengerId' column.
df_dummy = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Predict the target for the processed test rows with model_5.
pred = model_5.predict(df_test)

# Pair each prediction with its passenger id from the raw test file. The id
# column is spelled 'PassengerId', exactly as in the input data; it was
# previously written out as 'PassengerID', which does not match.
final = pd.DataFrame({
    'PassengerId': df_dummy['PassengerId'],
    'Transported': pred,
})

final.to_csv('submission.csv', index = False)