In [207]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn import preprocessing    
le = preprocessing.LabelEncoder()

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [208]:
# Load the labelled training set and the unlabelled test set from the
# relative Kaggle-style input directory.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df  = pd.read_csv('../input/spaceship-titanic/test.csv')

In [209]:
# Preview the first five rows of the training frame.
train_df.head(5)

In [210]:
# (rows, columns) of the training frame.
train_df.shape

In [211]:
# Column dtypes and non-null counts, to spot columns with missing values.
train_df.info()

In [212]:
# Summary statistics of the training frame's columns.
train_df.describe()

In [213]:
# Number of distinct values per column (nunique aggregates over rows by default).
n = train_df.nunique()
print("No.of.unique values in each column :\n", n)

In [214]:
# Stacked Age histogram (20 bins) split by Transported, with a KDE overlay.
sns.histplot(
    data=train_df,
    x='Age',
    hue='Transported',
    bins=20,
    multiple="stack",
    kde=True,
)

In [215]:
# Bar plot of Age per HomePlanet, with one bar colour per Destination.
sns.catplot(
    data=train_df,
    x="HomePlanet",
    y="Age",
    hue="Destination",
    kind="bar",
)

In [216]:
# Passenger counts per HomePlanet, split by Transported.
sns.countplot(
    data=train_df,
    x="HomePlanet",
    hue="Transported",
)

In [217]:
# Class balance of the Transported target.
sns.countplot(
    data=train_df,
    x="Transported",
)

In [218]:
# Passenger counts per Destination, split by Transported.
sns.countplot(
    data=train_df,
    x="Destination",
    hue="Transported",
)

In [219]:
# Stacked HomePlanet histogram coloured by Destination (KDE requested as well).
sns.histplot(
    data=train_df,
    x="HomePlanet",
    hue="Destination",
    multiple="stack",
    kde=True,
)

In [220]:
# Passenger counts per CryoSleep value, split by Transported.
sns.countplot(
    data=train_df,
    x="CryoSleep",
    hue="Transported",
)

In [221]:
# Passenger counts per VIP value, split by Transported.
sns.countplot(
    data=train_df,
    x="VIP",
    hue="Transported",
)

In [222]:
# Overall Age distribution in 5-unit-wide bins, with a KDE overlay.
sns.histplot(
    x="Age",
    data=train_df,
    binwidth=5,
    multiple="stack",
    kde=True,
)

In [223]:
# Age against RoomService spend, coloured by Transported.
sns.scatterplot(
    x='Age',
    y='RoomService',
    hue="Transported",
    legend="full",
    data=train_df,
)

In [224]:
# Age against Spa spend, coloured by Transported.
sns.scatterplot(
    x='Age',
    y='Spa',
    hue="Transported",
    legend="full",
    data=train_df,
)

In [225]:
# Age against VRDeck spend, coloured by Transported.
sns.scatterplot(
    x='Age',
    y='VRDeck',
    hue="Transported",
    legend="full",
    data=train_df,
)

In [226]:
# Box plot of FoodCourt spend, used to eyeball extreme values.
sns.boxplot(
    data=train_df,
    x='FoodCourt',
)

In [227]:
# Overwrite FoodCourt values above 20000 with the mean of the values below 20000
# (training frame only).
foodcourt_limit = 20000
foodcourt_typical_mean = train_df.loc[train_df['FoodCourt'] < foodcourt_limit, 'FoodCourt'].mean()
train_df.loc[train_df['FoodCourt'] > foodcourt_limit, 'FoodCourt'] = foodcourt_typical_mean

In [228]:
# Box plot of ShoppingMall spend for each Transported value.
sns.boxplot(
    data=train_df,
    x='Transported',
    y='ShoppingMall',
)

In [229]:
# Overwrite ShoppingMall values above 10000 with the mean of the values below 10000
# (training frame only).
shoppingmall_limit = 10000
shoppingmall_typical_mean = train_df.loc[train_df['ShoppingMall'] < shoppingmall_limit, 'ShoppingMall'].mean()
train_df.loc[train_df['ShoppingMall'] > shoppingmall_limit, 'ShoppingMall'] = shoppingmall_typical_mean

In [230]:
# Box plot of Spa spend, used to eyeball extreme values.
sns.boxplot(
    data=train_df,
    x='Spa',
)

In [231]:
# Overwrite Spa values above 20000 with the mean of the values below 20000
# (training frame only).
spa_limit = 20000
spa_typical_mean = train_df.loc[train_df['Spa'] < spa_limit, 'Spa'].mean()
train_df.loc[train_df['Spa'] > spa_limit, 'Spa'] = spa_typical_mean

In [232]:
# Box plot of VRDeck spend, used to eyeball extreme values.
sns.boxplot(
    data=train_df,
    x='VRDeck',
)

In [233]:
# Overwrite VRDeck values above 20000 with the mean of the values below 20000
# (training frame only).
vrdeck_limit = 20000
vrdeck_typical_mean = train_df.loc[train_df['VRDeck'] < vrdeck_limit, 'VRDeck'].mean()
train_df.loc[train_df['VRDeck'] > vrdeck_limit, 'VRDeck'] = vrdeck_typical_mean

In [234]:
# Box plot of RoomService spend, used to eyeball extreme values.
sns.boxplot(
    data=train_df,
    x='RoomService',
)

In [235]:
# Overwrite RoomService values above 10000 with the mean of the values below 10000
# (training frame only).
roomservice_limit = 10000
roomservice_typical_mean = train_df.loc[train_df['RoomService'] < roomservice_limit, 'RoomService'].mean()
train_df.loc[train_df['RoomService'] > roomservice_limit, 'RoomService'] = roomservice_typical_mean

In [236]:
# Fill missing HomePlanet values with 'Earth' in both frames.
# NOTE(review): 'Earth' is presumably the most frequent value — confirm against the count plot.
for frame in (train_df, test_df):
    frame['HomePlanet'] = frame['HomePlanet'].fillna('Earth')

In [237]:
# Fill missing Destination values with 'TRAPPIST-1e' in both frames.
# NOTE(review): presumably the most frequent destination — confirm against the count plot.
for frame in (train_df, test_df):
    frame['Destination'] = frame['Destination'].fillna('TRAPPIST-1e')

In [238]:
# Shorten the destination labels in both frames.
# regex=False forces a literal substring match on every pandas version; without
# it, releases whose default is regex=True treat the "." in "PSO J318.5-22" as
# a wildcard.
DESTINATION_SHORT_NAMES = {
    "TRAPPIST-1e": "TRAPPIST",
    "55 Cancri e": "Cancri",
    "PSO J318.5-22": "PSO",
}
for frame in (train_df, test_df):
    for long_name, short_name in DESTINATION_SHORT_NAMES.items():
        frame['Destination'] = frame['Destination'].str.replace(long_name, short_name, regex=False)

In [239]:
# Treat a missing CryoSleep flag as False in both frames.
for frame in (train_df, test_df):
    frame['CryoSleep'] = frame['CryoSleep'].fillna(False)

In [240]:
# Treat a missing VIP flag as False in both frames.
for frame in (train_df, test_df):
    frame['VIP'] = frame['VIP'].fillna(False)

In [241]:
# Mean passenger age in the training frame (missing values are skipped by mean()).
train_df['Age'].mean()

In [242]:
# Fill missing ages with the constant 29 in both frames.
# NOTE(review): 29 is presumably the rounded training mean displayed by the previous cell — confirm.
for frame in (train_df, test_df):
    frame['Age'] = frame['Age'].fillna(29)

In [243]:
# Impute missing Spa spend with the TRAINING mean in both frames, so the test
# set is preprocessed with the same statistic the model is trained on
# (the test frame was previously filled with its own mean).
spa_train_mean = train_df['Spa'].mean()
train_df['Spa'] = train_df['Spa'].fillna(spa_train_mean)
test_df['Spa'] = test_df['Spa'].fillna(spa_train_mean)

In [244]:
# Impute missing FoodCourt spend with the TRAINING mean in both frames, so the
# test set is preprocessed with the same statistic the model is trained on
# (the test frame was previously filled with its own mean).
foodcourt_train_mean = train_df['FoodCourt'].mean()
train_df['FoodCourt'] = train_df['FoodCourt'].fillna(foodcourt_train_mean)
test_df['FoodCourt'] = test_df['FoodCourt'].fillna(foodcourt_train_mean)

In [245]:
# Impute missing ShoppingMall spend with the TRAINING mean in both frames, so
# the test set is preprocessed with the same statistic the model is trained on
# (the test frame was previously filled with its own mean).
shoppingmall_train_mean = train_df['ShoppingMall'].mean()
train_df['ShoppingMall'] = train_df['ShoppingMall'].fillna(shoppingmall_train_mean)
test_df['ShoppingMall'] = test_df['ShoppingMall'].fillna(shoppingmall_train_mean)

In [246]:
# Impute missing RoomService spend with the TRAINING mean in both frames, so
# the test set is preprocessed with the same statistic the model is trained on
# (the test frame was previously filled with its own mean).
roomservice_train_mean = train_df['RoomService'].mean()
train_df['RoomService'] = train_df['RoomService'].fillna(roomservice_train_mean)
test_df['RoomService'] = test_df['RoomService'].fillna(roomservice_train_mean)

In [247]:
# Impute missing VRDeck spend with the TRAINING mean in both frames, so the
# test set is preprocessed with the same statistic the model is trained on
# (the test frame was previously filled with its own mean).
vrdeck_train_mean = train_df['VRDeck'].mean()
train_df['VRDeck'] = train_df['VRDeck'].fillna(vrdeck_train_mean)
test_df['VRDeck'] = test_df['VRDeck'].fillna(vrdeck_train_mean)

In [248]:
def converting_column(df, col, convertType):
    """Encode one categorical column and return the resulting frame.

    The input frame is never modified; a new DataFrame is returned in both modes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Name of the column to encode.
    convertType : str
        ``'OHE'`` replaces ``col`` with one indicator column per distinct
        value, named ``f"{col}_{value}"`` and appended after the remaining
        columns. Any other value label-encodes ``col`` with the module-level
        ``LabelEncoder`` instance ``le``, keeping the column name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``col`` encoded.

    Notes
    -----
    The encoding is derived from the values present in ``df`` on each call
    (the encoder is re-fitted every time), so two frames encoded separately
    only agree when they contain the same set of values.
    """
    if convertType == 'OHE':
        indicator_columns = pd.get_dummies(df[col], prefix=col)
        return df.drop(columns=col).join(indicator_columns)

    # Work on a copy so the label-encoding branch, like the OHE branch,
    # leaves the caller's frame untouched.
    encoded = df.copy()
    encoded[col] = le.fit_transform(encoded[col])
    return encoded

In [249]:
# One-hot encode HomePlanet in the training frame, then in the test frame.
train_df, test_df = (
    converting_column(frame, 'HomePlanet', 'OHE')
    for frame in (train_df, test_df)
)

In [250]:
# Label-encode Destination; any convertType other than 'OHE' selects the
# label-encoding branch of converting_column.
train_df = converting_column(df=train_df, col='Destination', convertType='FTR')
test_df = converting_column(df=test_df, col='Destination', convertType='FTR')

In [251]:
# Label-encode the CryoSleep flag in the training frame, then in the test frame.
train_df, test_df = (
    converting_column(frame, 'CryoSleep', 'FTR')
    for frame in (train_df, test_df)
)

In [252]:
# Label-encode the VIP flag in both frames.
train_df = converting_column(df=train_df, col='VIP', convertType='FTR')
test_df = converting_column(df=test_df, col='VIP', convertType='FTR')

In [253]:
def CabinExtract(df):
    """Split ``Cabin`` into encoded ``Deck``, numeric ``Num`` and encoded ``Side``.

    ``Cabin`` values are split on ``'/'`` into three parts. Missing parts are
    filled with ``'G'`` (Deck), the mean of the parsed numbers (Num) and
    ``'S'`` (Side). Deck and Side are then label-encoded with the module-level
    ``LabelEncoder`` instance ``le``, which is re-fitted on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Cabin`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Deck``, ``Num`` and ``Side`` columns added, in
        that order. ``Cabin`` itself is kept.
    """
    result = df.copy()

    # n=2 caps the split at three parts and reindex pads up to three columns,
    # so the assignments below never fail on an unexpected column count.
    parts = result['Cabin'].str.split('/', n=2, expand=True).reindex(columns=range(3))
    cabin_number = parts[1].astype(float)

    result['Deck'] = parts[0].fillna('G')
    result['Num'] = cabin_number.fillna(cabin_number.mean())
    result['Side'] = parts[2].fillna('S')

    result['Deck'] = le.fit_transform(result['Deck'])
    result['Side'] = le.fit_transform(result['Side'])

    return result

In [254]:
# Derive the Deck / Num / Side features from Cabin for the training frame,
# then for the test frame.
train_df, test_df = (CabinExtract(frame) for frame in (train_df, test_df))

In [255]:
def findFamilyId(df):
    """Derive travel-group features from ``PassengerId``.

    ``PassengerId`` is split on ``'_'`` into a group id and a number within
    the group, both converted to ``int``. ``Passenger_Cnt`` holds, for every
    row, the largest within-group number found in that row's group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``PassengerId`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Passenger_Group_Id``, ``Passenger_Num`` and
        ``Passenger_Cnt`` columns added, in that order.
    """
    result = df.copy()
    id_parts = result['PassengerId'].str.split('_', expand=True)
    result['Passenger_Group_Id'] = id_parts[0].astype(int)
    result['Passenger_Num'] = id_parts[1].astype(int)
    # The highest number in a group is used as that group's size.
    result['Passenger_Cnt'] = (
        result.groupby('Passenger_Group_Id')['Passenger_Num'].transform('max')
    )
    return result

In [256]:
# Add the PassengerId-derived group features to both frames.
train_df = findFamilyId(df=train_df)
test_df = findFamilyId(df=test_df)

In [257]:
# Cabin has been expanded into Deck/Num/Side and Name is not used as a feature.
raw_text_columns = ['Cabin', 'Name']
train_df = train_df.drop(columns=raw_text_columns)
test_df = test_df.drop(columns=raw_text_columns)

In [258]:
# Drop PassengerId from the training frame only; the test frame keeps it
# because the submission is keyed on it.
train_df = train_df.drop(columns=['PassengerId'])

In [259]:
# Remove the columns that are not fed to the model, identically in both frames.
unused_features = ['Passenger_Num', 'Passenger_Group_Id', 'VIP', 'Deck', 'HomePlanet_Mars']
train_df = train_df.drop(columns=unused_features)
test_df = test_df.drop(columns=unused_features)

In [260]:
# Binary flag: 1 when Age is below 15, otherwise 0. The raw Age column is
# dropped once the flag exists.
train_df['Under15'] = (train_df['Age'] < 15).astype('int64')
test_df['Under15'] = (test_df['Age'] < 15).astype('int64')
train_df = train_df.drop(columns=['Age'])
test_df = test_df.drop(columns=['Age'])

In [261]:
# Preview the engineered training frame before modelling.
train_df.head(5)

In [262]:
# Separate the features from the Transported target.
X = train_df.drop(columns="Transported")
y = train_df["Transported"]

# Hold out 20% of the labelled rows for validation; random_state pins the shuffle.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [263]:
# Test features are every column except the PassengerId key.
X_test = test_df.drop(columns="PassengerId").copy()

# Shapes of all splits, displayed as one tuple.
(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

In [264]:
# Disabled hyper-parameter search, kept for reference: a 3-fold GridSearchCV
# over n_estimators, max_depth, learning_rate and colsample_bytree.
# NOTE(review): the model cell that follows also sets subsample=0.6, which this
# grid does not search — confirm where that value came from.
# params = {
# 'n_estimators': range(1, 10),
# 'max_depth': range(3, 9),
# 'learning_rate': [.2, .3, .4, .5, .6],
# 'colsample_bytree': [.7, .8, .9, 1]
# }
# xgb = XGBClassifier()
# g_search = GridSearchCV(estimator = xgb, param_grid = params,
# cv = 3, n_jobs = -1, verbose = 0, return_train_score=True)
# g_search.fit(X_train, y_train)
# print(g_search.best_params_)

In [265]:
# Gradient-boosted tree classifier with hand-picked hyper-parameters.
# eval_metric is passed to the constructor: giving it to fit() is deprecated in
# newer XGBoost releases and rejected by the most recent ones.
xgb_model = XGBClassifier(
    colsample_bytree=0.8,
    n_estimators=7,
    learning_rate=0.5,
    subsample=0.6,
    min_child_weight=1,
    max_depth=4,
    objective='binary:logistic',
    eval_metric='logloss',
)
# `mod` is kept because a later cell predicts on the test set through it.
mod = xgb_model.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on (an optimistic estimate).
print("Performance on train data:", mod.score(X_train, y_train))

# Predictions for the held-out validation rows.
y_pred_v = xgb_model.predict(X_valid)

cm = confusion_matrix(y_valid, y_pred_v)
print ("Confusion Matrix : \n", cm)

In [266]:
# Display the validation-set predictions.
y_pred_v

In [267]:
# Predict the target for the unlabelled test features with the fitted model.
y_pred_x=mod.predict(X_test)


In [268]:
# Pair each test PassengerId with its predicted Transported value.
PassengerId = test_df['PassengerId']
# Cast to bool so the column is written as True/False like the training
# target, even when predict() yields 0/1 integers.
# NOTE(review): newer XGBoost releases appear to return integer class indices — confirm for the installed version.
submission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Transported': np.asarray(y_pred_x).astype(bool),
})

In [269]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv',index=False)