In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
#first, let's check for if there's any missing data
train.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age              True
VIP              True
RoomService      True
FoodCourt        True
ShoppingMall     True
Spa              True
VRDeck           True
Name             True
Transported     False
dtype: bool

In [2]:
#every column seems to be missing data except PassengerId and Transported
#let's make copies of test and train so we don't accidentally alter the original data during our cleaning process
train_copy = train.copy()
test_copy = test.copy()
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
#now that we've made copies to edit, let's start the cleaning process by finding common patterns first
#First, both children(under 13) and cryosleep have no bill. The exception to this is Maham, the first entry
#Let us remove the name column, as it is not really relevant 
train_copy.drop('Name',axis=1,inplace=True)
#now, we use cryosleep to deal with any missing costs
train_copy.RoomService = train_copy.RoomService.fillna(train_copy.RoomService.mean())
train_copy.loc[train_copy.CryoSleep == True, 'RoomService'] = 0

train_copy.FoodCourt = train_copy.FoodCourt.fillna(train_copy.FoodCourt.mean())
train_copy.loc[train_copy.CryoSleep == True, 'FoodCourt'] = 0

train_copy.ShoppingMall = train_copy.ShoppingMall.fillna(train_copy.ShoppingMall.mean())
train_copy.loc[train_copy.CryoSleep == True, 'ShoppingMall'] = 0

train_copy.Spa = train_copy.Spa.fillna(train_copy.Spa.mean())
train_copy.loc[train_copy.CryoSleep == True, 'Spa'] = 0

train_copy.VRDeck = train_copy.VRDeck.fillna(train_copy.VRDeck.mean())
train_copy.loc[train_copy.CryoSleep == True, 'VRDeck'] = 0

train_copy.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep        True
Cabin            True
Destination      True
Age              True
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
dtype: bool

In [4]:
#Now, we shall deal with cryosleep missing values, to do so, we shall make a temp column and a sum of expenses
train_copy['tempcryo'] = 0
train_copy['tempexpenses'] = train_copy[['RoomService', 'FoodCourt',
                                           'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
#if expenses is 0 that means they are probably in cryosleep
train_copy.loc[train_copy['tempexpenses'] == 0, 'tempcryo'] = 1
#now we add all cases where they were in cryosleep from the original column and when they were not
train_copy.loc[train_copy.CryoSleep.astype('str') == 'True', 'tempcryo'] = 1
train_copy.loc[train_copy.CryoSleep.astype('str') == 'False', 'tempcryo'] = 0
#we will convert this to boolean for easier use and replace the original cryosleep table
train_copy['CryoSleep'] = train_copy['tempcryo']
train_copy.drop('tempcryo',axis=1,inplace=True)
train_copy['CryoSleep'] = train_copy['CryoSleep'].astype('bool')
train_copy.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep       False
Cabin            True
Destination      True
Age              True
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
tempexpenses    False
dtype: bool

In [5]:
#now we shall deal with age, since we realistically can't guess ages, we shall just use median instead for null values
train_copy.Age = train_copy.Age.fillna(train_copy.Age.median())
train_copy.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep       False
Cabin            True
Destination      True
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
tempexpenses    False
dtype: bool

In [6]:
#To deal with cabin, we shall use the next value to fill any missing values
train_copy['Cabin'] = train_copy.Cabin.fillna(method='backfill')
train_copy.isna().any()

PassengerId     False
HomePlanet       True
CryoSleep       False
Cabin           False
Destination      True
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
tempexpenses    False
dtype: bool

In [7]:
#Then, we deal with the remaining(Homeplanet and Destination) we shall fill in via mode due to simplicity
train_copy.HomePlanet.mode()

0    Earth
Name: HomePlanet, dtype: object

In [8]:
train_copy.Destination.mode()

0    TRAPPIST-1e
Name: Destination, dtype: object

In [9]:
train_copy.HomePlanet = train_copy.HomePlanet.fillna('Earth')
train_copy.Destination = train_copy.Destination.fillna('TRAPPIST-1e')
train_copy.isna().any()

PassengerId     False
HomePlanet      False
CryoSleep       False
Cabin           False
Destination     False
Age             False
VIP              True
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
tempexpenses    False
dtype: bool

In [10]:
#Finally, we deal with VIP. Any value missing shall be considered false to smoothen the process.
train_copy.VIP = train_copy.VIP.fillna('False')
train_copy.VIP = train_copy.VIP.astype('bool')
train_copy.isna().any()

PassengerId     False
HomePlanet      False
CryoSleep       False
Cabin           False
Destination     False
Age             False
VIP             False
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
Transported     False
tempexpenses    False
dtype: bool

In [11]:
#Now that we have removed all missing values from train, we shall repeat the exact same process on test
test_copy.drop('Name',axis=1,inplace=True)
test_copy.RoomService = test_copy.RoomService.fillna(test_copy.RoomService.mean())
test_copy.loc[test_copy.CryoSleep == True, 'RoomService'] = 0
test_copy.FoodCourt = test_copy.FoodCourt.fillna(train_copy.FoodCourt.mean())
test_copy.loc[test_copy.CryoSleep == True, 'FoodCourt'] = 0
test_copy.ShoppingMall = test_copy.ShoppingMall.fillna(test_copy.ShoppingMall.mean())
test_copy.loc[test_copy.CryoSleep == True, 'ShoppingMall'] = 0
test_copy.Spa = test_copy.Spa.fillna(test_copy.Spa.mean())
test_copy.loc[test_copy.CryoSleep == True, 'Spa'] = 0
test_copy.VRDeck = test_copy.VRDeck.fillna(test_copy.VRDeck.mean())
test_copy.loc[test_copy.CryoSleep == True, 'VRDeck'] = 0
test_copy['tempcryo'] = 0
test_copy['tempexpenses'] = test_copy[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
test_copy.loc[test_copy['tempexpenses'] == 0, 'tempcryo'] = 1
test_copy.loc[test_copy.CryoSleep.astype('str') == 'True', 'tempcryo'] = 1
test_copy.loc[test_copy.CryoSleep.astype('str') == 'False', 'tempcryo'] = 0
test_copy['CryoSleep'] = test_copy['tempcryo']
test_copy.drop('tempcryo',axis=1,inplace=True)
test_copy['CryoSleep'] = test_copy['CryoSleep'].astype('bool')
test_copy.Age = test_copy.Age.fillna(train_copy.Age.median())
test_copy['Cabin'] = train_copy.Cabin.fillna(method='backfill')

In [12]:
test_copy.HomePlanet.mode()

0    Earth
Name: HomePlanet, dtype: object

In [13]:
test_copy.Destination.mode()

0    TRAPPIST-1e
Name: Destination, dtype: object

In [14]:
test_copy.HomePlanet = test_copy.HomePlanet.fillna('Earth')
test_copy.Destination = test_copy.Destination.fillna('TRAPPIST-1e')

In [15]:
test_copy.VIP = test_copy.VIP.fillna('False')
test_copy.VIP = test_copy.VIP.astype('bool')
test_copy.isna().any()

PassengerId     False
HomePlanet      False
CryoSleep       False
Cabin           False
Destination     False
Age             False
VIP             False
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
tempexpenses    False
dtype: bool

In [16]:
#Engineering the features Deck and Side to deal with the vast variance in Cabin values that causes getdummies issues 
train_copy['Deck'] = train_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[0])
train_copy['Side'] = train_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[2])
test_copy['Deck'] = test_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[0])
test_copy['Side'] = test_copy.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[2])

In [17]:
#Now that we have removed all missing values from both, let us build the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
            'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side']
X = pd.get_dummies(train_copy[features])
y = train_copy['Transported']

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X.values,y.values,random_state=1) 

In [19]:
#xgboost is great at handling data with lots of missing values, so it will be our pick of choice
import xgboost as xgb
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X,y)
xgb.score(X,y)

0.888185896698493

In [20]:
y_pred = xgb.predict(pd.get_dummies((test_copy[features])))

In [21]:
#Create the data frame containing our PassengerId and Transported predicted results side by side
result = pd.DataFrame({'PassengerId':test_copy.PassengerId, 'Transported': y_pred.astype('bool')})
result

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [22]:
#Save result to csv format
result.to_csv('submission.csv',index=False)