This worksheet will do a detailed imputation exercise to see how much it can change the score relative to using the mode for all missing values. 

# Data Loading

In [1]:
import pandas as pd
import numpy as np

from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


Below I wanted to see the characteristics of both the continuous and the categorical data. From the competition description RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck are the amounts the passenger has billed at each of the Spaceship Titanic's many luxury amenities. Age has much smaller values than the other continuous values and I may want to look at evening these out in the future. 

In the categorical data there are a small number of categories for HomePlanet, CryoSleep, Destination and VIP. 

In [5]:
train.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,8514.0,29.0,14.0,0.0,19.0,27.0,38.0,79.0
RoomService,8512.0,225.0,667.0,0.0,0.0,0.0,47.0,14327.0
FoodCourt,8510.0,458.0,1611.0,0.0,0.0,0.0,76.0,29813.0
ShoppingMall,8485.0,174.0,605.0,0.0,0.0,0.0,27.0,23492.0
Spa,8510.0,311.0,1137.0,0.0,0.0,0.0,59.0,22408.0
VRDeck,8505.0,305.0,1146.0,0.0,0.0,0.0,46.0,24133.0


In [6]:
train.describe(include=object).round().T

Unnamed: 0,count,unique,top,freq
PassengerId,8693,8693,0001_01,1
HomePlanet,8492,3,Earth,4602
CryoSleep,8476,2,False,5439
Cabin,8494,6560,G/734/S,8
Destination,8511,3,TRAPPIST-1e,5915
VIP,8490,2,False,8291
Name,8493,8473,Gollux Reedall,2


I want to add new columns for the group and size subcomponents in PassengerId, two new columns to separate out the deck and the side of the Cabin, a new columnn for last name, and a new column to sum up all the spending for the RoomService, FoodCourt, ShoppingMall, Spa and VRDeck columns.

In [7]:
train['Group'] = train['PassengerId'].str[0:4]
train['GroupSize'] = train.groupby('Group')['Group'].transform(len)
train[['Deck', 'Number', 'Side']] = train['Cabin'].str.split('/', expand=True)
splitted = train['Name'].str.split()
train['LastName'] = splitted.str[-1]
train['Spend'] = train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']

In [8]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,Group,GroupSize,Deck,Number,Side,LastName,Spend
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,Maham Ofracculy,False,1,1,B,0,P,Ofracculy,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,44.0,Juanna Vines,True,2,1,F,0,S,Vines,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,49.0,Altark Susent,False,3,2,A,0,S,Susent,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,193.0,Solam Susent,False,3,2,A,0,S,Susent,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,2.0,Willy Santantines,True,4,1,F,1,S,Santantines,1091.0


I also want to do the same things on the test dataset

In [9]:
test['Group'] = test['PassengerId'].str[0:4]
test['GroupSize'] = test.groupby('Group')['Group'].transform(len)
test[['Deck', 'Number', 'Side']] = test['Cabin'].str.split('/', expand=True)
splitted = test['Name'].str.split()
test['LastName'] = splitted.str[-1]
test['Spend'] = test['RoomService'] + test['FoodCourt'] + test['ShoppingMall'] + test['Spa'] + test['VRDeck']

In [10]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,GroupSize,Deck,Number,Side,LastName,Spend
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,13,1,G,3,S,Carsoning,0.0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,18,1,F,4,S,Peckers,2832.0
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,19,1,C,0,S,Unhearfus,0.0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,21,1,C,1,S,Caltilter,7418.0
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,23,1,F,5,S,Harperez,645.0


# Data Imputation

We can see that there are about 200 each of missing data for all the parameters except PassengerID and Transported. 

In [11]:
train.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Group             0
GroupSize         0
Deck            199
Number          199
Side            199
LastName        200
Spend           908
dtype: int64

In [12]:
test.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
Group             0
GroupSize         0
Deck            100
Number          100
Side            100
LastName         94
Spend           455
dtype: int64

First I wanted to run the value counts of all the features, then I'd look for relationships to other features for guidance in fillin in NaN values. 

In [13]:
for col in train.columns:
    col_counts = train[col].value_counts()
    print(f'LENGTH: {len(col_counts)}\n',f'{col_counts}\n')

LENGTH: 8693
 0001_01    1
6136_01    1
6141_01    1
6139_06    1
6139_05    1
          ..
3126_01    1
3124_03    1
3124_02    1
3124_01    1
9280_02    1
Name: PassengerId, Length: 8693, dtype: int64

LENGTH: 3
 Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

LENGTH: 2
 False    5439
True     3037
Name: CryoSleep, dtype: int64

LENGTH: 6560
 G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64

LENGTH: 3
 TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

LENGTH: 80
 24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: Age, Length: 80, dtype: int64

LENGTH: 2
 False    8291
True      199
Name: VIP, dtype: int64

LENGTH: 1273
 0.0       5577
1.0        117
2.0         79
3.0         61


I'm also doing this for the test dataframe values to make sure the features have the same basic behavior as the train dataframe.  

In [14]:
for col in test.columns:
    col_counts = test[col].value_counts()
    print(f'LENGTH: {len(col_counts)}\n',f'{col_counts}\n')

LENGTH: 4277
 0013_01    1
6288_02    1
6269_01    1
6269_02    1
6269_03    1
          ..
3072_01    1
3074_01    1
3075_01    1
3077_01    1
9277_01    1
Name: PassengerId, Length: 4277, dtype: int64

LENGTH: 3
 Earth     2263
Europa    1002
Mars       925
Name: HomePlanet, dtype: int64

LENGTH: 2
 False    2640
True     1544
Name: CryoSleep, dtype: int64

LENGTH: 3265
 G/160/P     8
D/273/S     7
B/31/P      7
G/748/S     7
E/228/S     7
           ..
E/236/S     1
F/650/S     1
G/566/P     1
G/557/S     1
G/1498/S    1
Name: Cabin, Length: 3265, dtype: int64

LENGTH: 3
 TRAPPIST-1e      2956
55 Cancri e       841
PSO J318.5-22     388
Name: Destination, dtype: int64

LENGTH: 79
 18.0    176
22.0    163
19.0    162
20.0    160
24.0    158
       ... 
70.0      2
79.0      2
68.0      2
78.0      1
77.0      1
Name: Age, Length: 79, dtype: int64

LENGTH: 2
 False    4110
True       74
Name: VIP, dtype: int64

LENGTH: 842
 0.0       2726
1.0         68
2.0         34
3.0         28
4

There are some clear patterns for HomePlanet by Deck, with two exceptions. E is about 45% for Earth, 35% Mars and 20% for Europa. D is about 60/40 for Mars vs Earth. F 60/40 for Earth vs Mars. Both E and F deck are mostly going to Trappist-1e. D is mostly going to Trappist-1e, but the ratio isn't as high. 

So all NaNs with Decks A, B, C, T will go to Europa. All Decks with G will go to Earth. Any remaining NaNs will utilize the mode to fill. 

In [15]:
print(train.groupby(['HomePlanet','Deck'], dropna=False).size())

HomePlanet  Deck
Earth       E        395
            F       1614
            G       2498
            NaN       95
Europa      A        252
            B        766
            C        734
            D        186
            E        128
            T          4
            NaN       61
Mars        D        282
            E        330
            F       1110
            NaN       37
NaN         A          4
            B         13
            C         13
            D         10
            E         23
            F         70
            G         61
            T          1
            NaN        6
dtype: int64


In [16]:
missing_mask = train['HomePlanet'].isna()
mapping_dict = dict({'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa', 'G': 'Earth'})
train.loc[missing_mask, 'HomePlanet'] = train.loc[missing_mask, 'Deck'].map(mapping_dict)

The above mapping exercise has just cleared out the NaNs in the decks in the mapping dictionary. The rest will be taken care of when I do a blanket mode fill for all the remaining open values for all the features. 

In [17]:
print(train.groupby(['HomePlanet','Deck'], dropna=False).size())

HomePlanet  Deck
Earth       E        395
            F       1614
            G       2559
            NaN       95
Europa      A        256
            B        779
            C        747
            D        186
            E        128
            T          5
            NaN       61
Mars        D        282
            E        330
            F       1110
            NaN       37
NaN         D         10
            E         23
            F         70
            NaN        6
dtype: int64


I need to do the same for the test dataframe. 

In [18]:
missing_mask = test['HomePlanet'].isna()
mapping_dict = dict({'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa', 'G': 'Earth'})
test.loc[missing_mask, 'HomePlanet'] = test.loc[missing_mask, 'Deck'].map(mapping_dict)

In [19]:
print(test.groupby(['HomePlanet','Deck'], dropna=False).size())

HomePlanet  Deck
Earth       E        188
            F        812
            G       1222
            NaN       61
Europa      A         98
            B        362
            C        355
            D        110
            E         69
            T          6
            NaN       18
Mars        D        124
            E        178
            F        603
            NaN       20
NaN         D          8
            E         12
            F         30
            NaN        1
dtype: int64


For CryoSleep I couldn't find any patterns when I matched against other groups and it's almost 2:1 for False, so I'll just do the mode. 

In [20]:
print(train.groupby(['CryoSleep'], dropna=False).size())

CryoSleep
False    5439
True     3037
NaN       217
dtype: int64


Destination is mostly TRAPPIST-1e and there aren't any patterns I can find with other features. So I will use mode to fill. 

In [21]:
print(train.groupby(['Destination'], dropna=False).size())

Destination
55 Cancri e      1800
PSO J318.5-22     796
TRAPPIST-1e      5915
NaN               182
dtype: int64


Age is interesting. There are 178 datapoints at age 0 and 179 with NaNs. Does that mean there are 179 infants or are these also some kind of data errors? For simplicity, I'll assume that is correct. From the value counts above, the largest age groups are between 18 and 24 years old (about 20% of the total values). So there's a wide dispersion in age. The difference between the mean and median is not much, so I'll use the mean for imputation. 

In [22]:
print(train.groupby(['Age'], dropna=False).size())

Age
0.0     178
1.0      67
2.0      75
3.0      75
4.0      71
       ... 
76.0      2
77.0      2
78.0      3
79.0      3
NaN     179
Length: 81, dtype: int64


In [23]:
train['Age'].mean()

28.82793046746535

In [24]:
train['Age'].median()

27.0

In [25]:
train['Age'].fillna(train['Age'].mean(), inplace=True)

In [26]:
print(train.groupby(['Age'], dropna=False).size())

Age
0.0     178
1.0      67
2.0      75
3.0      75
4.0      71
       ... 
75.0      4
76.0      2
77.0      2
78.0      3
79.0      3
Length: 81, dtype: int64


I also need to do this for the test dataframe. 

In [27]:
test['Age'].fillna(test['Age'].mean(), inplace=True)

In [28]:
print(test.groupby(['Age'], dropna=False).size())

Age
0.0     82
1.0     27
2.0     35
3.0     34
4.0     20
        ..
74.0     2
75.0     2
77.0     1
78.0     1
79.0     2
Length: 80, dtype: int64


VIP is mostly false, so the mode can be used. 

In [29]:
print(train.groupby(['VIP'], dropna=False).size())

VIP
False    8291
True      199
NaN       203
dtype: int64


For all the spend categories below, the vast majority of the values are 0, so I can use the mode here. As a check for RoomService, the median is also 0. There seems to be a long tail of high end spending at the end, which is why the median jumps up to 225. 

In [30]:
print(train.groupby(['RoomService'], dropna=False).size())

RoomService
0.0        5577
1.0         117
2.0          79
3.0          61
4.0          47
           ... 
8243.0        1
8586.0        1
9920.0        1
14327.0       1
NaN         181
Length: 1274, dtype: int64


In [31]:
train['RoomService'].mean()

224.687617481203

In [32]:
train['RoomService'].median()

0.0

In [33]:
train['RoomService'].mode()

0    0.0
Name: RoomService, dtype: float64

In [34]:
print(train.groupby(['FoodCourt'], dropna=False).size())

FoodCourt
0.0        5456
1.0         116
2.0          75
3.0          53
4.0          53
           ... 
26830.0       1
27071.0       1
27723.0       1
29813.0       1
NaN         183
Length: 1508, dtype: int64


In [35]:
print(train.groupby(['ShoppingMall'], dropna=False).size())

ShoppingMall
0.0        5587
1.0         153
2.0          80
3.0          59
4.0          45
           ... 
10424.0       1
10705.0       1
12253.0       1
23492.0       1
NaN         208
Length: 1116, dtype: int64


In [36]:
print(train.groupby(['Spa'], dropna=False).size())

Spa
0.0        5324
1.0         146
2.0         105
3.0          53
4.0          46
           ... 
16139.0       1
16594.0       1
18572.0       1
22408.0       1
NaN         183
Length: 1328, dtype: int64


In [37]:
print(train.groupby(['VRDeck'], dropna=False).size())

VRDeck
0.0        5495
1.0         139
2.0          70
3.0          56
4.0          47
           ... 
17074.0       1
17306.0       1
20336.0       1
24133.0       1
NaN         188
Length: 1307, dtype: int64


In [38]:
print(train.groupby(['Spend'], dropna=False).size())

Spend
0.0        3247
336.0         1
394.0         2
395.0         1
397.0         1
           ... 
30478.0       1
31074.0       1
31076.0       1
35987.0       1
NaN         908
Length: 2117, dtype: int64


Name and LastName are interesting. They are both incredibly widely dispersed. Using the mean or mode really won't add much value. For simplicity and to not have them match any other name, I'll use 'Xxxxx Xxxxx' for Name imputation and 'Xxxxx' for LastName imputation. 

In [39]:
print(train.groupby(['Name'], dropna=False).size())

Name
Aard Curle             1
Aarjel Jaff            1
Aarjel Rhuba           1
Aark Ches              1
Aark Homin             1
                    ... 
Zosmas Ineedeve        1
Zosmas Mormonized      1
Zubeneb Flesping       1
Zubeneb Pasharne       1
NaN                  200
Length: 8474, dtype: int64


In [40]:
print(train.groupby(['LastName'], dropna=False).size())

LastName
Acobson         4
Acobsond        3
Adavisons       9
Adkinson        3
Admingried      4
             ... 
Yanton          7
Yatters         4
Yorkland       11
Youngrayes      3
NaN           200
Length: 2218, dtype: int64


In [41]:
train['Name'] = train['Name'].fillna('Xxxxx Xxxxx')
train['LastName'] = train['LastName'].fillna('Xxxxx')

In [42]:
print(train.groupby(['Name'], dropna=False).size())

Name
Aard Curle           1
Aarjel Jaff          1
Aarjel Rhuba         1
Aark Ches            1
Aark Homin           1
                    ..
Zosmark Unaasor      1
Zosmas Ineedeve      1
Zosmas Mormonized    1
Zubeneb Flesping     1
Zubeneb Pasharne     1
Length: 8474, dtype: int64


In [43]:
print(train.groupby(['LastName'], dropna=False).size())

LastName
Acobson         4
Acobsond        3
Adavisons       9
Adkinson        3
Admingried      4
             ... 
Xxxxx         200
Yanton          7
Yatters         4
Yorkland       11
Youngrayes      3
Length: 2218, dtype: int64


I need to do the same thing with the test dataframe. 

In [44]:
test['Name'] = test['Name'].fillna('Xxxxx Xxxxx')
test['LastName'] = test['LastName'].fillna('Xxxxx')

Deck, Number and Side are obviously related to Cabin. All have 199 NaN values. Ideally whatever is imputed for Deck, Number and Side can be rolled up into Cabin. But that might be pretty hard considering how dispersed the data is. For Cabin and Number the data is very dispersed. The Side, as can be expected is pretty much 50/50 between P and S. 60% of the Deck values are pretty much split evenly between F and G. 

So for imputation, I will impute 50% of Side each to P and to S. I will impute 50% each of Deck to F and to G. Since Number is still a categorical value and widely dispersed, I'll create a fill value of 'XXX' for the missing data. Then to make this as formal as possible, I'll fill the Cabin missing values of the combination of the Deck, Number and Side imputed values. 

In [45]:
print(train.groupby(['Cabin'], dropna=False).size())

Cabin
A/0/P       2
A/0/S       2
A/1/S       3
A/10/P      1
A/10/S      1
         ... 
T/1/P       1
T/2/P       1
T/2/S       1
T/3/P       1
NaN       199
Length: 6561, dtype: int64


In [46]:
print(train.groupby(['Deck'], dropna=False).size())

Deck
A       256
B       779
C       747
D       478
E       876
F      2794
G      2559
T         5
NaN     199
dtype: int64


In [47]:
print(train.groupby(['Number'], dropna=False).size())

Number
0        18
1        15
10       12
100      12
1000      6
       ... 
996       1
997       2
998       4
999       6
NaN     199
Length: 1818, dtype: int64


In [48]:
print(train.groupby(['Side'], dropna=False).size())

Side
P      4206
S      4288
NaN     199
dtype: int64


I'll use a random generator to separate the Deck missing values 50/50 into F and G. 

In [49]:
nans = train['Deck'].isna()
length = sum(nans)
replacement = random.choices(['F', 'G'], weights=[.5, .5], k=length)
train.loc[nans,'Deck'] = replacement

In [50]:
print(train.groupby(['Deck'], dropna=False).size())

Deck
A     256
B     779
C     747
D     478
E     876
F    2885
G    2667
T       5
dtype: int64


I'll do the same thing with Side feature. 

In [51]:
nans = train['Side'].isna()
length = sum(nans)
replacement = random.choices(['P', 'S'], weights=[.5, .5], k=length)
train.loc[nans,'Side'] = replacement

In [52]:
print(train.groupby(['Side'], dropna=False).size())

Side
P    4305
S    4388
dtype: int64


I'll add the 'XXX' value to the missing Number values. 

In [53]:
train['Number'] = train['Number'].fillna('XXX')

In [54]:
print(train.groupby(['Number'], dropna=False).size())

Number
0        18
1        15
10       12
100      12
1000      6
       ... 
996       1
997       2
998       4
999       6
XXX     199
Length: 1818, dtype: int64


Now I need to fill in the missing Cabin values by combining the newly filled in Deck, Number and Side values. 

In [55]:
train['Cabin'] = train['Cabin'].fillna(train['Deck'] + '/' + train['Number'] + '/' + train['Side'])

In [56]:
print(train.groupby(['Cabin'], dropna=False).size())

Cabin
A/0/P     2
A/0/S     2
A/1/S     3
A/10/P    1
A/10/S    1
         ..
T/0/P     1
T/1/P     1
T/2/P     1
T/2/S     1
T/3/P     1
Length: 6564, dtype: int64


I'll do the same things with the test dataframe. 

In [57]:
nans = test['Deck'].isna()
length = sum(nans)
replacement = random.choices(['F', 'G'], weights=[.5, .5], k=length)
test.loc[nans,'Deck'] = replacement

nans = test['Side'].isna()
length = sum(nans)
replacement = random.choices(['P', 'S'], weights=[.5, .5], k=length)
test.loc[nans,'Side'] = replacement

test['Number'] = test['Number'].fillna('XXX')

In [58]:
test['Cabin'] = test['Cabin'].fillna(test['Deck'] + '/' + test['Number'] + '/' + test['Side'])

Now lets see what the current missing values look like in the train and test dataframes. About half the HomePlanet is left, and the Cabin, Age, Name, LastName, Deck, Number and Side missing values have all been filled in. 

In [59]:
train.isna().sum()

PassengerId       0
HomePlanet      109
CryoSleep       217
Cabin             0
Destination     182
Age               0
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name              0
Transported       0
Group             0
GroupSize         0
Deck              0
Number            0
Side              0
LastName          0
Spend           908
dtype: int64

In [60]:
test.isna().sum()

PassengerId       0
HomePlanet       51
CryoSleep        93
Cabin             0
Destination      92
Age               0
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name              0
Group             0
GroupSize         0
Deck              0
Number            0
Side              0
LastName          0
Spend           455
dtype: int64

The final step is to use the mode to fill in the remaining missing values in both the train and test dataframes. 

In [61]:
modes = train.mode().iloc[0]
modes

PassengerId         0001_01
HomePlanet            Earth
CryoSleep             False
Cabin               G/XXX/S
Destination     TRAPPIST-1e
Age                    24.0
VIP                   False
RoomService             0.0
FoodCourt               0.0
ShoppingMall            0.0
Spa                     0.0
VRDeck                  0.0
Name            Xxxxx Xxxxx
Transported            True
Group                  0984
GroupSize               1.0
Deck                      F
Number                  XXX
Side                      S
LastName              Xxxxx
Spend                   0.0
Name: 0, dtype: object

In [62]:
train.fillna(modes, inplace=True)

In [63]:
train.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
Group           0
GroupSize       0
Deck            0
Number          0
Side            0
LastName        0
Spend           0
dtype: int64

In [64]:
modes_test = test.mode().iloc[0]
modes_test

PassengerId         0013_01
HomePlanet            Earth
CryoSleep             False
Cabin               G/XXX/P
Destination     TRAPPIST-1e
Age                    18.0
VIP                   False
RoomService             0.0
FoodCourt               0.0
ShoppingMall            0.0
Spa                     0.0
VRDeck                  0.0
Name            Xxxxx Xxxxx
Group                  0339
GroupSize               1.0
Deck                      F
Number                  XXX
Side                      S
LastName              Xxxxx
Spend                   0.0
Name: 0, dtype: object

In [65]:
test.fillna(modes_test, inplace=True)

In [66]:
test.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Group           0
GroupSize       0
Deck            0
Number          0
Side            0
LastName        0
Spend           0
dtype: int64

# Converting Categorical Data

I want to do the easiest separation of continuous and categorical variables possible, so I'm using cont_cat_split. 

In [67]:
cont,cat = cont_cat_split(train)

We can see that now the continuous and categorical columns are identified. 

In [68]:
cont

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Spend']

In [69]:
cat

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Name',
 'Transported',
 'Group',
 'GroupSize',
 'Deck',
 'Number',
 'Side',
 'LastName']

In [70]:
train[cat]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name,Transported,Group,GroupSize,Deck,Number,Side,LastName
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,False,Maham Ofracculy,False,0001,1,B,0,P,Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,False,Juanna Vines,True,0002,1,F,0,S,Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,True,Altark Susent,False,0003,2,A,0,S,Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,False,Solam Susent,False,0003,2,A,0,S,Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,False,Willy Santantines,True,0004,1,F,1,S,Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,True,Gravior Noxnuther,False,9276,1,A,98,P,Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,False,Kurta Mondalley,False,9278,1,G,1499,S,Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,False,Fayey Connon,True,9279,1,G,1500,S,Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,False,Celeon Hontichre,False,9280,2,E,608,S,Hontichre


Then I move through each of the categorical items and transform it into numbers using pd.factorize. 

In [71]:
for column in cat:
     train[column] = pd.factorize(train[column])[0]

In [72]:
train[cat].head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name,Transported,Group,GroupSize,Deck,Number,Side,LastName
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,0,0,1,1,1,0,1,0,1,1
2,2,0,0,2,0,1,2,0,2,1,2,0,1,2
3,3,0,0,2,0,0,3,0,2,1,2,0,1,2
4,4,1,0,3,0,0,4,1,3,0,1,1,1,3


And now I need to also convert the categories in the test dataset to numbers.

Next I break out the categorical data as test_cat and the continuous data as test_cont. Then I convert each column in test_cat into continuous data, since the ML model can only work with numbers. 

In [73]:
test_cont,test_cat = cont_cat_split(test)
for i in test_cat:
    test[i] = pd.factorize(test[i])[0]

In [74]:
test[test_cat].head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name,Group,GroupSize,Deck,Number,Side,LastName
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,1,1,0,1,1,0,1
2,2,1,0,2,1,0,2,2,0,2,2,0,2
3,3,1,1,3,0,0,3,3,0,2,3,0,3
4,4,0,1,4,0,0,4,4,0,1,4,0,4


In [75]:
X = train.drop('Transported',axis=1)
y = train.Transported

In [76]:
rf_model = RandomForestClassifier(criterion='entropy', 
                                  n_estimators=446,
                                  min_samples_split=2,
                                  min_samples_leaf=7,
                                  oob_score=True,
                                  max_depth=57,
                                  random_state=1,
                                  max_features=None,
                                  n_jobs=-1)

In [77]:
X,y = shuffle(X,y, random_state=42)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [78]:
def get_score(model,X,y):
    n = cross_val_score(model,X,y,scoring ='accuracy',cv=20)
    return n

In [79]:
print(get_score(rf_model,X,y).mean())

0.7962733725303247


In [81]:
pred_rf_best = (rf_model.fit(X,y)).predict(test)
sample['Transported'] = pred_rf_best

#This converts the numbers to True/False values
#sample['Transported']=sample['Transported']>0.5
sample.to_csv('submit_rf_detailedimputation.csv', index=False)

Updating this with factorization vs categorical and using RFClassification vs RFRegression reduced the score the model score to 0.7963. Then using the later version of the submission created a Kaggle score of 0.40963. At this point, I decided not to spend anymore time on this. 