In [2]:
import pandas as pd
import numpy as np

train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

    PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    Destination - The planet the passenger will be debarking to.
    Age - The age of the passenger.
    VIP - Whether the passenger has paid for special VIP service during the voyage.
    RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    Name - The first and last names of the passenger.
    Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [3]:
train_df_raw = pd.read_csv('train.csv')

In [4]:
train_df_raw.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
train_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
train_df_raw.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [7]:
train_df_raw.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [10]:
train_df_raw.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [13]:
for x in ['HomePlanet','Cabin','Destination','CryoSleep','VIP']:
    print(train_df_raw[x].value_counts())

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
False    5439
True     3037
Name: CryoSleep, dtype: int64
False    8291
True      199
Name: VIP, dtype: int64


## Change data

- Change Binary vars to 0 and 1
- split out cabin

In [8]:
train_df = train_df_raw.copy()

In [9]:
train_df['CryoSleep'] = [1 if x == 'True' 
            else np.nan if pd.isnull(x)
            else 0 for x in train_df['CryoSleep']]
train_df['VIP'] = [1 if x == 'True' 
            else np.nan if pd.isnull(x)
            else 0 for x in train_df['VIP']]

In [20]:
train_df[['Cabin_Deck','Cabin_Num','Cabin_Side']] = train_df['Cabin'].str.split('/',expand=True)

In [21]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_Deck,Cabin_Num,Cabin_Side
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,0.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [23]:
for x in ['HomePlanet','Cabin_Deck','Cabin_Num','Cabin_Side','Destination','CryoSleep','VIP']:
    print(train_df[x].value_counts())

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: Cabin_Deck, dtype: int64
82      28
86      22
19      22
56      21
176     21
        ..
1644     1
1515     1
1639     1
1277     1
1894     1
Name: Cabin_Num, Length: 1817, dtype: int64
S    4288
P    4206
Name: Cabin_Side, dtype: int64
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
0.0    8476
Name: CryoSleep, dtype: int64
0.0    8490
Name: VIP, dtype: int64


In [25]:
from sklearn.pipeline import make_pipeline

In [29]:
train_df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported', 'Cabin_Deck', 'Cabin_Num', 'Cabin_Side'],
      dtype='object')

In [26]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [39]:
X_raw = train_df[['HomePlanet', 'CryoSleep', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Deck', 'Cabin_Side']]

In [49]:
X = pd.get_dummies(X_raw,dummy_na=True)

In [38]:
X

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,...,Cabin_Deck_C,Cabin_Deck_D,Cabin_Deck_E,Cabin_Deck_F,Cabin_Deck_G,Cabin_Deck_T,Cabin_Deck_nan,Cabin_Side_P,Cabin_Side_S,Cabin_Side_nan
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,True,1,...,0,0,0,1,0,0,0,0,1,0
2,0.0,58.0,0.0,43.0,3576.0,0.0,6715.0,49.0,False,0,...,0,0,0,0,0,0,0,0,1,0
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,False,0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,True,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,41.0,0.0,0.0,6819.0,0.0,1643.0,74.0,False,0,...,0,0,0,0,0,0,0,1,0,0
8689,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,False,1,...,0,0,0,0,1,0,0,0,1,0
8690,0.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,True,1,...,0,0,0,0,1,0,0,0,1,0
8691,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,False,0,...,0,0,1,0,0,0,0,0,1,0


In [46]:
y = train_df['Transported'].astype(int)

In [41]:
import catboost

In [42]:
cat_clf = catboost.CatBoostClassifier()

In [56]:
cat_clf.fit(X,y)

Learning rate set to 0.025939
0:	learn: 0.6819903	total: 2.31ms	remaining: 2.3s
1:	learn: 0.6693851	total: 4.67ms	remaining: 2.33s
2:	learn: 0.6578058	total: 6.7ms	remaining: 2.23s
3:	learn: 0.6474794	total: 8.67ms	remaining: 2.16s
4:	learn: 0.6375946	total: 10.9ms	remaining: 2.17s
5:	learn: 0.6283664	total: 13.3ms	remaining: 2.21s
6:	learn: 0.6192852	total: 15.6ms	remaining: 2.22s
7:	learn: 0.6114952	total: 17.8ms	remaining: 2.21s
8:	learn: 0.6035008	total: 19.9ms	remaining: 2.19s
9:	learn: 0.5961308	total: 22.1ms	remaining: 2.19s
10:	learn: 0.5902560	total: 24.1ms	remaining: 2.17s
11:	learn: 0.5844630	total: 26.1ms	remaining: 2.15s
12:	learn: 0.5783664	total: 28.2ms	remaining: 2.14s
13:	learn: 0.5721057	total: 30.3ms	remaining: 2.13s
14:	learn: 0.5658045	total: 32.1ms	remaining: 2.1s
15:	learn: 0.5610596	total: 34.4ms	remaining: 2.12s
16:	learn: 0.5553345	total: 36.5ms	remaining: 2.11s
17:	learn: 0.5500725	total: 38.6ms	remaining: 2.1s
18:	learn: 0.5465825	total: 40.6ms	remaining: 2.

<catboost.core.CatBoostClassifier at 0x20f62f6ef10>

In [44]:
from sklearn.model_selection import cross_validate

In [50]:
cross_validate(cat_clf,X,y,scoring=['roc_auc','accuracy','precision','recall'])

Learning rate set to 0.023581
0:	learn: 0.6827305	total: 2.06ms	remaining: 2.06s
1:	learn: 0.6710542	total: 4.01ms	remaining: 2s
2:	learn: 0.6603769	total: 5.89ms	remaining: 1.96s
3:	learn: 0.6503104	total: 7.64ms	remaining: 1.9s
4:	learn: 0.6408715	total: 9.69ms	remaining: 1.93s
5:	learn: 0.6322971	total: 11.7ms	remaining: 1.94s
6:	learn: 0.6236064	total: 13.8ms	remaining: 1.95s
7:	learn: 0.6160257	total: 16ms	remaining: 1.98s
8:	learn: 0.6086271	total: 17.8ms	remaining: 1.96s
9:	learn: 0.6016605	total: 19.8ms	remaining: 1.96s
10:	learn: 0.5963453	total: 21.6ms	remaining: 1.94s
11:	learn: 0.5893942	total: 23.5ms	remaining: 1.94s
12:	learn: 0.5837156	total: 25.5ms	remaining: 1.94s
13:	learn: 0.5780429	total: 27.4ms	remaining: 1.93s
14:	learn: 0.5731300	total: 29.3ms	remaining: 1.93s
15:	learn: 0.5680371	total: 32ms	remaining: 1.97s
16:	learn: 0.5623792	total: 35ms	remaining: 2.02s
17:	learn: 0.5570448	total: 37.6ms	remaining: 2.05s
18:	learn: 0.5536032	total: 40.5ms	remaining: 2.09s
19

{'fit_time': array([1.97000003, 2.04000211, 1.98752999, 1.9885118 , 1.98199797]),
 'score_time': array([0.00499964, 0.00499916, 0.00399899, 0.0040009 , 0.00500226]),
 'test_roc_auc': array([0.87527381, 0.88178781, 0.89082234, 0.8944976 , 0.89526436]),
 'test_accuracy': array([0.77975848, 0.79700978, 0.80506038, 0.82048331, 0.80379747]),
 'test_precision': array([0.75757576, 0.77613516, 0.82002384, 0.83392645, 0.78525641]),
 'test_recall': array([0.82762557, 0.8390411 , 0.78538813, 0.80342857, 0.84      ])}

## Test

In [51]:
test_df = pd.read_csv('test.csv')

In [53]:
test_df['CryoSleep'] = [1 if x == 'True' 
            else np.nan if pd.isnull(x)
            else 0 for x in test_df['CryoSleep']]
test_df['VIP'] = [1 if x == 'True' 
            else np.nan if pd.isnull(x)
            else 0 for x in test_df['VIP']]

In [54]:
test_df[['Cabin_Deck','Cabin_Num','Cabin_Side']] = test_df['Cabin'].str.split('/',expand=True)

In [55]:
X_raw_test = train_df[['HomePlanet', 'CryoSleep', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_Deck', 'Cabin_Side']]
X_test = pd.get_dummies(X_raw,dummy_na=True)

In [57]:
cat_res = cat_clf.predict(X_raw_test)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="Europa": Cannot convert 'b'Europa'' to float