### Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Destination - The planet the passenger will be debarking to.

Age - The age of the passenger.

VIP - Whether the passenger has paid for special VIP service during the voyage.

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

Name - The first and last names of the passenger.

Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
df.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [4]:
df.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
df[df.duplicated()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported


In [6]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
df['pass_group'] = df.PassengerId.apply(lambda x: x.split('_')[0])
df['pp'] = df.PassengerId.apply(lambda x: x.split('_')[1])

In [8]:
df


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,pass_group,pp
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,01
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,01
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,01
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,02
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,01
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,01
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,01
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,01


In [9]:
df['L_name'] = df['Name'].apply(lambda x: x.split()[1] if isinstance(x, str) and len(x.split()) > 1 else '')

In [10]:
df.groupby('HomePlanet')['pass_group'].count()

HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: pass_group, dtype: int64

In [11]:
df[df.HomePlanet.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,pass_group,pp,L_name
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True,0064,02,Keen
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False,0119,01,Coning
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True,0210,01,Inicont
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,0242,01,Sté
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,0251,01,Amsive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False,9084,01,Mone
8613,9194_01,,False,E/603/S,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,,False,9194,01,
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True,9248,01,Perle
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False,9257,01,Apple


In [12]:
mode = df.groupby('L_name')['HomePlanet'].apply(lambda x: x.mode().values[0] if not x.mode().empty else x.values[0])
mode

L_name
               Earth
Acobson        Earth
Acobsond       Earth
Adavisons      Earth
Adkinson       Earth
               ...  
Wynneyerson    Earth
Yanton         Earth
Yatters        Earth
Yorkland       Earth
Youngrayes     Earth
Name: HomePlanet, Length: 2218, dtype: object

mode = df.groupby('pass_group')['HomePlanet'].apply(lambda x: x.mode().values[0] if not x.mode().empty else x.values[0])
mode

In [13]:
mv_idx = df.HomePlanet.isnull()

In [14]:
df.loc[mv_idx, 'HomePlanet'] = df.loc[mv_idx, 'L_name'].apply(lambda x:mode[x])

mode = df.groupby('L_name')['HomePlanet'].apply(lambda x: x.mode().values[0] if not x.mode().empty else x.values[0])
mode

df.loc[mv_idx, 'HomePlanet'] = df.loc[mv_idx, 'L_name'].apply(lambda x :mode[x])

In [15]:
df[df.HomePlanet.isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,pass_group,pp,L_name
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True,64,2,Keen
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False,242,1,Sté
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True,251,1,Amsive
807,0853_01,,True,A/9/S,55 Cancri e,38.0,False,0.0,0.0,0.0,0.0,0.0,Hamelik Ageurante,True,853,1,Ageurante
2631,2817_01,,False,F/584/P,TRAPPIST-1e,25.0,False,237.0,0.0,910.0,0.0,12.0,Sealfs Sutty,False,2817,1,Sutty
5252,5603_01,,False,E/365/S,TRAPPIST-1e,34.0,False,170.0,1256.0,0.0,3926.0,7121.0,Kocha Cluitty,False,5603,1,Cluitty
5634,5989_01,,False,F/1141/S,TRAPPIST-1e,20.0,False,0.0,0.0,,703.0,0.0,Darrie Holcompton,False,5989,1,Holcompton
6644,7006_01,,True,G/1142/S,PSO J318.5-22,16.0,False,0.0,0.0,0.0,0.0,0.0,Tammyl Fuenton,True,7006,1,Fuenton


In [16]:
# Calculate the mode of the 'HomePlanet' column
mode_value = df['HomePlanet'].mode().iloc[0]

# Fill missing values with the mode
df['HomePlanet'] = df['HomePlanet'].fillna(mode_value)


In [17]:
df.HomePlanet.value_counts()

HomePlanet
Earth     4719
Europa    2171
Mars      1803
Name: count, dtype: int64

In [18]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,pass_group,pp,L_name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0001,01,Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,0002,01,Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,0003,01,Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,0003,02,Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,0004,01,Santantines
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,01,Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,01,Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,01,Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,01,Hontichre


In [19]:
df.CryoSleep.fillna(df.CryoSleep.mode()[0], inplace = True)

In [20]:
df[['Deck', 'num', 'Side']] = df.Cabin.str.split('/', expand = True)

In [21]:
df.drop(['PassengerId', 'Cabin', 'Name'], axis = 1, inplace = True)

In [22]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,0001,01,Ofracculy,B,0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,0002,01,Vines,F,0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0003,01,Susent,A,0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0003,02,Susent,A,0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,0004,01,Santantines,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,9276,01,Noxnuther,A,98,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,9278,01,Mondalley,G,1499,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,9279,01,Connon,G,1500,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,9280,01,Hontichre,E,608,S


In [23]:
df[df.Deck.isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
15,Earth,False,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,False,0012,01,Pooles,,,
93,Mars,True,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,True,0101,01,Trad,,,
103,Europa,False,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,False,0110,01,Aloubtled,,,
222,Mars,False,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,False,0239,01,Resty,,,
227,Mars,True,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,True,0244,01,Sad,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,Europa,False,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,False,8772,02,Motled,,,
8475,Europa,False,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,False,9057,01,Statch,,,
8485,Europa,True,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,True,9069,03,Brakeng,,,
8509,Earth,True,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,False,9081,03,Clemondsey,,,


In [24]:
df[df.L_name == '']

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
27,Mars,False,TRAPPIST-1e,21.0,False,980.0,2.0,69.0,0.0,0.0,False,0022,01,,D,0,P
58,Mars,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,True,0064,01,,F,14,S
65,Earth,False,TRAPPIST-1e,42.0,False,887.0,0.0,9.0,6.0,0.0,True,0069,01,,F,16,S
77,Mars,False,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,True,0082,03,,F,16,P
101,Earth,False,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,False,0108,02,,G,19,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8629,Europa,True,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,True,9205,02,,B,300,P
8631,Earth,True,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,True,9208,01,,G,1485,S
8636,Europa,True,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,True,9218,01,,B,353,S
8652,Europa,False,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,True,9230,01,,C,342,S


In [25]:
mode = df.groupby('L_name')['Deck'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Deck.isnull()
df.loc[mv_idx, 'Deck'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])

In [26]:
mode = df.groupby('L_name')['num'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.num.isnull()
df.loc[mv_idx, 'num'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])


In [27]:
mode = df.groupby('L_name')['Side'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Side.isnull()
df.loc[mv_idx, 'Side'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])

In [28]:
mode = df.groupby('HomePlanet')['Deck'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Deck.isnull()
df.loc[mv_idx, 'Deck'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['num'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.num.isnull()
df.loc[mv_idx, 'num'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['Side'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Side.isnull()
df.loc[mv_idx, 'Side'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])



In [29]:
df[df.Destination.isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
47,Mars,True,,19.0,False,0.0,0.0,0.0,0.0,0.0,True,0045,02,Chmad,F,10,P
128,Earth,False,,34.0,False,0.0,22.0,0.0,564.0,207.0,False,0138,02,Gambs,E,5,P
139,Earth,False,,41.0,False,0.0,0.0,0.0,0.0,607.0,False,0152,01,Estron,F,32,P
347,Earth,False,,23.0,False,348.0,0.0,0.0,4.0,368.0,False,0382,01,Floydendley,G,64,P
430,Earth,True,,50.0,False,0.0,0.0,0.0,0.0,0.0,False,0462,01,Sosanturney,G,67,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8372,Earth,True,,20.0,False,0.0,0.0,0.0,0.0,0.0,True,8956,02,Bonnondry,G,1453,P
8551,Mars,True,,41.0,False,0.0,0.0,0.0,0.0,0.0,True,9130,01,Corte,F,1765,S
8616,Mars,True,,33.0,False,0.0,0.0,0.0,0.0,0.0,True,9195,02,Purle,F,1779,S
8621,Europa,False,,41.0,True,0.0,7964.0,0.0,3238.0,5839.0,False,9197,02,Platch,C,308,P


In [30]:
mode = df.groupby('HomePlanet')['Destination'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Destination.isnull()
df.loc[mv_idx, 'Destination'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])

In [31]:
df[df.Age.isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
50,Earth,False,TRAPPIST-1e,,False,4.0,0.0,2.0,4683.0,0.0,False,0052,01,Hubbarton,G,6,S
64,Mars,False,TRAPPIST-1e,,False,793.0,0.0,2.0,253.0,0.0,False,0068,01,Binie,E,4,S
137,Earth,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,True,0149,01,Hubbarrison,G,27,S
181,Europa,False,55 Cancri e,,False,0.0,2433.0,,878.0,443.0,True,0202,02,Embleng,A,2,P
184,Europa,False,55 Cancri e,,False,2.0,1720.0,12.0,1125.0,122.0,True,0206,01,Brugashed,C,9,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,Earth,True,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,False,8835,01,Bartines,G,1425,S
8301,Europa,True,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,True,8862,03,Myling,C,329,S
8374,Earth,False,TRAPPIST-1e,,False,194.0,1.0,10.0,629.0,0.0,False,8956,04,Bonnondry,G,1453,P
8407,Earth,True,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,True,8988,01,Fowlesterez,G,1448,S


In [32]:
df.Deck.unique()

array(['B', 'F', 'A', 'G', 'E', 'D', 'C', 'T'], dtype=object)

In [33]:
df.loc[df.Age == 0, 'Age'] = df.loc[df.Age == 0, 'Age'].replace(0,np.nan)

In [34]:
mean = df.groupby(['Side', 'Deck'])['Age'].transform('mean')
df['Age'] = df['Age'].fillna(mean)
mean

0       33.299451
1       28.977528
2       35.214815
3       35.214815
4       28.977528
          ...    
8688    36.149123
8689    24.756801
8690    24.756801
8691    30.587302
8692    30.587302
Name: Age, Length: 8693, dtype: float64

In [35]:
df[df.VIP.isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
38,Earth,False,55 Cancri e,15.000000,,0.0,492.0,48.0,20.0,401.0,False,0036,01,Leodger,F,8,S
102,Earth,False,TRAPPIST-1e,24.756801,,0.0,0.0,0.0,0.0,0.0,True,0108,03,Handertiz,G,19,S
145,Mars,True,TRAPPIST-1e,35.000000,,0.0,0.0,0.0,0.0,0.0,True,0165,01,Anche,F,37,P
228,Mars,True,55 Cancri e,14.000000,,0.0,0.0,0.0,0.0,0.0,True,0244,02,Sad,F,47,S
566,Mars,False,TRAPPIST-1e,32.650794,,43.0,152.0,182.0,1.0,2005.0,False,0593,01,Kra,D,24,P
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8494,Earth,True,TRAPPIST-1e,24.756801,,0.0,0.0,,0.0,0.0,True,9074,01,Trerady,G,1460,S
8512,Earth,False,PSO J318.5-22,16.000000,,0.0,0.0,761.0,0.0,0.0,False,9081,06,Clemondsey,F,1858,P
8542,Earth,True,55 Cancri e,55.000000,,0.0,0.0,0.0,0.0,0.0,False,9122,01,Schmondez,G,1469,S
8630,Europa,True,TRAPPIST-1e,52.000000,,0.0,0.0,0.0,0.0,0.0,True,9205,03,Brakeng,B,300,P


In [36]:
mode = df.groupby('L_name')['VIP'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.VIP.isnull()
df.loc[mv_idx, 'VIP'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])

In [37]:
df[df.VIP.isnull()]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,pass_group,pp,L_name,Deck,num,Side
1134,Mars,False,TRAPPIST-1e,28.0,,42.0,1116.0,0.0,0.0,0.0,True,1198,1,Tie,F,124,P
1464,Europa,True,55 Cancri e,51.0,,0.0,0.0,0.0,0.0,0.0,True,1548,1,Unicting,B,50,P
1687,Europa,True,TRAPPIST-1e,26.0,,0.0,0.0,0.0,0.0,0.0,True,1796,1,Hariourcal,C,67,S
5244,Europa,False,55 Cancri e,36.0,,380.0,686.0,0.0,295.0,1247.0,False,5599,1,Caming,E,364,S
6067,Europa,False,TRAPPIST-1e,65.0,,0.0,1420.0,,85.0,827.0,False,6413,1,Undrude,A,77,S
6978,Europa,False,55 Cancri e,41.0,,0.0,10049.0,214.0,5287.0,1845.0,False,7416,1,Burcaling,B,278,S
7386,Europa,True,55 Cancri e,43.0,,0.0,0.0,0.0,0.0,0.0,True,7897,1,Ancontaked,C,295,S


In [38]:
mode = df.groupby(['Deck'])['VIP'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.VIP.isnull()
df.loc[mv_idx, 'VIP'] = df.loc[mv_idx, 'Deck'].apply(lambda x: mode[x])

In [39]:
mean_RMS = df.groupby(['VIP', 'Deck'])['RoomService'].transform('mean')
df['RoomService'] = df['RoomService'].fillna(mean_RMS)
mean_FC = df.groupby(['VIP', 'Deck'])['FoodCourt'].transform('mean')
df['FoodCourt'] = df['FoodCourt'].fillna(mean_FC)
mean_SM = df.groupby(['VIP', 'Deck'])['ShoppingMall'].transform('mean')
df['ShoppingMall'] = df['ShoppingMall'].fillna(mean_SM)
mean_SPA = df.groupby(['VIP', 'Deck'])['Spa'].transform('mean')
df['Spa'] = df['Spa'].fillna(mean_SPA)
mean_VRD = df.groupby(['VIP', 'Deck'])['VRDeck'].transform('mean')
df['VRDeck'] = df['VRDeck'].fillna(mean_VRD)

In [40]:
df['Total_bill'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis = 1)


In [41]:
df.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'L_name'], axis = 1, inplace = True)

In [42]:
df['pass_group'] = df['pass_group'].astype('int')
df['pp'] = df['pp'].astype('int')
df['num'] = df['num'].astype('int')

In [43]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,pass_group,pp,Deck,num,Side,Total_bill
0,Europa,False,TRAPPIST-1e,39.0,False,False,1,1,B,0,P,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,True,2,1,F,0,S,736.0
2,Europa,False,TRAPPIST-1e,58.0,True,False,3,1,A,0,S,10383.0
3,Europa,False,TRAPPIST-1e,33.0,False,False,3,2,A,0,S,5176.0
4,Earth,False,TRAPPIST-1e,16.0,False,True,4,1,F,1,S,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,False,9276,1,A,98,P,8536.0
8689,Earth,True,PSO J318.5-22,18.0,False,False,9278,1,G,1499,S,0.0
8690,Earth,False,TRAPPIST-1e,26.0,False,True,9279,1,G,1500,S,1873.0
8691,Europa,False,55 Cancri e,32.0,False,False,9280,1,E,608,S,4637.0


In [44]:
df.HomePlanet.replace({'Europa':0, 'Earth':1, 'Mars':2}, inplace = True)
df.Destination.replace({'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2}, inplace = True)
df.Deck.replace({'B':0, 'F':1, 'A':2, 'G':3, 'E':4, 'D':5, 'C':6, 'T':7}, inplace = True)
df.Side.replace({'P':0, 'S':1}, inplace = True)


In [45]:
df.CryoSleep = df.CryoSleep.astype(int)
df.VIP = df.VIP.astype(int)

In [46]:
df

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,pass_group,pp,Deck,num,Side,Total_bill
0,0,0,0,39.0,0,False,1,1,0,0,0,0.0
1,1,0,0,24.0,0,True,2,1,1,0,1,736.0
2,0,0,0,58.0,1,False,3,1,2,0,1,10383.0
3,0,0,0,33.0,0,False,3,2,2,0,1,5176.0
4,1,0,0,16.0,0,True,4,1,1,1,1,1091.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,0,1,41.0,1,False,9276,1,2,98,0,8536.0
8689,1,1,2,18.0,0,False,9278,1,3,1499,1,0.0
8690,1,0,0,26.0,0,True,9279,1,3,1500,1,1873.0
8691,0,0,1,32.0,0,False,9280,1,4,608,1,4637.0


In [47]:
df.drop(['pass_group', 'pp'], axis = 1 ,inplace = True)

In [48]:
df1 = pd.get_dummies(df).astype(int)
df1

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Deck,num,Side,Total_bill
0,0,0,0,39,0,0,0,0,0,0
1,1,0,0,24,0,1,1,0,1,736
2,0,0,0,58,1,0,2,0,1,10383
3,0,0,0,33,0,0,2,0,1,5176
4,1,0,0,16,0,1,1,1,1,1091
...,...,...,...,...,...,...,...,...,...,...
8688,0,0,1,41,1,0,2,98,0,8536
8689,1,1,2,18,0,0,3,1499,1,0
8690,1,0,0,26,0,1,3,1500,1,1873
8691,0,0,1,32,0,0,4,608,1,4637


In [49]:
x = df1.drop('Transported', axis = 1)
y = df1['Transported']

In [50]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y, random_state = 42)

In [51]:
cols = train_x.columns
cols

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'Deck', 'num',
       'Side', 'Total_bill'],
      dtype='object')

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
train_x_scaled = pd.DataFrame(train_x_scaled, columns=cols)
test_x_scaled = scaler.transform(test_x)
test_x_scaled = pd.DataFrame(test_x_scaled, columns=cols)
test_x_scaled


In [52]:
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.metrics import accuracy_score
logreg = LogReg()
logreg.fit(train_x, train_y)
train_predict = logreg.predict(train_x)
train_predict
k = accuracy_score(train_predict, train_y)
print('Training accuracy_score', k )
test_predict = logreg.predict(test_x)
k = accuracy_score(test_predict, test_y)
print('Test accuracy_score    ', k )

Training accuracy_score 0.7212762693664673
Test accuracy_score     0.7258509659613616


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score


In [54]:
print(classification_report(train_y,train_predict))
print(classification_report(test_y,test_predict))

              precision    recall  f1-score   support

           0       0.67      0.85      0.75      3233
           1       0.80      0.59      0.68      3286

    accuracy                           0.72      6519
   macro avg       0.74      0.72      0.72      6519
weighted avg       0.74      0.72      0.72      6519

              precision    recall  f1-score   support

           0       0.68      0.84      0.75      1082
           1       0.79      0.61      0.69      1092

    accuracy                           0.73      2174
   macro avg       0.74      0.73      0.72      2174
weighted avg       0.74      0.73      0.72      2174



In [55]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0).fit(train_x, train_y)
train_predict=tree.predict(train_x)
k = accuracy_score(train_predict, train_y)
print('Training accuracy_score', k )
test_predict = tree.predict(test_x)
k = accuracy_score(test_predict, test_y)
print('Test accuracy_score    ', k )

Training accuracy_score 0.9996932044792146
Test accuracy_score     0.6780128794848206


In [56]:
print(classification_report(train_y,train_predict))
print(classification_report(test_y,test_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3233
           1       1.00      1.00      1.00      3286

    accuracy                           1.00      6519
   macro avg       1.00      1.00      1.00      6519
weighted avg       1.00      1.00      1.00      6519

              precision    recall  f1-score   support

           0       0.68      0.67      0.67      1082
           1       0.68      0.69      0.68      1092

    accuracy                           0.68      2174
   macro avg       0.68      0.68      0.68      2174
weighted avg       0.68      0.68      0.68      2174



In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=6, random_state=42)
clf.fit(train_x, train_y)
train_predict=clf.predict(train_x)
k = accuracy_score(train_predict, train_y)
print('Training accuracy_score', k )
test_predict = clf.predict(test_x)
k = accuracy_score(test_predict, test_y)
print('Test accuracy_score    ', k )

Training accuracy_score 0.7571713452983586
Test accuracy_score     0.7410303587856486


In [58]:
print(classification_report(train_y,train_predict))
print(classification_report(test_y,test_predict))

              precision    recall  f1-score   support

           0       0.72      0.84      0.78      3233
           1       0.81      0.67      0.74      3286

    accuracy                           0.76      6519
   macro avg       0.77      0.76      0.76      6519
weighted avg       0.77      0.76      0.76      6519

              precision    recall  f1-score   support

           0       0.71      0.82      0.76      1082
           1       0.78      0.67      0.72      1092

    accuracy                           0.74      2174
   macro avg       0.75      0.74      0.74      2174
weighted avg       0.75      0.74      0.74      2174



importances = clf.feature_importances_
feature_names = train_x.columns

# Print the feature importances
print("Feature Importances:")
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

# Creating a dataframe for visualization
final_df = pd.DataFrame({'Features':feature_names,"Importances":importances})
final_df.set_index('Features',inplace=True)
sorted_importances = final_df.sort_values(by = 'Importances')
sorted_importances.plot(kind='barh')
plt.show()

sorted_importances[sorted_importances.values>= 0.06340066399872543]

new_X=x[sorted_importances[sorted_importances.values>=0.06340066399872543].index]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train,X_test,y_train,y_test=train_test_split(new_X,y,test_size=0.2,random_state=42)


clf = RandomForestClassifier(max_depth=5, random_state=42)
clf.fit(X_train, y_train)
train_predict=clf.predict(X_train)
k = accuracy_score(train_predict, y_train)
print('Training accuracy_score', k )
test_predict = clf.predict(X_test)
k = accuracy_score(test_predict, y_test)
print('Test accuracy_score    ', k )
importances = clf.feature_importances_
feature_names = train_x.columns

# Print the feature importances
print("Feature Importances:")
for feature, importance in zip(feature_names, importances):
    print(f"{feature}: {importance}")

# Creating a dataframe for visualization
final_df = pd.DataFrame({'Features':feature_names,"Importances":importances})
final_df.set_index('Features',inplace=True)
sorted_importances = final_df.sort_values(by = 'Importances')
sorted_importances.plot(kind='barh')
plt.show()

print(classification_report(y_train,train_predict))
print(classification_report(y_test,test_predict))

In [59]:
df = pd.read_csv('test.csv')
df['L_name'] = df['Name'].apply(lambda x: x.split()[1] if isinstance(x, str) and len(x.split()) > 1 else '')
mode = df.groupby('L_name')['HomePlanet'].apply(lambda x: x.mode().values[0] if not x.mode().empty else x.values[0])
mv_idx = df.HomePlanet.isnull()
df.loc[mv_idx, 'HomePlanet'] = df.loc[mv_idx, 'L_name'].apply(lambda x:mode[x])
# Calculate the mode of the 'HomePlanet' column
mode_value = df['HomePlanet'].mode().iloc[0]

# Fill missing values with the mode
df['HomePlanet'] = df['HomePlanet'].fillna(mode_value)
df.CryoSleep.fillna(df.CryoSleep.mode()[0], inplace = True)
df[['Deck', 'num', 'Side']] = df.Cabin.str.split('/', expand = True)
df.drop(['PassengerId', 'Cabin', 'Name'], axis = 1, inplace = True)
mode = df.groupby('L_name')['Deck'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Deck.isnull()
df.loc[mv_idx, 'Deck'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])
mode = df.groupby('L_name')['num'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.num.isnull()
df.loc[mv_idx, 'num'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])
mode = df.groupby('L_name')['Side'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Side.isnull()
df.loc[mv_idx, 'Side'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['Deck'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Deck.isnull()
df.loc[mv_idx, 'Deck'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['num'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.num.isnull()
df.loc[mv_idx, 'num'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['Side'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Side.isnull()
df.loc[mv_idx, 'Side'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
mode = df.groupby('HomePlanet')['Destination'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.Destination.isnull()
df.loc[mv_idx, 'Destination'] = df.loc[mv_idx, 'HomePlanet'].apply(lambda x: mode[x])
df.loc[df.Age == 0, 'Age'] = df.loc[df.Age == 0, 'Age'].replace(0,np.nan)
mean = df.groupby(['Side', 'Deck'])['Age'].transform('mean')
df['Age'] = df['Age'].fillna(mean)
mode = df.groupby('L_name')['VIP'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.VIP.isnull()
df.loc[mv_idx, 'VIP'] = df.loc[mv_idx, 'L_name'].apply(lambda x: mode[x])
mode = df.groupby(['Deck'])['VIP'].apply(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
mv_idx = df.VIP.isnull()
df.loc[mv_idx, 'VIP'] = df.loc[mv_idx, 'Deck'].apply(lambda x: mode[x])
mean_RMS = df.groupby(['VIP', 'Deck'])['RoomService'].transform('mean')
df['RoomService'] = df['RoomService'].fillna(mean_RMS)
mean_FC = df.groupby(['VIP', 'Deck'])['FoodCourt'].transform('mean')
df['FoodCourt'] = df['FoodCourt'].fillna(mean_FC)
mean_SM = df.groupby(['VIP', 'Deck'])['ShoppingMall'].transform('mean')
df['ShoppingMall'] = df['ShoppingMall'].fillna(mean_SM)
mean_SPA = df.groupby(['VIP', 'Deck'])['Spa'].transform('mean')
df['Spa'] = df['Spa'].fillna(mean_SPA)
mean_VRD = df.groupby(['VIP', 'Deck'])['VRDeck'].transform('mean')
df['VRDeck'] = df['VRDeck'].fillna(mean_VRD)
df['Total_bill'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis = 1)
df.drop(['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'L_name'], axis = 1, inplace = True)
df.HomePlanet.replace({'Europa':0, 'Earth':1, 'Mars':2}, inplace = True)
df.Destination.replace({'TRAPPIST-1e':0, '55 Cancri e':1, 'PSO J318.5-22':2}, inplace = True)
df.Deck.replace({'B':0, 'F':1, 'A':2, 'G':3, 'E':4, 'D':5, 'C':6, 'T':7}, inplace = True)
df.Side.replace({'P':0, 'S':1}, inplace = True)
df.CryoSleep = df.CryoSleep.astype(int)


df['num'] = df['num'].astype('int')
df.VIP = df.VIP.astype(int)
df1 = pd.get_dummies(df).astype(int)


In [61]:
df1

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Deck,num,Side,Total_bill
0,1,1,0,27,0,3,3,1,0
1,1,0,0,19,0,1,4,1,2832
2,0,1,1,31,0,6,0,1,0
3,0,0,0,38,0,6,1,1,7418
4,1,0,0,20,0,1,5,1,645
...,...,...,...,...,...,...,...,...,...
4272,1,1,0,34,0,3,1496,1,0
4273,1,0,0,42,0,3,141,1,1018
4274,2,1,1,33,0,5,296,0,0
4275,0,0,0,33,0,5,297,0,3203


In [62]:
df1.isnull().sum()

HomePlanet     0
CryoSleep      0
Destination    0
Age            0
VIP            0
Deck           0
num            0
Side           0
Total_bill     0
dtype: int64

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
clf = RandomForestClassifier(max_depth=6, random_state=42)
clf.fit(train_x, train_y)


test_predict = clf.predict(df1)
test_predict

array([1, 0, 1, ..., 1, 0, 1])

In [71]:
test_predict.astype('bool')

array([ True, False,  True, ...,  True, False,  True])

In [75]:
df['Transported'] = test_predict.astype('bool')

In [82]:
df3 = pd.read_csv('test.csv')
df3.drop(['HomePlanet', 'CryoSleep','Cabin', 'Destination', 'Age','VIP','RoomService','FoodCourt','ShoppingMall', 'Spa','VRDeck','Name'], axis = 1, inplace = True)

In [84]:
df3['Transported'] = test_predict.astype('bool')

In [85]:
df3

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [86]:
# Assuming 'df3' is the name of your DataFrame
df3.to_csv('output_file.csv', index=False)
