# Spaceship Titanic

> **Goal**: Predict which passengers are transported to an alternate dimension

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading the Dataset

In [2]:
raw_data = pd.read_csv("./data/train.csv")
raw_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Getting Some information about the dataset

In [3]:
raw_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [4]:
raw_data.shape

(8693, 14)

In [14]:
# how many home planets are available in the dataset
raw_data["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [15]:
raw_data["HomePlanet"].count()

8492

In [19]:
raw_data["HomePlanet"].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [21]:
raw_data["HomePlanet"][raw_data["HomePlanet"] == np.nan].count()

0

In [24]:
raw_data["HomePlanet"].describe()

count      8492
unique        3
top       Earth
freq       4602
Name: HomePlanet, dtype: object

In [27]:
raw_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [30]:
for column_name in raw_data.columns:
    print(column_name, raw_data[column_name].count())

PassengerId 8693
HomePlanet 8492
CryoSleep 8476
Cabin 8494
Destination 8511
Age 8514
VIP 8490
RoomService 8512
FoodCourt 8510
ShoppingMall 8485
Spa 8510
VRDeck 8505
Name 8493
Transported 8693


In [31]:
raw_data["Transported"].describe()

count     8693
unique       2
top       True
freq      4378
Name: Transported, dtype: object

## Data Processing

**Assuming that the NA's are not available and just dropping all**

In [34]:
data_without_na = raw_data.dropna(axis=0, subset=raw_data.columns.drop(["PassengerId", "Transported"]))

In [35]:
data_without_na.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,6606.0,6606.0,6606.0,6606.0,6606.0,6606.0
mean,28.894036,222.991674,478.958523,178.356494,313.16152,303.780048
std,14.533429,644.987936,1678.592291,576.328407,1144.016291,1127.142166
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,82.75,30.0,65.0,52.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0


In [None]:
# saving the cleaned data
data_without_na.to_csv("")