# Spaceship Titanic raw data preprocessing

In [1]:
import os
import dotenv
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1 - Load environment variables

In [2]:
# The project root is taken to be the parent of the current working directory,
# so the notebook must be launched from a direct sub-folder of the project.
project_dir = str(Path().resolve().parents[0])
dotenv_path = os.path.join(project_dir, '.env')
# Load the variables declared in the project's .env file into os.environ.
env_var = dotenv.load_dotenv(dotenv_path)
raw_data_path = os.environ.get("RAW_DATA_PATH")
# Fail here with an explicit message instead of letting a later os.path.join
# raise an opaque TypeError on a None path component.
if raw_data_path is None:
    raise RuntimeError(
        f"RAW_DATA_PATH is not set; define it in the environment or in {dotenv_path}"
    )

## 2 - Read raw data
After loading, `infer_objects` attempts to infer a more specific dtype for each object-typed column.

In [3]:
# Build the location of the training CSV from the project root and the
# configured raw-data folder, then load it.
train_csv_path = os.path.join(project_dir, raw_data_path, "train.csv")
raw_data = pd.read_csv(train_csv_path).infer_objects()
# Preview the first rows.
raw_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### 2.1 Checking column dtypes

In [4]:
# Report how many columns were loaded, then display the dtype of each one.
column_count = len(raw_data.columns)
print(f"Number of columns: {column_count}")
raw_data.dtypes

Number of columns: 14


PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [5]:
# Classify the columns by dtype: float64 columns are treated as numerical,
# object columns as categorical, and the boolean column as the target.
numerical_cols = raw_data.select_dtypes(include="float64").columns.to_list()
categorical_cols = raw_data.select_dtypes(include="object").columns.to_list()
bool_cols = raw_data.select_dtypes(include="bool").columns.to_list()
# The target is identified purely by dtype, so require exactly one boolean
# column; otherwise a boolean feature could silently be taken as the target.
if len(bool_cols) != 1:
    raise ValueError(
        f"Expected exactly one boolean (target) column, found: {bool_cols}"
    )
target_col = bool_cols[0]
print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")
print(f"Target column: {target_col}")

Numerical columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
Categorival columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
Target column: Transported


## 3 - Removing ID type columns

In [8]:
print(raw_data[categorical_cols].nunique())
# A column whose number of distinct values equals the number of rows is
# treated as an identifier.
id_types_cols = raw_data.columns[raw_data.nunique() == raw_data.shape[0]].to_list()
# Identifier columns are moved into the index (not dropped), so they no longer
# appear among the feature columns. set_index raises on an empty key list, so
# when no identifier column is found the data is carried over unchanged.
if id_types_cols:
    pre_processed_data = raw_data.set_index(id_types_cols)
else:
    pre_processed_data = raw_data.copy()
print(f"Removed ID type columns: {id_types_cols}")
# The Name column seems to contain repeated names.
# Are there different entries for the same person?
# Are these entries equal or different?
# We must check for equal rows and also for equal rows with different classes.

PassengerId    8693
HomePlanet        3
CryoSleep         2
Cabin          6560
Destination       3
VIP               2
Name           8473
dtype: int64
Removed ID type columns: ['PassengerId']


## 4 - Missing data imputation

In [None]:

# Boolean mask of missing entries, computed once and reused for all three counts.
nan_mask = pre_processed_data.isna()
total_nan = nan_mask.sum().sum()
cols_with_nan = nan_mask.any().sum()
rows_with_nan = nan_mask.any(axis=1).sum()
n_rows, n_cols = pre_processed_data.shape
print(f"Number of NaN values: {total_nan}/{pre_processed_data.size}")
print(f"Number of columns with NaN: {cols_with_nan}/{n_cols}")
print(f"Number of rows with NaN: {rows_with_nan}/{n_rows}")
# TODO:
# Missing data needs to be imputed for all columns.
# We first need to check whether this is possible.


Number of NaN values: 2324/113009
Number of columns with NaN: 12/13
Number of rows with NaN: 2087/8693
