## Imports

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# for hypothesis testing
from scipy import stats 


# Notebook-wide display settings:
# never truncate the column list when a wide frame is rendered,
# and draw every matplotlib figure with the 'ggplot' style sheet.
pd.options.display.max_columns = None
plt.style.use('ggplot')

# 1. Data Quality Report

In [None]:
# Load Data
# Locations of the raw CSV files (relative paths).
train_path = '../data/raw/spaceship-titanic/train.csv'
test_path = '../data/raw/spaceship-titanic/test.csv'

# Read each file once; the *_raw frames are kept untouched as a reference.
df_train_raw = pd.read_csv(train_path)
df_test_raw = pd.read_csv(test_path)

# All further work happens on independent copies.
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

# Shapes are reported before the bookkeeping column below is added.
print(f'Train Shape: {df_train.shape}')
print(f'Test Shape: {df_test.shape}')

# Tag every row with the split it came from so the two parts can be
# told apart after they are stacked.
df_train = df_train.assign(dataset_label='train')
df_test = df_test.assign(dataset_label='test')

# Stack the splits vertically under a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], axis=0, ignore_index=True)


In [51]:
# Preview the first five rows of the combined (train + test) frame
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,dataset_label
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,train
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,train
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,train
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,train
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,train


In [52]:
# Quick overview of column dtypes and non-null counts;
# a non-null count below the total number of entries means the column has missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    12970 non-null  object 
 1   HomePlanet     12682 non-null  object 
 2   CryoSleep      12660 non-null  object 
 3   Cabin          12671 non-null  object 
 4   Destination    12696 non-null  object 
 5   Age            12700 non-null  float64
 6   VIP            12674 non-null  object 
 7   RoomService    12707 non-null  float64
 8   FoodCourt      12681 non-null  float64
 9   ShoppingMall   12664 non-null  float64
 10  Spa            12686 non-null  float64
 11  VRDeck         12702 non-null  float64
 12  Name           12676 non-null  object 
 13  Transported    8693 non-null   object 
 14  dataset_label  12970 non-null  object 
dtypes: float64(6), object(9)
memory usage: 1.5+ MB


In [53]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns;
# by default describe() leaves out the object-dtype columns of a mixed frame
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [67]:
# Missing Values: number of null entries in each column
# NOTE(review): the large count for Transported presumably comes from the
# test rows, which carry no label — confirm against df_test.
df.isna().sum()

PassengerId         0
HomePlanet        288
CryoSleep         310
Cabin             299
Destination       274
Age               270
VIP               296
RoomService       263
FoodCourt         289
ShoppingMall      306
Spa               284
VRDeck            268
Name              294
Transported      4277
dataset_label       0
Deck              299
Num               299
Side              299
dtype: int64

# 2. Question-Driven EDA

### Hypothesis Testing 1: Did the anomaly hit one side of the ship harder?

Null Hypothesis ($H_0$): In the training dataset, the mean of the Transported variable (i.e., the proportion of passengers who were transported) is the same for passengers on the P side and passengers on the S side.

Alternative Hypothesis ($H_1$): In the training dataset, the mean of the Transported variable (i.e., the proportion of passengers who were transported) differs between passengers on the P side and passengers on the S side.

The `Cabin` column is formatted as `Deck/Num/Side` (e.g., `B/0/P`). 
To analyze it properly, we need to split it into three separate columns.

In [65]:
# Break 'Cabin' on '/' into its parts and store them as three new columns.
# Rows with a missing Cabin get missing values in all three columns.
cabin_columns = ['Deck', 'Num', 'Side']
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[cabin_columns] = cabin_parts

# Show the first rows of the new columns as a sanity check.
df[cabin_columns].head()

Unnamed: 0,Deck,Num,Side
0,B,0,P
1,F,0,S
2,A,0,S
3,A,0,S
4,F,1,S


In [None]:
# Only the training rows are used for the test.
is_train = df['dataset_label'] == 'train'

# Training passengers on each side of the ship; rows with a missing Side
# match neither comparison and are therefore left out of both groups.
p_side = df[is_train & (df['Side'] == 'P')]
s_side = df[is_train & (df['Side'] == 'S')]

# Encode Transported as 0/1 so that each sample mean is the share of
# transported passengers in that group.
p_transported = p_side['Transported'].astype(int)
s_transported = s_side['Transported'].astype(int)

# Independent two-sample t-test with SciPy's default settings.
# Sample order is (P, S), so a negative statistic means the P-side mean is lower.
stats.ttest_ind(p_transported, s_transported)

TtestResult(statistic=np.float64(-9.615004953251075), pvalue=np.float64(8.919521930125214e-22), df=np.float64(8492.0))

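Because the p-value (8.9e-22) is far below the 5% significance level (0.05), we reject the null hypothesis in favor of the alternative hypothesis: the mean of Transported differs between the P side and the S side. The negative t-statistic (about -9.6, with the P side passed as the first sample) means the P-side mean is lower than the S-side mean, i.e., passengers on the S side were transported at a higher rate.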