Part 1: Guided Section (40 minutes)

1.1 Environment Setup

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Scikit-learn: preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Scikit-learn: modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Settings
import warnings
warnings.filterwarnings('ignore')

# For reproducibility
RANDOM_STATE = 42

print("Libraries imported successfully!")

Libraries imported successfully!


1.2 Load the Data

In [2]:
# Data directory
DATA_DIR = './kaggle/input/spaceship-titanic'  # Use this on Kaggle
# DATA_DIR = '../data/spaceship-titanic'  # Use this locally

# Load the data
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

print(f"Training set shape: {train.shape}")
print(f"Test set shape: {test.shape}")

Training set shape: (8693, 14)
Test set shape: (4277, 13)


In [3]:
# First few rows
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Data types
print("Data Types:")
print(train.dtypes)

Data Types:
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


In [5]:
# Missing values
print("Missing Values:")
print(train.isnull().sum())
print(f"\nTotal missing: {train.isnull().sum().sum()}")

Missing Values:
PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Total missing: 2324


In [6]:
# Target distribution
print("Target Distribution:")
print(train['Transported'].value_counts())
print(f"\nPercentage transported: {train['Transported'].mean()*100:.2f}%")

Target Distribution:
Transported
True     4378
False    4315
Name: count, dtype: int64

Percentage transported: 50.36%


In [7]:
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


1.4 Identify Column Types and Prepare Data

In [8]:
# Identify column types
# We'll drop PassengerId and Name as they're identifiers, not features

id_cols = ['PassengerId', 'Name']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

print(f"ID columns: {id_cols}")
print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

ID columns: ['PassengerId', 'Name']
Categorical columns: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
Numerical columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [9]:
# Separate target from features
X = train.drop('Transported', axis=1)
y = train['Transported']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (8693, 13)
Target shape: (8693,)


1.5 Train/Validation Split

In [10]:
# Stratified train/validation split - BEFORE any preprocessing
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y  # This ensures balanced split
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")

Training set: 6954 samples
Validation set: 1739 samples


In [11]:
# Verify stratification worked
print("Target distribution verification:")
print(f"Original:   {y.mean()*100:.2f}% transported")
print(f"Training:   {y_train.mean()*100:.2f}% transported")
print(f"Validation: {y_val.mean()*100:.2f}% transported")

Target distribution verification:
Original:   50.36% transported
Training:   50.36% transported
Validation: 50.37% transported


1.6 Preprocessing with sklearn

In [12]:
# Create imputers and encoder
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

print("Preprocessing objects created:")
print(f"  - Numerical imputer: {num_imputer}")
print(f"  - Categorical imputer: {cat_imputer}")
print(f"  - Encoder: {encoder}")

Preprocessing objects created:
  - Numerical imputer: SimpleImputer(strategy='median')
  - Categorical imputer: SimpleImputer(strategy='most_frequent')
  - Encoder: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)


In [13]:
# FIT on training data ONLY
# This is where we learn the median values and category mappings

# Fit numerical imputer
num_imputer.fit(X_train[numerical_cols])
print(f"Learned medians: {dict(zip(numerical_cols, num_imputer.statistics_))}")

# Fit categorical imputer and encoder
# First impute missing categoricals, then fit the encoder
cat_imputer.fit(X_train[categorical_cols].astype(str))
print(f"\nLearned modes: {dict(zip(categorical_cols, cat_imputer.statistics_))}")

cat_imputed_train = cat_imputer.transform(X_train[categorical_cols].astype(str))
encoder.fit(cat_imputed_train)
print(f"\nEncoder categories learned: {len(encoder.categories_)} columns")

Learned medians: {'Age': np.float64(27.0), 'RoomService': np.float64(0.0), 'FoodCourt': np.float64(0.0), 'ShoppingMall': np.float64(0.0), 'Spa': np.float64(0.0), 'VRDeck': np.float64(0.0)}

Learned modes: {'HomePlanet': 'Earth', 'CryoSleep': 'False', 'Cabin': 'nan', 'Destination': 'TRAPPIST-1e', 'VIP': 'False'}

Encoder categories learned: 5 columns


In [14]:
# TRANSFORM training data
X_train_processed = X_train.copy()

# Transform numerical columns
X_train_processed[numerical_cols] = num_imputer.transform(X_train[numerical_cols])

# Transform categorical columns (impute then encode)
cat_imputed_train = cat_imputer.transform(X_train[categorical_cols].astype(str))
X_train_processed[categorical_cols] = encoder.transform(cat_imputed_train)

# Select only feature columns
X_train_processed = X_train_processed[categorical_cols + numerical_cols]

print(f"Training data processed shape: {X_train_processed.shape}")
print(f"Missing values in training: {X_train_processed.isnull().sum().sum()}")

Training data processed shape: (6954, 11)
Missing values in training: 0


In [15]:
# TRANSFORM validation data using the SAME fitted transformers
X_val_processed = X_val.copy()

# Transform numerical columns
X_val_processed[numerical_cols] = num_imputer.transform(X_val[numerical_cols])

# Transform categorical columns (impute then encode)
cat_imputed_val = cat_imputer.transform(X_val[categorical_cols].astype(str))
X_val_processed[categorical_cols] = encoder.transform(cat_imputed_val)

# Select only feature columns
X_val_processed = X_val_processed[categorical_cols + numerical_cols]

print(f"Validation data processed shape: {X_val_processed.shape}")
print(f"Missing values in validation: {X_val_processed.isnull().sum().sum()}")

Validation data processed shape: (1739, 11)
Missing values in validation: 0


In [16]:
# Verify preprocessing
print("Processed training data:")
X_train_processed.head()

Processed training data:


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,VIP,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
3600,0.0,0.0,4970.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1262,0.0,1.0,4430.0,2.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0
8612,0.0,2.0,4345.0,1.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0
5075,1.0,1.0,980.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0
4758,0.0,0.0,5203.0,2.0,0.0,13.0,0.0,0.0,60.0,1.0,5147.0


1.7 Train Random Forest

In [17]:
# Initialize the model
model = RandomForestClassifier(
    n_estimators=100,      # Number of trees
    random_state=RANDOM_STATE
)

# Train the model
model.fit(X_train_processed, y_train)

print("Model trained successfully!")

Model trained successfully!


In [18]:
# Make predictions on validation set
y_val_pred = model.predict(X_val_processed)

# Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy*100:.2f}%")

Validation Accuracy: 0.7895
Validation Accuracy: 78.95%


1.8 Create Kaggle Submission

In [19]:
# Create NEW preprocessing objects for final submission
num_imputer_final = SimpleImputer(strategy='median')
cat_imputer_final = SimpleImputer(strategy='most_frequent')
encoder_final = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the FULL training set
num_imputer_final.fit(X[numerical_cols])
cat_imputer_final.fit(X[categorical_cols].astype(str))
cat_imputed_full = cat_imputer_final.transform(X[categorical_cols].astype(str))
encoder_final.fit(cat_imputed_full)

print("Final preprocessing objects fitted on full training data")

Final preprocessing objects fitted on full training data


In [20]:
# Transform the full training set
X_full = X.copy()
X_full[numerical_cols] = num_imputer_final.transform(X[numerical_cols])
cat_imputed = cat_imputer_final.transform(X[categorical_cols].astype(str))
X_full[categorical_cols] = encoder_final.transform(cat_imputed)
X_full = X_full[categorical_cols + numerical_cols]

print(f"Full training set processed: {X_full.shape}")

Full training set processed: (8693, 11)


In [21]:
# Transform the test set
test_processed = test.copy()
test_processed[numerical_cols] = num_imputer_final.transform(test[numerical_cols])
cat_imputed_test = cat_imputer_final.transform(test[categorical_cols].astype(str))
test_processed[categorical_cols] = encoder_final.transform(cat_imputed_test)
test_processed = test_processed[categorical_cols + numerical_cols]

print(f"Test set processed: {test_processed.shape}")
print(f"Missing values in test: {test_processed.isnull().sum().sum()}")

Test set processed: (4277, 11)
Missing values in test: 0


In [22]:
# Train final model on full training data
final_model = RandomForestClassifier(
    n_estimators=100,
    random_state=RANDOM_STATE
)
final_model.fit(X_full, y)

# Make predictions on test set
test_predictions = final_model.predict(test_processed)

print(f"Number of predictions: {len(test_predictions)}")
print(f"Prediction distribution: {pd.Series(test_predictions).value_counts().to_dict()}")

Number of predictions: 4277
Prediction distribution: {True: 2230, False: 2047}


In [23]:
# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': test_predictions
})

# Verify format
print("Submission preview:")
submission.head()

Submission preview:


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


In [24]:
# Save submission
submission.to_csv('./kaggle/input/spaceship-titanic/submission_baseline.csv', index=False)
print("Submission saved to 'submission_baseline.csv'")
print(f"\nSubmission shape: {submission.shape}")

Submission saved to 'submission_baseline.csv'

Submission shape: (4277, 2)
