<a href="https://www.kaggle.com/code/vanpatangan/spaceship-classifier-lgbm?scriptVersionId=197192156" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Libraries

In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")

# Load data

In [None]:
# Read the train and test CSVs from the Kaggle input directory into DataFrames.
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Data Summary

In [None]:
def check(df):
    """
    Generates a concise per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields:
        ``column`` (name), ``dtype``, ``instances`` (non-null count),
        ``unique`` (distinct non-null values), ``sum_null`` (null count),
        ``percent_null`` (null share in percent, rounded to 2 decimals) and
        ``duplicates`` (number of fully duplicated rows in ``df``; the same
        value is repeated on every row of the summary).
    """
    # Frame-level quantities do not depend on the column, so compute them once
    # instead of once per column (the duplicate scan touches every row).
    total_rows = len(df)
    duplicate_rows = df.duplicated().sum()

    summary = []
    for col in df.columns:
        num_nulls = df[col].isnull().sum()
        # An empty frame has no nulls; avoid a 0/0 division producing NaN.
        if total_rows:
            percent_nulls = round((num_nulls / total_rows) * 100, 2)
        else:
            percent_nulls = 0.0

        summary.append([
            col,                    # Column name
            df[col].dtype,          # Data type of the column
            df[col].count(),        # Number of non-null values
            df[col].nunique(),      # Number of unique values
            num_nulls,              # Number of null values
            percent_nulls,          # Percentage of null values
            duplicate_rows,         # Number of duplicate rows in the DataFrame
        ])

    df_check = pd.DataFrame(
        summary,
        columns=["column", "dtype", "instances", "unique", "sum_null", "percent_null", "duplicates"]
    )

    return df_check


In [None]:
# Per-column summary (dtype, counts, nulls, duplicate rows) of the training frame.
check(train_df)

In [None]:
# Per-column summary (dtype, counts, nulls, duplicate rows) of the test frame.
check(test_df)

*💡 At the feature level, both dataframes have a low proportion of null values*

# EDA

**Visualize Distribution**

In [None]:
# Stacked histogram of Age (20 bins, KDE overlay), coloured by the Transported flag.
color = sns.color_palette("Set2")
plt.figure(figsize=(12, 4.5))
plot = sns.histplot(
    data=train_df,
    x="Age",
    hue="Transported",
    multiple="stack",
    kde=True,
    bins=20,
    label="Age",
    palette=color,
)
plot.set_title("Age Distribution By Transported")

*💡 It seems children (ages 0–18) are more likely to be transported*

**Numerical Columns**

In [None]:
# Spending columns to plot, one histogram per grid slot
numerical_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# 3x2 grid; walking axes.flat fills it row by row, matching the column order above
fig, axes = plt.subplots(3, 2, figsize=(10, 8))
for ax, var in zip(axes.flat, numerical_columns):
    sns.histplot(train_df[var], ax=ax, color='purple', bins=20)
    ax.set_title(f'Distribution of {var}')

# Five variables in six slots: drop the unused bottom-right subplot
fig.delaxes(axes[2, 1])

plt.tight_layout()
plt.show()

*💡 Numerical columns are highly skewed*

**Categorical Columns**

In [None]:
# One row per categorical column: count plot on the left, pie chart on the right
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
colors = sns.color_palette("Set2")

fig, axes = plt.subplots(len(categorical_columns), 2, figsize=(16, 12))

for col, (count_ax, pie_ax) in zip(categorical_columns, axes):
    # Left panel: bar chart of category counts
    sns.countplot(x=col, data=train_df, ax=count_ax, palette=colors)
    count_ax.set_title(f'Count Plot of {col}')
    count_ax.set_ylabel('Count')
    count_ax.set_xlabel(col)

    # Right panel: category shares, with missing values kept as their own slice
    value_counts = train_df[col].value_counts(dropna=False)
    pie_ax.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', colors=colors, startangle=140)
    pie_ax.set_title(f'Pie Chart of {col}')

plt.tight_layout()
plt.show()

**Correlation Matrix**

In [None]:
# Label-encode every object column on a copy of the training frame so the whole
# table can go through DataFrame.corr(). LabelEncoder is imported in the
# notebook's import cell rather than here.
# NOTE(review): the integer codes are arbitrary labels, so coefficients involving
# the encoded columns are only a rough indication.
obj_col = train_df.select_dtypes(include='object').columns.tolist()

corr_new_df = train_df.copy()

# One encoder per column, kept in a dict so the fitted mappings stay available
le = {col: LabelEncoder() for col in obj_col}
for col in obj_col:
    corr_new_df[col] = le[col].fit_transform(corr_new_df[col])

corr = corr_new_df.corr()

# Boolean mask that hides the upper triangle and the diagonal of the matrix
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(13, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu', mask=mask)
plt.show()

# Feature Engineering 

In [None]:
# Create new features on the training frame

# Cabin is split on '/' into three parts
train_df[['Deck', 'CabinNum', 'Side']] = train_df['Cabin'].str.split('/', expand=True)

# PassengerId has the form "gggg_pp": the prefix is the travel group and the
# suffix is the passenger's number within it (per the competition's data
# description). The group size is therefore how many passengers share the
# prefix, not the suffix itself.
# NOTE(review): counted within this frame only; assumes a group's members are
# not spread across the train and test files.
train_group_id = train_df['PassengerId'].str.split('_').str[0]
train_df['GroupSize'] = train_group_id.map(train_group_id.value_counts())

# Row-wise sum of the spending columns (missing amounts are skipped by sum)
train_df['TotalSpending'] = train_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

# CryoSleep / VIP load from the CSV as True/False/NaN objects, so a direct
# comparison with the string 'True' never matches. Going through str makes the
# test work for boolean and string encodings alike; NaN becomes 'nan' -> False.
train_df['CryoSleepAndZeroExpenditure'] = (
    train_df['CryoSleep'].astype(str).eq('True') & (train_df['TotalSpending'] == 0)
).astype(int)
train_df['IsVIP'] = train_df['VIP'].astype(str).eq('True')

train_df['TravelAlone'] = train_df['GroupSize'] == 1

# First/last whitespace-separated token of the name, plus its character length
train_df['FirstName'] = train_df['Name'].str.split().str[0]
train_df['LastName'] = train_df['Name'].str.split().str[-1]
train_df['NameLength'] = train_df['Name'].str.len()

In [None]:
# Use seaborn's whitegrid style for the following figures
sns.set(style="whitegrid")

# One count plot per cabin-derived column, each in its own figure
for column, title in (
    ('Deck', 'Distribution of Decks'),
    ('Side', 'Distribution of Sides'),
):
    plt.figure(figsize=(10, 6))
    sns.countplot(x=column, data=train_df, palette='Set2')
    plt.title(title)
    plt.show()

In [None]:
# Histogram of the GroupSize feature (20 bins, no KDE)
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=train_df, x='GroupSize', bins=20, kde=False, color='coral')
ax.set_title('Group Size Distribution')
plt.show()

In [None]:
# Histogram of the TotalSpending feature (30 bins, no KDE)
plt.figure(figsize=(10, 6))
ax = sns.histplot(data=train_df, x='TotalSpending', bins=30, kde=False, color='green')
ax.set_title('Total Spending Distribution')
plt.show()

In [None]:
# Count of passengers per TravelAlone value
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=train_df, x='TravelAlone', palette='Set2')
ax.set_title('Traveling Alone')
plt.show()

# Preprocessing 

In [None]:
# Target is the Transported column; the feature frame is everything except the
# target, the raw Name and the PassengerId.
y = train_df['Transported']
X = train_df.drop(['Transported', 'Name', 'PassengerId'], axis=1)

In [None]:
# Define categorical and numerical features.
# Object columns are treated as categorical. Numerical features cover every
# numeric dtype plus booleans: selecting only 'float64' would leave out the
# integer/boolean engineered columns (e.g. GroupSize, IsVIP, TravelAlone), and
# a ColumnTransformer drops any column it is not given.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['number', 'bool']).columns

In [None]:
# Numeric branch: fill gaps with the median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode
# (categories unseen at fit time are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing -> SMOTE oversampling -> LightGBM classifier
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(verbose=0, random_state=42)),
])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Modeling

In [None]:
# Hyperparameter tuning: exhaustive search over the LightGBM settings below.
# Keys carry the "classifier__" prefix so they reach the pipeline's classifier step.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__learning_rate=[0.01, 0.05, 0.1, 0.3],
    classifier__num_leaves=[20, 50, 80, 100],
)

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validation score
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best parameters found: ", best_params)
print("Best cross-validation accuracy: {:.4f}".format(best_score))

# Score the best estimator on the held-out validation split
val_score = grid_search.score(X_val, y_val)
print("Validation accuracy: {:.4f}".format(val_score))

# New Features on the Test data

In [None]:
# Apply new features on the test data.
# Every feature is derived from test_df itself: reading columns from train_df
# here would index-align unrelated training rows onto the test passengers.

# Cabin is split on '/' into three parts
test_df[['Deck', 'CabinNum', 'Side']] = test_df['Cabin'].str.split('/', expand=True)

# PassengerId has the form "gggg_pp": the prefix is the travel group and the
# suffix is the passenger's number within it (per the competition's data
# description). The group size is therefore how many passengers share the
# prefix, not the suffix itself.
# NOTE(review): counted within this frame only; assumes a group's members are
# not spread across the train and test files.
test_group_id = test_df['PassengerId'].str.split('_').str[0]
test_df['GroupSize'] = test_group_id.map(test_group_id.value_counts())

# Row-wise sum of the spending columns (missing amounts are skipped by sum)
test_df['TotalSpending'] = test_df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

# CryoSleep / VIP load from the CSV as True/False/NaN objects, so a direct
# comparison with the string 'True' never matches. Going through str makes the
# test work for boolean and string encodings alike; NaN becomes 'nan' -> False.
test_df['CryoSleepAndZeroExpenditure'] = (
    test_df['CryoSleep'].astype(str).eq('True') & (test_df['TotalSpending'] == 0)
).astype(int)
test_df['IsVIP'] = test_df['VIP'].astype(str).eq('True')

test_df['TravelAlone'] = test_df['GroupSize'] == 1

# First/last whitespace-separated token of the name, plus its character length
test_df['FirstName'] = test_df['Name'].str.split().str[0]
test_df['LastName'] = test_df['Name'].str.split().str[-1]
test_df['NameLength'] = test_df['Name'].str.len()

# Preprocess Test Data

In [None]:
# Separate features from the test data
X_test = test_df.drop(columns=['Name', 'PassengerId'])

# Final model: the same preprocessing plus the tuned classifier settings, refit
# on the full labelled data without the SMOTE step.
# Both parts are cloned (unfitted copies with identical parameters) so that this
# refit does not overwrite the shared `preprocessor` or the classifier stored
# inside grid_search.best_estimator_, which would otherwise be left with a
# preprocessor and a classifier fitted on different data.
# NOTE(review): an imblearn Pipeline only applies samplers while fitting, so
# grid_search.predict(X_test) would also work; the explicit SMOTE-free refit is
# kept because that is what this notebook submits.
pipeline_without_smote = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', clone(grid_search.best_estimator_.named_steps['classifier']))
])

# Fit on all labelled rows (training + validation splits together)
pipeline_without_smote.fit(X, y)

# Preprocess and predict the test rows with the freshly fitted pipeline
predictions = pipeline_without_smote.predict(X_test)

# Test Predictions

In [None]:
# Pair each test PassengerId with its predicted Transported value
submission_df = test_df[['PassengerId']].assign(Transported=predictions)

# Write the submission file without the index column
submission_df.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")
print(submission_df.head())