In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


In [None]:
# Read the raw train and test tables from the Colab working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)


In [None]:
# === Basic Cleaning ===
# Imputation statistics are learned on the training set only and then applied
# to BOTH frames, so train and test are filled with identical values and no
# test-set information is used.
#
# PassengerId, Name and Cabin are deliberately left in place: the
# feature_engineering() step derives Group/GroupSize/IsSolo, NameLength and
# Deck/CabinNum/Side from them (and drops them afterwards), and the submission
# needs test_df['PassengerId'].

# Passenger IDs, kept under their previous names for any later use.
train_ids = train_df['PassengerId'].copy()
test_ids = test_df['PassengerId'].copy()

# 1. Fill numeric columns with the training median
raw_num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
num_fill_values = train_df[raw_num_cols].median()
train_df[raw_num_cols] = train_df[raw_num_cols].fillna(num_fill_values)
test_df[raw_num_cols] = test_df[raw_num_cols].fillna(num_fill_values)

# 2. Fill categorical columns with the training mode
# 'Cabin' is intentionally NOT mode-filled: it is a near-unique identifier, so
# the mode is meaningless, and filling it would make the CabinNumMissing flag
# built in feature_engineering() constant. Missing Deck/Side/CabinNum values
# are handled by the imputers in the modelling pipeline instead.
raw_cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for col in raw_cat_cols:
    if col not in train_df.columns:
        continue
    col_modes = train_df[col].mode()
    if col_modes.empty:
        # Column is entirely missing in train; there is nothing to impute with.
        continue
    fill_value = col_modes.iloc[0]
    # infer_objects() restores a concrete dtype (e.g. bool) once the NaNs are gone.
    train_df[col] = train_df[col].fillna(fill_value).infer_objects(copy=False)
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_value).infer_objects(copy=False)


In [None]:
# Winsorizing helper: clamp each listed column to its own quantile range.
def cap_outliers(df, cols, lower_quantile=0.01, upper_quantile=0.99):
    """Return a copy of ``df`` with every column in ``cols`` clipped.

    For each column the lower/upper bounds are that column's own
    ``lower_quantile`` / ``upper_quantile`` values. The input frame is not
    modified; columns not listed in ``cols`` are returned unchanged.
    """
    capped = df.copy()
    for name in cols:
        series = capped[name]
        # Both bounds are computed before the column is overwritten.
        low, high = series.quantile([lower_quantile, upper_quantile])
        capped[name] = series.clip(low, high)
    return capped

In [None]:
# Feature engineering function
def feature_engineering(df):
    """Derive model features from a raw passenger frame and return a new frame.

    Reads the columns PassengerId, Cabin, Name, Age and the five spending
    columns. Adds group, cabin, spending, age and name-length features, then
    drops PassengerId, Name, Group and Cabin. The input frame is not modified.
    """
    out = df.copy()

    # --- Travel group: the part of PassengerId before the underscore ---
    out['Group'] = out['PassengerId'].str.split('_').str.get(0)
    out['GroupSize'] = out.groupby('Group')['PassengerId'].transform('count')
    out['IsSolo'] = out['GroupSize'].eq(1).astype(int)

    # --- Cabin: "<deck>/<number>/<side>" ---
    cabin_parts = out['Cabin'].str.split('/', expand=True)
    out['Deck'] = cabin_parts[0]
    out['CabinNum'] = pd.to_numeric(cabin_parts[1], errors='coerce')
    out['Side'] = cabin_parts[2]

    # --- Spending totals ---
    amenity_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    amenity_spend = out[amenity_cols]
    out['TotalSpend'] = amenity_spend.sum(axis=1)
    out['SpendPerGroupMember'] = out['TotalSpend'].div(out['GroupSize'])

    # 1 when the passenger spent nothing at all (missing amounts count as 0)
    out['NoSpend'] = amenity_spend.fillna(0).sum(axis=1).eq(0).astype(int)

    # --- Age buckets and flags ---
    age_edges = [-1, 12, 18, 25, 40, 60, 120]
    age_names = ['child', 'teen', 'student', 'young_adult', 'adult', 'senior']
    out['AgeGroup'] = pd.cut(out['Age'], bins=age_edges, labels=age_names)
    out['IsMinor'] = out['Age'].lt(18).astype(int)
    out['IsSenior'] = out['Age'].gt(60).astype(int)

    # --- Name length (missing names count as length 0) ---
    out['NameLength'] = out['Name'].fillna('').apply(len)

    # --- Luxury deck flag ---
    premium_decks = ['A', 'B', 'T']
    out['IsLuxuryDeck'] = out['Deck'].isin(premium_decks).astype(int)

    # --- Age x spend interaction ---
    out['AgeSpend'] = out['Age'].mul(out['TotalSpend'])

    # --- Flag rows whose cabin number could not be parsed ---
    out['CabinNumMissing'] = out['CabinNum'].isna().astype(int)

    # Identifier / helper columns are no longer needed once features exist
    return out.drop(['PassengerId', 'Name', 'Group', 'Cabin'], axis=1)

# Apply feature engineering
train_fe = feature_engineering(train_df)
test_fe = feature_engineering(test_df)

# List of numeric columns to cap outliers on
num_outlier_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                    'TotalSpend', 'SpendPerGroupMember', 'AgeSpend', 'CabinNum', 'NameLength']

# Winsorizing quantiles, shared by the train and test capping below
CAP_LOWER_Q, CAP_UPPER_Q = 0.01, 0.99

# Learn the clipping bounds on the (still uncapped) training data only ...
cap_bounds = {
    col: (train_fe[col].quantile(CAP_LOWER_Q), train_fe[col].quantile(CAP_UPPER_Q))
    for col in num_outlier_cols
}

# ... cap the training set with them (cap_outliers derives the same bounds) ...
train_fe = cap_outliers(
    train_fe, num_outlier_cols,
    lower_quantile=CAP_LOWER_Q, upper_quantile=CAP_UPPER_Q,
)

# ... and apply the SAME training bounds to the test set, so both sets go
# through identical preprocessing without using any test-set statistics.
for col, (lower, upper) in cap_bounds.items():
    test_fe[col] = test_fe[col].clip(lower, upper)

In [None]:
# Split the engineered training frame into the feature matrix X and the
# 0/1 target vector y taken from the 'Transported' column.
X = train_fe.drop('Transported', axis=1)
y = train_fe.loc[:, 'Transported'].astype(int)



In [None]:
# Columns routed to the numeric branch: raw measurements, then engineered ones.
# (Order is kept stable because it fixes the column order of the transformed matrix.)
num_cols = (
    ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    + ['TotalSpend', 'SpendPerGroupMember', 'NameLength', 'AgeSpend', 'CabinNum']
)

# Columns routed to the categorical branch: raw categories, cabin parts, age
# bucket, then the 0/1 indicator flags.
cat_cols = (
    ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'AgeGroup']
    + ['IsSolo', 'IsMinor', 'IsSenior', 'IsLuxuryDeck', 'NoSpend', 'CabinNumMissing']
)

# Numeric branch: median imputation, then RobustScaler.
numeric_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fit.
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Combine both branches; columns not listed above are dropped (the default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols),
])


In [None]:
# Define the model
# - `use_label_encoder` is no longer passed: the installed xgboost ignores it
#   and only emits a "Parameters ... are not used" warning.
# - n_jobs=1 on the classifier: parallelism is provided by RandomizedSearchCV
#   (n_jobs=-1) below; giving both all cores oversubscribes the CPU.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=1
)

# Full pipeline: preprocessing + classifier, so every CV fold fits the
# imputers/scaler/encoder on its own training part only.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

# Hyperparameter candidates for Randomized Search
param_dist = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.1],
    'classifier__subsample': [0.6, 0.7, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'classifier__min_child_weight': [1, 3, 5]
}

# Stratified K-Fold CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search over 20 sampled configurations (no early stopping)
search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
    verbose=2,
    random_state=42,
    refit=True
)

# Run hyperparameter tuning
search.fit(X, y)

print(f"Best CV accuracy: {search.best_score_:.5f}")
print("Best hyperparameters:", search.best_params_)

# Best configuration, refit on the full training data (refit=True)
best_model = search.best_estimator_

# Predict on test data
test_preds = best_model.predict(test_fe)

# One prediction per test passenger, in the same row order as test_df
assert len(test_preds) == len(test_df), "prediction count does not match test rows"

# Prepare submission file
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': test_preds.astype(bool)
})

submission.to_csv('/content/submission.csv', index=False)
print("Submission saved to /content/submission.csv")


Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best CV accuracy: 0.80950
Best hyperparameters: {'classifier__subsample': 1.0, 'classifier__n_estimators': 100, 'classifier__min_child_weight': 3, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.1, 'classifier__colsample_bytree': 0.7}
Submission saved to /content/submission.csv


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
# train_test_split and ColumnTransformer come from the import cell at the top.

# Seed the Python, NumPy and TensorFlow RNGs so weight init, dropout and
# batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Split the raw engineered features FIRST, then fit the preprocessing on the
# training part only, so no validation information leaks into the imputers,
# scaler or encoder. (The previous version referenced an undefined `X_scaled`.)
X_train_raw, X_val_raw, y_train_dl, y_val_dl = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# A separate ColumnTransformer with the same branches as the XGBoost
# pipeline's preprocessor; fitting it leaves that pipeline untouched.
dl_preprocessor = ColumnTransformer([
    ('num', numeric_pipe, num_cols),
    ('cat', categorical_pipe, cat_cols)
])
X_train_dl = dl_preprocessor.fit_transform(X_train_raw).astype('float32')
X_val_dl = dl_preprocessor.transform(X_val_raw).astype('float32')
y_train_dl = y_train_dl.to_numpy(dtype='float32')
y_val_dl = y_val_dl.to_numpy(dtype='float32')

# Build a simple MLP model
model = models.Sequential([
    layers.Input(shape=(X_train_dl.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train model with early stopping: stop after 5 epochs without val_loss
# improvement and roll back to the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)

history = model.fit(
    X_train_dl, y_train_dl,
    validation_data=(X_val_dl, y_val_dl),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=2
)

# Evaluate on validation set
val_loss, val_acc = model.evaluate(X_val_dl, y_val_dl, verbose=0)
print(f"\nValidation Accuracy: {val_acc:.4f}")


