In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Read the labelled training data and preview the first five rows.
df = pd.read_csv('../Data/train.csv')
df.head(n=5)

In [None]:
def engineer_features(df):
    """Return a copy of ``df`` with cabin, group, spending and age features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId`` (split on ``'_'`` into group / member
        number), ``Cabin`` (split on ``'/'`` into deck / number / side),
        ``Name``, ``Age`` and the five spending columns ``RoomService``,
        ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the added columns
        ``Deck``, ``Side``, ``CabinNumber``, ``Group``, ``NumberInGroup``,
        ``TotalSpending``, ``HasSpending``, ``SpendingRatio``, ``AgeGroup``
        and ``GroupSize``; ``Cabin`` and ``Name`` are removed.

    Notes
    -----
    * A missing ``Cabin`` yields NaN for ``Deck``/``Side`` and ``0`` for
      ``CabinNumber``.
    * ``GroupSize`` counts the rows sharing a ``Group`` *within the frame
      passed in*, so it is computed separately for each call.
    * Ages outside ``[0, 100]`` (and missing ages) get a NaN ``AgeGroup``.
    """
    df = df.copy()

    # Split the cabin string once and reuse the pieces.
    cabin_parts = df['Cabin'].str.split('/')
    df['Deck'] = cabin_parts.str[0]
    df['Side'] = cabin_parts.str[2]
    df['CabinNumber'] = cabin_parts.str[1].fillna('0').astype(int)

    # Travel-group information encoded in the passenger id.
    id_parts = df['PassengerId'].str.split('_')
    df['Group'] = id_parts.str[0]
    df['NumberInGroup'] = id_parts.str[1].astype(int)

    # Spending features; sum(axis=1) skips NaN, so a missing amount counts as 0.
    spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpending'] = df[spending_features].sum(axis=1)
    df['HasSpending'] = (df['TotalSpending'] > 0).astype(int)
    df['SpendingRatio'] = df['RoomService'] / (df['TotalSpending'] + 1)  # +1 avoids division by zero

    # Age groups. pd.cut intervals are right-closed, so without
    # include_lowest=True an age of exactly 0 would fall outside the first
    # bin and end up as NaN instead of 'Child'.
    df['AgeGroup'] = pd.cut(
        df['Age'],
        bins=[0, 12, 18, 30, 50, 100],
        labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'],
        include_lowest=True,
    )

    # Group size as a simplified stand-in for family size.
    group_sizes = df['Group'].value_counts()
    df['GroupSize'] = df['Group'].map(group_sizes)

    # Drop the raw columns that have been replaced by engineered features.
    return df.drop(columns=['Cabin', 'Name'])

# Build the engineered training frame. engineer_features works on a copy, so
# the raw `df` stays untouched; the same function is applied to the test set
# further down.
train_df = engineer_features(df)


In [None]:
# Encode the boolean target as 0/1 and remove the raw 'Transported' column
# from the *engineered* frame so it cannot end up among the features.
# The target is read from the untouched raw `df` (same index as train_df),
# which keeps this cell idempotent: re-running it gives the same train_df.
train_df = (
    train_df
    .drop(columns=['Transported'], errors='ignore')
    .assign(Transported_Updated=df['Transported'].astype(int))
)

# Class balance of the encoded target (last expression, so it is displayed).
train_df['Transported_Updated'].value_counts()

In [None]:
# Identifier-like columns: (nearly) unique per passenger, so one-hot encoding
# them only lets the model memorise training rows and bloats the feature matrix.
id_columns = ['PassengerId', 'Group']

# Separate features and target. The raw 'Transported' column is excluded
# defensively (if still present) so the label can never leak into X.
X = train_df.drop(columns=['Transported_Updated', 'Transported'], errors='ignore')
Y = train_df['Transported_Updated']

# Select by dtype *family* rather than exact width: the previous
# ['float64', 'int32'] filter silently dropped int64 columns (e.g. GroupSize,
# and the astype(int) features wherever int is 64-bit).
numerical_features = X.select_dtypes(include='number')

# 'category' is needed to pick up AgeGroup, which pd.cut returns as categorical.
categorical_features = (
    X.select_dtypes(include=['object', 'category'])
    .drop(columns=id_columns, errors='ignore')
)


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Numeric columns: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored
# rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column set (selected by name) to its transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features.columns),
    ('cat', categorical_transformer, categorical_features.columns),
])


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=23,
)

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb


# Full modelling pipeline: preprocessing followed by a gradient-boosted
# classifier. The step names ('preprocessor', 'model') are what the
# 'model__...' keys in the hyper-parameter grid refer to.
#
# `use_label_encoder` is intentionally not passed: it is deprecated and
# ignored by current xgboost releases, where it only triggers an
# "unused parameter" warning on every fit. The target is already 0/1.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8
    ))
])

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. The 'model__' prefix targets the 'model' step
# of `pipe`; 3 * 3 * 2 * 2 = 36 combinations are evaluated.
param_grid = dict(
    model__max_depth=[3, 4, 5],
    model__learning_rate=[0.05, 0.1, 0.2],
    model__subsample=[0.8, 0.9],
    model__colsample_bytree=[0.8, 0.9],
)

# Exhaustive search with 3-fold cross-validation, scored on accuracy and
# run on all available cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, Y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


In [None]:
# The search was already fitted in the previous cell; fitting it again would
# repeat every cross-validation fit without adding information. Inspect the
# refitted winning pipeline instead.
grid_search.best_estimator_

In [None]:
# Accuracy on the rows the search was fitted on (optimistic; compare with the hold-out score).
Y_train_pred = grid_search.predict(X_train)
accuracy_score(y_true=Y_train, y_pred=Y_train_pred)

In [None]:
# Accuracy on the 20% hold-out split that the search never saw.
Y_test_pred = grid_search.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_test_pred)

In [None]:
# Load the unlabelled test set and apply the same feature engineering as for training.
test_df = engineer_features(pd.read_csv('../Data/test.csv'))


In [None]:
# Predict labels for the engineered test set with the fitted grid search.
test_df_pred = grid_search.predict(test_df)

In [None]:
# Preview the first few predictions as the cell's output instead of printing the whole array.
test_df_pred[:10]

In [None]:
# Convert each predicted label to a Python bool for the output file.
test_df_pred_bool = [bool(label) for label in test_df_pred]

In [None]:
# Columns of the output file: passenger id plus the boolean prediction.
answers = dict(
    PassengerId=test_df['PassengerId'],
    Transported=test_df_pred_bool,
)

In [58]:
# Assemble the two answer columns into a DataFrame.
answers_df = pd.DataFrame(answers)

In [None]:
# Sanity-check the first rows before writing the file.
answers_df.head()

In [None]:
from pathlib import Path

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
submission_path = Path('../Answer/answers.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
answers_df.to_csv(submission_path, index=False)

In [None]:
import joblib

# Persist the fitted grid-search object to disk in the working directory.
joblib.dump(value=grid_search, filename="model.joblib")

# To reuse the model later (only load files you created yourself:
# joblib.load unpickles data and can execute arbitrary code):
# loaded_model = joblib.load("model.joblib")

# y_pred = loaded_model.predict(test_df)
