In [1]:
import pandas as pd

In [2]:
full_df = pd.read_csv('../dataset/train.csv')
df = full_df.copy()

In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2)

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def update_train_df(dataframe):
    imputers = {
        'name_imputer': SimpleImputer(strategy='constant', fill_value='Unknown Unknown'),
        'num_imputer': SimpleImputer(strategy='median'),
        'cat_imputer': SimpleImputer(strategy='constant', fill_value='Unknown'),
    }

    num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
    
    df = dataframe.copy()

    # Impute Part
    df.loc[:, 'Name'] = imputers['name_imputer'].fit_transform(df.loc[:, 'Name'].to_frame())
    df.loc[:, num_col] = imputers['num_imputer'].fit_transform(df.loc[:, num_col])
    df.loc[:, cat_cols] = imputers['cat_imputer'].fit_transform(df.loc[:, cat_cols])

    # Feature engineering Part
    df.loc[:, 'FirstName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[0])
    df.loc[:, 'LastName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[1])

    df.loc[:, 'PassengerGGGG'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df.loc[:, 'PassengerPP'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[1]))
    
    df.loc[:, 'NumberOfFellows'] = df.groupby('PassengerGGGG')['PassengerGGGG'].transform('count')
    df.loc[:, 'IsAlone'] = df.loc[:, 'NumberOfFellows'].apply(lambda x: 1 if x == 1 else 0)

    df['cabin_deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if x != 'Unknown' else 'U')
    df['cabin_num'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if x != 'Unknown' else -1)
    df['cabin_side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if x != 'Unknown' else 'U')

    df.loc[:, 'TotalSpend'] = df.loc[:, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df['IsSpendingZero'] = (df['TotalSpend'] == 0).astype(int)

    df.loc[:, 'n_family_member_in_group'] = df.groupby(['PassengerGGGG', 'LastName'])['LastName'].transform('count')
        
    # Removing unnecessary columns
    df.drop(['Cabin', 'Name', 'PassengerId'], inplace=True, axis=1)
    df.drop(['FirstName', 'LastName', 'PassengerGGGG'], inplace=True, axis=1)

    # Encoding 
    cat_data = df.select_dtypes('object')
    cat_data.loc[:, ['CryoSleep', 'VIP']] = cat_data.loc[:, ['CryoSleep', 'VIP']].astype('str')
    
    one_hot_encoder = OneHotEncoder(drop='first', dtype=int)
    
    cat_data_encoded = one_hot_encoder.fit_transform(cat_data)
    encoded_feature_names = one_hot_encoder.get_feature_names_out(cat_data.columns)
    
    cat_encoded = pd.DataFrame(cat_data_encoded.toarray(), columns = encoded_feature_names, index=cat_data.index)
    
    non_cat = df.drop(cat_data.columns, axis=1)
    df = pd.concat([non_cat, cat_encoded], axis=1)
    
    df.loc[:, 'Transported'] = df.loc[:, 'Transported'].astype(int)
    
    return df, imputers, one_hot_encoder

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def update_test_df(dataframe, imputers, one_hot_encoder):
    num_col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    cat_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
    
    df = dataframe.copy()

    # Impute Part
    df.loc[:, 'Name'] = imputers['name_imputer'].transform(df.loc[:, 'Name'].to_frame())
    df.loc[:, num_col] = imputers['num_imputer'].transform(df.loc[:, num_col])
    df.loc[:, cat_cols] = imputers['cat_imputer'].transform(df.loc[:, cat_cols])

    # Feature engineering Part
    df.loc[:, 'FirstName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[0])
    df.loc[:, 'LastName'] = df.loc[:, 'Name'].apply(lambda x: x.split()[1])

    df.loc[:, 'PassengerGGGG'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[0]))
    df.loc[:, 'PassengerPP'] = df.loc[:, 'PassengerId'].apply(lambda x: int(x.split('_')[1]))
    
    df.loc[:, 'NumberOfFellows'] = df.groupby('PassengerGGGG')['PassengerGGGG'].transform('count')
    df.loc[:, 'IsAlone'] = df.loc[:, 'NumberOfFellows'].apply(lambda x: 1 if x == 1 else 0)

    df['cabin_deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if x != 'Unknown' else 'U')
    df['cabin_num'] = df['Cabin'].apply(lambda x: int(x.split('/')[1]) if x != 'Unknown' else -1)
    df['cabin_side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if x != 'Unknown' else 'U')

    df.loc[:, 'TotalSpend'] = df.loc[:, ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df['IsSpendingZero'] = (df['TotalSpend'] == 0).astype(int)

    df.loc[:, 'n_family_member_in_group'] = df.groupby(['PassengerGGGG', 'LastName'])['LastName'].transform('count')
    
    # Removing unnecessary columns
    df.drop(['Cabin', 'Name', 'PassengerId'], inplace=True, axis=1)
    df.drop(['FirstName', 'LastName', 'PassengerGGGG'], inplace=True, axis=1)

    # Encoding
    cat_data = df.select_dtypes('object')
    cat_data.loc[:, ['CryoSleep', 'VIP']] = cat_data.loc[:, ['CryoSleep', 'VIP']].astype('str')
        
    cat_data_encoded = one_hot_encoder.transform(cat_data)
    encoded_feature_names = one_hot_encoder.get_feature_names_out(cat_data.columns)
    
    cat_encoded = pd.DataFrame(cat_data_encoded.toarray(), columns = encoded_feature_names, index=cat_data.index)
    
    non_cat = df.drop(cat_data.columns, axis=1)
    df = pd.concat([non_cat, cat_encoded], axis=1)
    
    return df

In [6]:
df_train_updated, train_imputers, train_ohe = update_train_df(df_train) 

df_test_updated = update_test_df(df_test, train_imputers, train_ohe)

X_train = df_train_updated.drop('Transported', axis=1)
y_train = df_train_updated.loc[:, 'Transported']

X_test = df_test_updated.drop('Transported', axis=1)
y_test = df_test_updated.loc[:, 'Transported']

  df.loc[:, 'Transported'] = df.loc[:, 'Transported'].astype(int)


In [7]:
from sklearn.ensemble import (
    VotingClassifier, StackingClassifier,
    RandomForestClassifier, BaggingClassifier,
    ExtraTreesClassifier, AdaBoostClassifier
)
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [26]:
xgb_model = XGBClassifier(
    n_estimators=690,
    max_depth=11,
    learning_rate=0.027599754249374484,
    subsample=0.9116649086760387,
    colsample_bytree=0.9721498601228151,
    gamma=1.2251505971550578,
    reg_alpha=0.9135594878672821,
    reg_lambda=0.23339826491201715,
    eval_metric='logloss',
    random_state=42
)

rf_model = RandomForestClassifier(
    n_estimators=813,
    max_depth=19,
    min_samples_split=16,
    min_samples_leaf=1,
    max_features='log2',
    bootstrap=True,
    random_state=42
)

bag_model = BaggingClassifier(
    n_estimators=88,
    max_samples=0.6205453147546152,
    max_features=0.7264387059650675,
    bootstrap=True,
    random_state=42
)

et_model = ExtraTreesClassifier(
    n_estimators=569,
    max_depth=39,
    min_samples_split=3,
    min_samples_leaf=13,
    max_features=None,
    random_state=42
)

ada_model = AdaBoostClassifier(
    n_estimators=387,
    learning_rate=0.26922125547729353,
    random_state=42
)

lgbm_model = LGBMClassifier(
    num_leaves=23,
    max_depth=12,
    learning_rate=0.04750121729656987,
    n_estimators=222,
    subsample=0.8052084722704039,
    colsample_bytree=0.9802854475796334,
    reg_alpha=0.45245445482085095,
    reg_lambda=0.12909159687822422,
    min_child_samples=6,
    random_state=42
)

svc_model = SVC(
    C=6.5326956537464715,
    kernel='poly',
    degree=2,
    gamma='auto',
    coef0=0.629725471561754,
    probability=True,
    random_state=42
)

calibrated_linsvc = CalibratedClassifierCV(
    estimator=LinearSVC(
        C=0.9055783406210581,
        loss='squared_hinge',
        random_state=42,
        max_iter=5000
    ),
    method='sigmoid',
    cv=5
)

nu_svc_model = NuSVC(
    kernel='rbf',
    nu=0.44372674202060064,
    gamma='auto',
    coef0=0.24019318229491016,
    probability=True,
    random_state=42
)

calibrated_sgd = CalibratedClassifierCV(
    estimator=SGDClassifier(random_state=42),
    method='isotonic',
    cv=9
)

logreg_model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=0.006499032640858296,
    random_state=42
)

lda_model = LinearDiscriminantAnalysis(
    solver='eigen',
    shrinkage='auto'
)

In [27]:
voting_soft = VotingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('bag', bag_model),
        ('et', et_model),
        ('ada', ada_model),
        ('lgbm', lgbm_model),
        ('svc', make_pipeline(StandardScaler(), svc_model)),
        ('nusvc', make_pipeline(StandardScaler(), nu_svc_model)),
        ('linsvc', make_pipeline(StandardScaler(), calibrated_linsvc)),
        ('cal_sgd', calibrated_sgd),
        ('logreg', make_pipeline(StandardScaler(), logreg_model)),
        ('lda', make_pipeline(StandardScaler(), lda_model))
    ],
    voting='soft',
    n_jobs=-1
)

In [28]:
voting_soft.fit(X_train, y_train)
y_soft = voting_soft.predict(X_test)
print("Soft Voting Accuracy:", accuracy_score(y_test, y_soft))



[LightGBM] [Info] Number of positive: 3484, number of negative: 3470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501007 -> initscore=0.004026
[LightGBM] [Info] Start training from score 0.004026
Soft Voting Accuracy: 0.816561242093157


In [29]:
voting_hard = VotingClassifier(
    estimators=voting_soft.estimators,
    voting='hard',
    n_jobs=-1
)
voting_hard.fit(X_train, y_train)
y_hard = voting_hard.predict(X_test)
print("Hard Voting Accuracy:", accuracy_score(y_test, y_hard))



[LightGBM] [Info] Number of positive: 3484, number of negative: 3470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501007 -> initscore=0.004026
[LightGBM] [Info] Start training from score 0.004026
Hard Voting Accuracy: 0.8102357676825762


In [30]:
stacking = StackingClassifier(
    estimators=voting_soft.estimators,
    final_estimator=make_pipeline(StandardScaler(),
                                  LogisticRegression(random_state=42, max_iter=1000)),
    passthrough=True,
    n_jobs=-1
)
stacking.fit(X_train, y_train)
y_stack = stacking.predict(X_test)
print("Stacking Accuracy:", accuracy_score(y_test, y_stack))



[LightGBM] [Info] Number of positive: 3484, number of negative: 3470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501007 -> initscore=0.004026
[LightGBM] [Info] Start training from score 0.004026




[LightGBM] [Info] Number of positive: 2787, number of negative: 2776
[LightGBM] [Info] Number of positive: 2788, number of negative: 2776
[LightGBM] [Info] Number of positive: 2787, number of negative: 2776
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1930
[LightGBM] [Info] Number of positive: 2787, number of negative: 2776
[LightGBM] [Info] Number of data points in the train set: 5563, number of used features: 32
[LightGBM] [Info] Number of positive: 2787, number of negative: 2776
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500989 -> initscore=0.003955
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Start training from score 0.003955
[LightGBM] [Info] Total Bins 1931
[LightGBM] [Info] Number of data points

In [32]:
train, train_imputers, train_ohe = update_train_df(full_df)

test_df = pd.read_csv('../dataset/test.csv')
test = update_test_df(test_df, train_imputers, train_ohe)

X = train.drop('Transported', axis=1)
y = train.loc[:, 'Transported']

  df.loc[:, 'Transported'] = df.loc[:, 'Transported'].astype(int)


In [33]:
voting_soft.fit(X, y)
y_soft = voting_soft.predict(test).astype(bool)



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1933
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [34]:
voting_hard.fit(X, y)
y_hard = voting_hard.predict(test).astype(bool)



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1933
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [35]:
stacking.fit(X, y)
y_stack = stacking.predict(test).astype(bool)



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001921 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1933
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495




[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `

In [36]:
test_df = pd.read_csv('../dataset/test.csv')


submission_soft = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported' : y_soft.astype(bool)
})
submission_soft.to_csv('new_ensemble_soft.csv', index=False)

In [37]:
test_df = pd.read_csv('../dataset/test.csv')


submission_hard = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported' : y_hard.astype(bool)
})
submission_hard.to_csv('new_ensemble_host.csv', index=False)

In [38]:
test_df = pd.read_csv('../dataset/test.csv')


submission_stacking = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported' : y_stack.astype(bool)
})
submission_stacking.to_csv('new_ensemble_stacking.csv', index=False)