In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the Kaggle "Spaceship Titanic" train and test CSV files.
df_train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
df_test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Number of missing values per column in the raw training frame.
df_train.isnull().sum()

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# Bar chart of the target's class counts.
df_train["Transported"].value_counts().plot(kind="bar")
# Author's observation: the True and False classes are about equally frequent.

In [None]:
# Encode the target as an integer (only the training frame has it).
df_train["Transported"] = df_train["Transported"].astype(int)

# Derive the same structural features on both frames:
#   Cabin       "Deck/Num/Side"  -> Deck, Num, Side
#   PassengerId "gggg_pp"        -> Group_ID, Person_ID
for frame in (df_train, df_test):
    frame[["Deck", "Num", "Side"]] = frame["Cabin"].str.split("/", expand=True)
    frame[["Group_ID", "Person_ID"]] = frame["PassengerId"].str.split("_", expand=True)

# PassengerId becomes the row index of both frames.
df_train = df_train.set_index("PassengerId")
df_test = df_test.set_index("PassengerId")

In [None]:
# Number of distinct values (NaN counts as one value) for the categorical features.
for feature in ["Deck", "Num", "HomePlanet", "Destination", "Group_ID"]:
    print(f"{feature}：", len(df_train[feature].unique()))

### Analyze the relationship between travel group and other features, then fill missing values

In [None]:
# For every travel group, check which passenger attributes differ between its
# members. Only fully observed rows are used, so NaN is never a distinct value.
train_analysis = df_train[["CryoSleep", "VIP" ,"Group_ID", "Deck", "Num", "HomePlanet", "Destination","Side"]].dropna()
groups = train_analysis["Group_ID"].unique()

# One vectorised pass: count distinct values of each feature inside each group.
# This replaces a Python loop that re-filtered the whole frame once per group.
distinct_per_group = train_analysis.groupby("Group_ID").nunique()
differ_counts = (distinct_per_group > 1).sum()

# Number of groups in which the feature takes more than one value.
HomePlanetDiffer = int(differ_counts["HomePlanet"])
DestinationDiffer = int(differ_counts["Destination"])
NumDiffer = int(differ_counts["Num"])
DeckDiffer = int(differ_counts["Deck"])
SideDiffer = int(differ_counts["Side"])
VIPDiffer = int(differ_counts["VIP"])
CryoSleepDiffer = int(differ_counts["CryoSleep"])

print(f"HomePlanet not same:{HomePlanetDiffer}")
print(f"Destination not same:{DestinationDiffer}")
print(f"Num not same:{NumDiffer}")
print(f"Deck not same:{DeckDiffer}")
print(f"Side not same:{SideDiffer}")
print(f"VIP not same:{VIPDiffer}")
print(f"CryoSleep not same:{CryoSleepDiffer}")

# Author's observation: HomePlanet and Side are correlated with the group
# (members mostly share them); VIP is related to the group to some extent.

In [None]:
# Fill missing HomePlanet / Side with the first non-null value seen in the
# passenger's own travel group (each frame uses only its own groups).
for frame in (df_train, df_test):
    for feature in ("HomePlanet", "Side"):
        group_value = frame.groupby("Group_ID")[feature].transform("first")
        frame[feature] = frame[feature].fillna(group_value)

### Analyze Destination and HomePlanet, then fill missing values

In [None]:
# 3x2 grid of count plots: Destination on the x-axis, one hue feature per panel.
fig, axes = plt.subplots(3, 2, figsize=(14, 14))
destination_panels = [
    ("HomePlanet", "relationship between Destination and HomePlanet"),
    ("Deck", "relationship between Destination and deck"),
    ("Side", "relationship between Destination and Side"),
    ("CryoSleep", "relationship between Destination and CryoSleep"),
    ("VIP", "relationship between Destination and VIP"),
    ("Transported", "relationship between Destination and Transported"),
]
# axes.flat walks the grid row by row, matching the panel order above.
for ax, (hue_col, title) in zip(axes.flat, destination_panels):
    sns.countplot(data=df_train, x="Destination", hue=hue_col, ax=ax)
    ax.set_title(title)

plt.tight_layout()

In [None]:
# 3x2 grid of count plots: HomePlanet on the x-axis, one hue feature per panel.
fig, axes = plt.subplots(3, 2, figsize=(14, 14))
homeplanet_panels = [
    ("Destination", "relationship between HomePlanet and Destination"),
    ("Deck", "relationship between Homeplanet and deck"),
    ("Side", "relationship between HomePlanet and Side"),
    ("CryoSleep", "relationship between HomePlanet and CryoSleep"),
    ("VIP", "relationship between HomePlanet and VIP"),
    ("Transported", "relationship between HomePlanet and transported"),
]
# axes.flat walks the grid row by row, matching the panel order above.
for ax, (hue_col, title) in zip(axes.flat, homeplanet_panels):
    sns.countplot(data=df_train, x="HomePlanet", hue=hue_col, ax=ax)
    ax.set_title(title)

plt.tight_layout()

In [None]:
# Rule-based imputation, applied identically to both frames:
#   decks A/B/C/T with unknown home planet -> Europa
#   deck G with unknown home planet        -> Earth
#   deck T with unknown destination        -> TRAPPIST-1e
#   Earth passengers with unknown VIP flag -> not VIP
europa_decks = ["C", "A", "B", "T"]
for frame in (df_train, df_test):
    on_europa_deck = frame["Deck"].isin(europa_decks)
    frame.loc[on_europa_deck & frame["HomePlanet"].isna(), "HomePlanet"] = "Europa"

    on_deck_g = frame["Deck"] == "G"
    frame.loc[on_deck_g & frame["HomePlanet"].isna(), "HomePlanet"] = "Earth"

    on_deck_t = frame["Deck"] == "T"
    frame.loc[on_deck_t & frame["Destination"].isna(), "Destination"] = "TRAPPIST-1e"

    # Evaluated after the HomePlanet fills, so newly assigned Earth rows count too.
    from_earth = frame["HomePlanet"] == "Earth"
    frame.loc[from_earth & frame["VIP"].isna(), "VIP"] = False

In [None]:
from scipy.stats import chi2_contingency

def destination_correlation(df, col):
    """Measure the association between ``Destination`` and another categorical column.

    Args:
        df: DataFrame containing ``Destination`` and ``col``.
        col: Name of the categorical column to compare against ``Destination``.

    Returns:
        ``[p_value, cramers]``: the chi-square test p-value and the Cramér's V
        statistic computed from the contingency table of the two columns.
    """
    observed = pd.crosstab(df["Destination"], df[col])
    stat, p_value, *_ = chi2_contingency(observed)

    total = observed.sum().sum()
    n_rows, n_cols = observed.shape
    # Cramér's V = sqrt((chi2 / n) / min(rows - 1, cols - 1))
    cramers = np.sqrt((stat / total) / min(n_rows - 1, n_cols - 1))

    return [p_value, cramers]

# Complete-case copies of both frames; contingency tables need fully observed rows.
df_train_dropna = df_train.dropna()
df_test_dropna = df_test.dropna()

columns = ["HomePlanet", "Deck", "CryoSleep"]

# Report p-value and Cramér's V of Destination vs. each column, train then test.
for col in columns:
    for tag, frame in (("train", df_train_dropna), ("test", df_test_dropna)):
        p_value, cramers = destination_correlation(frame, col)
        print(f"Destination&{col}_{tag}: p-value is {p_value},cramers is {cramers}")

In [None]:
def homeplanet_correlation(df, col):
    """Measure the association between ``HomePlanet`` and another categorical column.

    Args:
        df: DataFrame containing ``HomePlanet`` and ``col``.
        col: Name of the categorical column to compare against ``HomePlanet``.

    Returns:
        ``[p_value, cramers]``: the chi-square test p-value and the Cramér's V
        statistic computed from the contingency table of the two columns.
    """
    observed = pd.crosstab(df["HomePlanet"], df[col])
    stat, p_value, *_ = chi2_contingency(observed)

    total = observed.sum().sum()
    n_rows, n_cols = observed.shape
    # Cramér's V = sqrt((chi2 / n) / min(rows - 1, cols - 1))
    cramers = np.sqrt((stat / total) / min(n_rows - 1, n_cols - 1))

    return [p_value, cramers]

columns = ["Destination", "Deck", "CryoSleep"]

# Report p-value and Cramér's V of HomePlanet vs. each column, train then test.
for col in columns:
    for tag, frame in (("train", df_train_dropna), ("test", df_test_dropna)):
        p_value, cramers = homeplanet_correlation(frame, col)
        print(f"HomePlanet&{col}_{tag}: p-value is {p_value},cramers is {cramers}")

In [None]:
np.random.seed(42)

def fillna_destination(df, home):
    """Fill missing ``Destination`` values for the passengers of one home planet.

    Missing destinations are drawn at random (global NumPy RNG) from the
    destination distribution observed for ``home``. If that planet has exactly
    one observed destination, it is assigned directly. If it has none, the
    distribution over the whole frame is sampled instead.

    Args:
        df: DataFrame with ``HomePlanet`` and ``Destination`` columns. It is
            modified in place.
        home: Home planet to process. ``NaN`` selects the rows whose
            ``HomePlanet`` is itself missing.

    Returns:
        The same DataFrame object.
    """
    # NaN never compares equal to anything, so a missing key needs an isna() mask;
    # with a plain ``==`` those rows were silently never filled.
    if pd.isna(home):
        in_group = df["HomePlanet"].isna()
    else:
        in_group = df["HomePlanet"] == home

    to_fill = in_group & df["Destination"].isna()
    n_missing = int(to_fill.sum())
    if n_missing == 0:
        return df

    dist = df.loc[in_group, "Destination"].value_counts(normalize=True)
    sample = len(dist) > 1
    if dist.empty:
        # Nothing observed for this key: fall back to the overall distribution.
        dist = df["Destination"].value_counts(normalize=True)
        if dist.empty:
            return df  # no observed destination anywhere; nothing to draw from
        sample = True  # the fallback always samples, as before

    if sample:
        df.loc[to_fill, "Destination"] = np.random.choice(
            dist.index, size=n_missing, p=dist.values
        )
    else:
        df.loc[to_fill, "Destination"] = dist.index[0]
    return df

# Fill Destination one home planet at a time. Train and test calls are
# interleaved, so both frames draw from the global RNG in exactly this order.
# Note: .unique() also yields NaN when HomePlanet still has missing values.
for home in df_train["HomePlanet"].unique():
    df_train = fillna_destination(df_train, home)
    df_test = fillna_destination(df_test, home)

In [None]:
np.random.seed(42)

def fillna_homeplanet(df, deck):
    """Fill missing ``HomePlanet`` values for the passengers of one deck.

    Missing home planets are drawn at random (global NumPy RNG) from the
    home-planet distribution observed on ``deck``. If the deck has exactly one
    observed home planet, it is assigned directly. If it has none, the
    distribution over the whole frame is sampled instead.

    Args:
        df: DataFrame with ``Deck`` and ``HomePlanet`` columns. It is modified
            in place.
        deck: Deck to process. ``NaN`` selects the rows whose ``Deck`` is
            itself missing.

    Returns:
        The same DataFrame object.
    """
    # NaN never compares equal to anything, so a missing key needs an isna() mask;
    # with a plain ``==`` those rows were silently never filled.
    if pd.isna(deck):
        in_group = df["Deck"].isna()
    else:
        in_group = df["Deck"] == deck

    to_fill = in_group & df["HomePlanet"].isna()
    n_missing = int(to_fill.sum())
    if n_missing == 0:
        return df

    dist = df.loc[in_group, "HomePlanet"].value_counts(normalize=True)
    sample = len(dist) > 1
    if dist.empty:
        # Nothing observed for this key: fall back to the overall distribution.
        dist = df["HomePlanet"].value_counts(normalize=True)
        if dist.empty:
            return df  # no observed home planet anywhere; nothing to draw from
        sample = True  # the fallback always samples, as before

    if sample:
        df.loc[to_fill, "HomePlanet"] = np.random.choice(
            dist.index, size=n_missing, p=dist.values
        )
    else:
        df.loc[to_fill, "HomePlanet"] = dist.index[0]
    return df

# Fill HomePlanet one deck at a time. Train and test calls are interleaved,
# so both frames draw from the global RNG in exactly this order.
# Note: .unique() also yields NaN when Deck still has missing values.
for deck in df_train["Deck"].unique():
    df_train = fillna_homeplanet(df_train, deck)
    df_test = fillna_homeplanet(df_test, deck)

In [None]:
# Fill the remaining missing Side values by sampling from each frame's own
# observed Side distribution. One RNG draw is made per missing element, so the
# result depends on row order and on the seed set here.
np.random.seed(42)
df_train_side_counts = df_train["Side"].value_counts(normalize=True)
df_train["Side"] = df_train["Side"].apply(
    lambda x: np.random.choice(df_train_side_counts.index, p=df_train_side_counts.values) if pd.isna(x) else x
)

# Same procedure for the test frame, using the test distribution.
df_test_side_counts = df_test["Side"].value_counts(normalize=True)
df_test["Side"] = df_test["Side"].apply(
    lambda x: np.random.choice(df_test_side_counts.index, p=df_test_side_counts.values) if pd.isna(x) else x
)

In [None]:
# Re-check the missing-value counts per column at this point of the pipeline.
df_train.isnull().sum()

### Analysis Spending and other features

In [None]:
# New feature: TotalSpend = sum of the five amenity bills.
# min_count=1 keeps the total NaN when all five bills are missing.
service_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for frame in (df_train, df_test):
    frame["TotalSpend"] = frame[service_cols].sum(axis=1, skipna=True, min_count=1)

In [None]:
# Total amount spent by passengers in cryo sleep (train, then test).
for frame in (df_train, df_test):
    asleep = frame["CryoSleep"] == True
    print(frame.loc[asleep, "TotalSpend"].sum(skipna=True))

# Author's observation: passengers in cryo sleep do not spend anything.

In [None]:
# Total amount spent by passengers below an age limit (train then test per limit).
for age_limit in (13, 14):
    for frame in (df_train, df_test):
        print(frame.loc[frame["Age"] < age_limit, "TotalSpend"].sum(skipna=True))

# Author's observation: passengers younger than 13 do not spend anything.

In [None]:
# Mean bill per amenity among VIP passengers of the training frame.
vip_rows = df_train[df_train["VIP"] == True]
for amenity in ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    print(vip_rows[amenity].mean(skipna=True))

In [None]:
def age_under13_spend(df, col):
    """Set a missing spend value to 0 for passengers who cannot have spent.

    A missing value in ``col`` is replaced by 0 when the passenger is in cryo
    sleep or is younger than 13.

    Args:
        df: DataFrame with ``CryoSleep``, ``Age`` and ``col``. It is modified
            in place.
        col: Name of the spending column to fill.

    Returns:
        The same DataFrame object.
    """
    asleep = df["CryoSleep"] == True
    child = df["Age"] < 13
    missing = df[col].isna()
    df.loc[(asleep | child) & missing, col] = 0
    return df

# Apply the zero-spend rule to every spending column of both frames.
spend_col = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend"]
for col in spend_col:
    df_train, df_test = (age_under13_spend(frame, col) for frame in (df_train, df_test))

In [None]:
# Passengers aged 13+ with zero total spend, counted by cryo-sleep status
# (asleep first, then awake).
adult_no_spend = (df_train["Age"] >= 13) & (df_train["TotalSpend"] == 0)
for asleep in (True, False):
    print(len(df_train[adult_no_spend & (df_train["CryoSleep"] == asleep)]))

In [None]:
# For passengers aged 13+ with zero total spend and unknown CryoSleep, assign
# the most frequent CryoSleep value of the respective frame.
for frame in (df_train, df_test):
    unknown_cryo = (
        (frame["Age"] >= 13)
        & (frame["TotalSpend"] == 0)
        & frame["CryoSleep"].isna()
    )
    frame.loc[unknown_cryo, "CryoSleep"] = frame["CryoSleep"].mode()[0]

In [None]:
# Step 1: awake passengers with zero spend and unknown age get the median age
# of the awake, zero-spend passengers of the same frame.
train_awake_no_spend = (df_train["TotalSpend"] == 0) & (df_train["CryoSleep"] == False)
train_median_age = df_train.loc[train_awake_no_spend & df_train["Age"].notna(), "Age"].median()
df_train.loc[train_awake_no_spend & df_train["Age"].isna(), "Age"] = train_median_age

test_awake_no_spend = (df_test["TotalSpend"] == 0) & (df_test["CryoSleep"] == False)
test_median_age = df_test.loc[test_awake_no_spend & df_test["Age"].notna(), "Age"].median()
df_test.loc[test_awake_no_spend & df_test["Age"].isna(), "Age"] = test_median_age

# Step 2: every remaining unknown age gets the overall median of its frame.
for frame in (df_train, df_test):
    frame["Age"] = frame["Age"].fillna(frame["Age"].median())

In [None]:
# Re-check the missing-value counts per column at this point of the pipeline.
df_train.isnull().sum()

### cryo sleep and others

In [None]:
# Share of passengers in cryo sleep on each deck, train vs. test side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
cryo_by_deck = {}
for tag, frame in (("train", df_train), ("test", df_test)):
    cryo_by_deck[tag] = (
        frame.groupby("Deck")["CryoSleep"]
        .apply(lambda x: (x == True).mean())
        .reset_index(name=f"Proportion_{tag}")
    )
cryo_train, cryo_test = cryo_by_deck["train"], cryo_by_deck["test"]

for ax, tag in zip(axes, ("train", "test")):
    sns.barplot(data=cryo_by_deck[tag], x="Deck", y=f"Proportion_{tag}", ax=ax)
    ax.set_title(f"{tag}-Proportion of CryoSleep in Each Deck")

In [None]:
np.random.seed(42)

def fillna_cryo(df, deck):
    """Fill missing ``CryoSleep`` values for the passengers of one deck.

    Missing flags are drawn at random (global NumPy RNG) from the CryoSleep
    distribution observed on ``deck``. If the deck has exactly one observed
    value, it is assigned directly. If it has none, the distribution over the
    whole frame is sampled instead.

    Args:
        df: DataFrame with ``Deck`` and ``CryoSleep`` columns. It is modified
            in place.
        deck: Deck to process. ``NaN`` selects the rows whose ``Deck`` is
            itself missing.

    Returns:
        The same DataFrame object.
    """
    # NaN never compares equal to anything, so a missing key needs an isna() mask;
    # with a plain ``==`` those rows were silently never filled.
    if pd.isna(deck):
        in_group = df["Deck"].isna()
    else:
        in_group = df["Deck"] == deck

    to_fill = in_group & df["CryoSleep"].isna()
    n_missing = int(to_fill.sum())
    if n_missing == 0:
        return df

    dist = df.loc[in_group, "CryoSleep"].value_counts(normalize=True)
    sample = len(dist) > 1
    if dist.empty:
        # Nothing observed for this key: fall back to the overall distribution.
        dist = df["CryoSleep"].value_counts(normalize=True)
        if dist.empty:
            return df  # no observed CryoSleep value anywhere; nothing to draw from
        sample = True  # the fallback always samples, as before

    if sample:
        df.loc[to_fill, "CryoSleep"] = np.random.choice(
            dist.index, size=n_missing, p=dist.values
        )
    else:
        df.loc[to_fill, "CryoSleep"] = dist.index[0]
    return df

# Fill CryoSleep one deck at a time. Train and test calls are interleaved,
# so both frames draw from the global RNG in exactly this order.
# Note: .unique() also yields NaN when Deck still has missing values.
for deck in df_train["Deck"].unique():
    df_train = fillna_cryo(df_train, deck)
    df_test = fillna_cryo(df_test, deck)

### Analysis VIP and other features

In [None]:
# Count the training passengers younger than 13 who are flagged as VIP.
print(len(df_train[(df_train["Age"] < 13) & (df_train["VIP"] == True)]))

In [None]:
# Share of VIP passengers on each deck, train vs. test side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
vip_by_deck = {}
for tag, frame in (("train", df_train), ("test", df_test)):
    vip_by_deck[tag] = (
        frame.groupby("Deck")["VIP"]
        .apply(lambda x: (x == True).mean())
        .reset_index(name=f"Proportion_{tag}")
    )
vip_train, vip_test = vip_by_deck["train"], vip_by_deck["test"]

for ax, tag in zip(axes, ("train", "test")):
    sns.barplot(data=vip_by_deck[tag], x="Deck", y=f"Proportion_{tag}", ax=ax)
    ax.set_title(f"{tag}-Proportion of VIP in Each Deck")

In [None]:
# Rule-based VIP imputation, applied to both frames like every other step:
#   1. passengers younger than 13 with unknown VIP status -> not VIP
#   2. otherwise take the first known VIP value of the passenger's travel group
for frame in (df_train, df_test):
    # Each comparison must be parenthesised: `&` binds tighter than `<`, so
    # `Age < 13 & isna()` is parsed as `Age < (13 & isna())` and picks the wrong rows.
    child_unknown_vip = (frame["Age"] < 13) & frame["VIP"].isna()
    frame.loc[child_unknown_vip, "VIP"] = False

    frame["VIP"] = frame["VIP"].fillna(frame.groupby("Group_ID")["VIP"].transform("first"))

In [None]:
#np.random.seed(42)

def fillna_VIP(df, deck):
    """Fill missing ``VIP`` values for the passengers of one deck.

    Missing flags are drawn at random (global NumPy RNG) from the VIP
    distribution observed on ``deck``. If the deck has exactly one observed
    value, it is assigned directly. If it has none, the distribution over the
    whole frame is sampled instead.

    Args:
        df: DataFrame with ``Deck`` and ``VIP`` columns. It is modified in place.
        deck: Deck to process. ``NaN`` selects the rows whose ``Deck`` is
            itself missing.

    Returns:
        The same DataFrame object.
    """
    # NaN never compares equal to anything, so a missing key needs an isna() mask;
    # with a plain ``==`` those rows were silently never filled.
    if pd.isna(deck):
        in_group = df["Deck"].isna()
    else:
        in_group = df["Deck"] == deck

    to_fill = in_group & df["VIP"].isna()
    n_missing = int(to_fill.sum())
    if n_missing == 0:
        return df

    dist = df.loc[in_group, "VIP"].value_counts(normalize=True)
    sample = len(dist) > 1
    if dist.empty:
        # Nothing observed for this key: fall back to the overall distribution.
        dist = df["VIP"].value_counts(normalize=True)
        if dist.empty:
            return df  # no observed VIP value anywhere; nothing to draw from
        sample = True  # the fallback always samples, as before

    if sample:
        df.loc[to_fill, "VIP"] = np.random.choice(
            dist.index, size=n_missing, p=dist.values
        )
    else:
        df.loc[to_fill, "VIP"] = dist.index[0]
    return df

# Fill VIP one deck at a time. Train and test calls are interleaved, so both
# frames draw from the global RNG in exactly this order.
# Note: .unique() also yields NaN when Deck still has missing values.
for deck in df_train["Deck"].unique():
    df_train = fillna_VIP(df_train, deck)
    df_test = fillna_VIP(df_test, deck)

### Analysis Deck and other features

In [None]:
# 2x2 grid of count plots: Deck on the x-axis, one hue feature per panel.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
deck_panels = [
    ("VIP", "relationship between deck and VIP"),
    ("CryoSleep", "relationship between deck and cryosleep"),
    ("HomePlanet", "relationship between deck and HomePlanet"),
    ("Destination", "relationship between deck and Destination"),
]
# axes.flat walks the grid row by row, matching the panel order above.
for ax, (hue_col, title) in zip(axes.flat, deck_panels):
    sns.countplot(data=df_train, x="Deck", hue=hue_col, ax=ax)
    ax.set_title(title)

In [None]:
np.random.seed(42)

# Earth passengers with unknown deck: sample a deck from the deck distribution
# of the Earth passengers of the same frame (train first, then test).
for frame in (df_train, df_test):
    from_earth = frame["HomePlanet"] == "Earth"
    earth_deck_dist = frame.loc[from_earth, "Deck"].value_counts(normalize=True)
    needs_deck = from_earth & frame["Deck"].isna()
    frame.loc[needs_deck, "Deck"] = np.random.choice(
        earth_deck_dist.index,
        size=frame[needs_deck].shape[0],
        p=earth_deck_dist.values,
    )

# Author's observation: Earth passengers only appear on decks F, G and E.

In [None]:
np.random.seed(42)

# Mars passengers with unknown deck: sample a deck from the deck distribution
# of the Mars passengers of the same frame (train first, then test).
for frame in (df_train, df_test):
    from_mars = frame["HomePlanet"] == "Mars"
    mars_deck_dist = frame.loc[from_mars, "Deck"].value_counts(normalize=True)
    needs_deck = from_mars & frame["Deck"].isna()
    frame.loc[needs_deck, "Deck"] = np.random.choice(
        mars_deck_dist.index,
        size=frame[needs_deck].shape[0],
        p=mars_deck_dist.values,
    )
# Author's observation: Mars passengers only appear on decks F, E and D.

In [None]:
np.random.seed(42)

# Europa passengers with unknown deck: sample a deck from the deck distribution
# of the Europa passengers of the same frame (train first, then test).
for frame in (df_train, df_test):
    from_europa = frame["HomePlanet"] == "Europa"
    europa_deck_dist = frame.loc[from_europa, "Deck"].value_counts(normalize=True)
    needs_deck = from_europa & frame["Deck"].isna()
    frame.loc[needs_deck, "Deck"] = np.random.choice(
        europa_deck_dist.index,
        size=frame[needs_deck].shape[0],
        p=europa_deck_dist.values,
    )

### Filling spending feature

In [None]:
spend_col = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "TotalSpend"]

# Remaining missing spend values get the column median of their own frame.
for frame in (df_train, df_test):
    frame[spend_col] = frame[spend_col].fillna(frame[spend_col].median())

In [None]:
# Horizontal box plots of the spending columns, train above test.
fig, axes = plt.subplots(2, 1, figsize=(12, 8))
for ax, tag, frame in zip(axes, ("train", "test"), (df_train, df_test)):
    sns.boxplot(frame[spend_col], orient="h", ax=ax)
    ax.set_title(f"{tag} spending boxplot")

In [None]:
# Compress the spending columns with log(1 + x).
for frame in (df_train, df_test):
    frame[spend_col] = np.log1p(frame[spend_col])

In [None]:
# Re-check the missing-value counts per column at this point of the pipeline.
df_train.isnull().sum()

In [None]:
# Preview the first five rows of the processed training frame.
df_train.head(5)

### Dropping

In [None]:
# Columns removed from both frames.
drop_col = ["Cabin", "Name", "Num"]
# Additionally removed in the "_temp" variant, which keeps only TotalSpend
# out of the spending features.
drop_col_temp = ["RoomService", "ShoppingMall", "VRDeck", "Spa", "FoodCourt"]

df_train, df_test = (frame.drop(columns=drop_col) for frame in (df_train, df_test))
df_train_temp, df_test_temp = (
    frame.drop(columns=drop_col_temp) for frame in (df_train, df_test)
)

In [None]:
# Training rows that still contain a missing value are dropped.
df_train.dropna(inplace=True)
df_train_temp.dropna(inplace=True)

# Test rows cannot be dropped, so the leftovers are imputed instead:
# numeric columns with the median, object columns with the most frequent value.
for frame in (df_test, df_test_temp):
    frame.fillna(frame.median(numeric_only=True), inplace=True)
    for col in frame.select_dtypes(include="object"):
        modes = frame[col].mode()
        if modes.empty:
            continue  # column is entirely missing; there is no mode to use
        # Assign the result back. `frame[col].fillna(..., inplace=True)` works on
        # a temporary Series and does not update the frame under copy-on-write.
        frame[col] = frame[col].fillna(modes[0])


### encoding

In [None]:
# Share of transported passengers on each deck (training frame only).
transported_share = df_train.groupby("Deck")["Transported"].apply(lambda x: (x == True).mean())
trans_train = transported_share.reset_index(name="Proportion_train")

ax = sns.barplot(data=trans_train, x="Deck", y="Proportion_train")
ax.set_title("Proportion of Transported in Each Deck")

In [None]:
# Rank-encode Deck: the deck with the highest transported share gets 1,
# the next one 2, and so on.
trans_train = trans_train.sort_values(by="Proportion_train", ascending=False)
deck_encoding_map = {deck: rank for rank, deck in enumerate(trans_train["Deck"], start=1)}

# Every frame maps its OWN deck letters. Mapping the "_temp" frames from the
# already-encoded df_train / df_test columns turned their Deck into all-NaN,
# because the integer ranks are not keys of the letter -> rank map.
# Re-running this cell has the same effect, so run it only once.
for frame in (df_train, df_test, df_train_temp, df_test_temp):
    frame["Deck"] = frame["Deck"].map(deck_encoding_map)

In [None]:
# One-hot encode the nominal features of all four frames.
onehot_cols = ["HomePlanet","Destination","CryoSleep","VIP","Side"]

df_train = pd.get_dummies(df_train, columns=onehot_cols)
df_test = pd.get_dummies(df_test, columns=onehot_cols)

df_train_temp = pd.get_dummies(df_train_temp, columns=onehot_cols)
df_test_temp = pd.get_dummies(df_test_temp, columns=onehot_cols)

In [None]:
# Cast every object and boolean column of all four frames to int
# (dummy indicators become 0/1, numeric ID strings become numbers).
for frame in (df_train, df_test, df_train_temp, df_test_temp):
    for col in frame.columns:
        col_dtype = frame[col].dtype
        if col_dtype == "object" or col_dtype == "bool":
            frame[col] = frame[col].astype(int)

In [None]:
# Final dtypes and non-null counts of the encoded training frame.
df_train.info()

### splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [None]:
# --- Full feature set --------------------------------------------------------
X = df_train.drop(columns="Transported")
y = df_train["Transported"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = df_test

# Standardise all columns; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# --- Reduced ("_temp") feature set: TotalSpend instead of individual bills ---
X_temp = df_train_temp.drop(columns="Transported")
y_temp = df_train_temp["Transported"]
X_train_temp, X_val_temp, y_train_temp, y_val_temp = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=42
)

X_test_temp = df_test_temp

# A fresh scaler replaces the one above, again fitted on the training split only.
scaler = StandardScaler()
X_train_scaled_temp = scaler.fit_transform(X_train_temp)
X_val_scaled_temp = scaler.transform(X_val_temp)
X_test_scaled_temp = scaler.transform(X_test_temp)

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression Randomized Search: sample values of the inverse
# regularisation strength C and score them with 10-fold cross-validation.
log_reg = LogisticRegression(random_state=42)
log_reg_params = {'C': np.arange(0.01, 10, 0.5)}

# random_state fixes which candidates are sampled, like the other searches in
# this notebook; without it the selected C changes from run to run.
log_reg_random = RandomizedSearchCV(
    log_reg, log_reg_params, cv=10, n_jobs=-1, verbose=1, random_state=42
)
log_reg_random.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", log_reg_random.best_params_)
print("Best mean cross-validation score: {:.3f}".format(log_reg_random.best_score_))

# Check the refitted best model on the held-out validation split.
best_reg = log_reg_random.best_estimator_
y_pred = best_reg.predict(X_val_scaled)
print(f"random-Accuracy for best regression model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier, unseeded run:
#Best hyperparameters:  {'C': 7.51}
#Best mean cross-validation score: 0.775
#random-Accuracy for best regression model: 0.7760506620610248

In [None]:
# Logistic Regression Grid Search

# Refine C on a fine grid spanning +/-20 % around the randomized-search optimum.
best_c = log_reg_random.best_params_['C']
log_reg_grid_params = {'C': np.arange(best_c*0.8, best_c*1.2, 0.1)}

log_reg_grid = GridSearchCV(
    estimator=log_reg,
    param_grid=log_reg_grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
log_reg_grid.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", log_reg_grid.best_params_)
print("Best mean cross-validation score: {:.3f}".format(log_reg_grid.best_score_))

# Check the refitted best model on the held-out validation split.
best_reg = log_reg_grid.best_estimator_
y_pred = best_reg.predict(X_val_scaled)
grid_accuracy = accuracy_score(y_val, y_pred)
print(f"grid-Accuracy for best regression model: {grid_accuracy}")

# Recorded output of a previous run:
#Best hyperparameters:  {'C': 6.008}
#Best mean cross-validation score: 0.776
#grid-Accuracy for best regression model: 0.7760506620610248

### random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Randomized Search: 10 sampled configurations, 10-fold CV each.
rf_params = {
    'n_estimators': np.arange(100, 1000, 50),
    'max_depth': np.arange(1, 20),
    'min_samples_split': np.arange(2, 20),
    'min_samples_leaf': np.arange(3, 15),
    'max_features': ['sqrt', 'log2', None]
}

# Seed the forest as well: the search's random_state only fixes which
# candidates are sampled, not the bootstrap / feature sampling inside each fit.
rf_model = RandomForestClassifier(random_state=42)

rf_randomized = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=rf_params,
    n_iter=10,
    cv=10,
    n_jobs=-1,
    verbose=1,
    random_state=42
)
# Tree models are fitted on the unscaled features.
rf_randomized.fit(X_train, y_train)

print("Best hyperparameters: ", rf_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(rf_randomized.best_score_))

# Check the refitted best model on the held-out validation split.
best_rf = rf_randomized.best_estimator_
y_pred = best_rf.predict(X_val)
print(f"random-Accuracy for best RandomForest model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier run with an unseeded forest:
#Best hyperparameters:  {'n_estimators': 100, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'log2', 'max_depth': 13}
#Best mean cross-validation score: 0.803
#random-Accuracy for best RandomForest model: 0.7944732297063903

In [None]:
# Random Forest Grid Search

# Refine around the randomized-search optimum. Lower neighbours are clamped to
# the smallest legal value, because the search space starts at max_depth=1 and
# min_samples_split=2 and "best - 2" would otherwise give invalid settings.
# The sets remove duplicates created by the clamping.
rf_best = rf_randomized.best_params_
best_n_estimators = rf_best['n_estimators']
best_max_depth = rf_best['max_depth']
best_min_split = rf_best['min_samples_split']

rf_grid_params = {
    'n_estimators': sorted({max(1, best_n_estimators - 50),
                            best_n_estimators,
                            best_n_estimators + 50}),
    'max_depth': sorted({max(1, best_max_depth - 2),
                         best_max_depth,
                         best_max_depth + 2}),
    'min_samples_split': sorted({max(2, best_min_split - 2),
                                 best_min_split,
                                 best_min_split + 2})
}

# NOTE(review): min_samples_leaf and max_features found by the randomized
# search are not carried over; rf_model's own settings are used for them.
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_grid_params,
    cv=5,
    n_jobs=-1,
    verbose=2
)
rf_grid.fit(X_train, y_train)

print("Best hyperparameters: ", rf_grid.best_params_)
print(f"Best Cross-Validation Score: {rf_grid.best_score_:.3f}")

# Check the refitted best model on the held-out validation split.
best_rf = rf_grid.best_estimator_
y_pred = best_rf.predict(X_val)
print(f"grid-Accuracy for best RandomForest model: {accuracy_score(y_val, y_pred)}")

# Recorded output of a previous run:
#Best hyperparameters:  {'max_depth': 15, 'min_samples_split': 6, 'n_estimators': 100}
#Best Cross-Validation Score: 0.803
#grid-Accuracy for best RandomForest model: 0.8002302820955671

### catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost Randomized Search
# silent=True suppresses CatBoost's per-iteration training log.
catboost_model = CatBoostClassifier(silent=True)
# Search space sampled by RandomizedSearchCV (default number of candidates).
catboost_params = {
    'learning_rate': np.arange(0.005, 0.06, 0.005),
    'depth':  np.arange(2, 10,2),
    'l2_leaf_reg': [0.1, 0.5, 0.7],
    'random_strength': [0.1, 0.2, 0.3],
    'max_bin': [100, 150, 200],
    # 'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
    'bootstrap_type': ['Bayesian', 'Bernoulli'],
    'one_hot_max_size': [10, 50, 70],
    'iterations': np.arange(100, 600, 50),
}

# 10-fold CV on the unscaled training split; random_state fixes the sampled candidates.
catboost_random = RandomizedSearchCV(catboost_model, catboost_params, cv=10, n_jobs=-1, verbose=1, random_state=42)
catboost_random.fit(X_train, y_train)

print("Best hyperparameters: ", catboost_random.best_params_)
print(f"Best Cross-Validation Score: {catboost_random.best_score_:.3f}")

# Check the refitted best model on the held-out validation split.
best_cat = catboost_random.best_estimator_
y_pred = best_cat.predict(X_val)
print(f"random-Accuracy for best CatBoost model: {accuracy_score(y_val, y_pred)}")

# Recorded output of a previous run:
#Best hyperparameters:  {'random_strength': 0.2, 'one_hot_max_size': 10, 'max_bin': 100, 'learning_rate': 0.04, 'l2_leaf_reg': 0.5, 'iterations': 400, 'depth': 5, 'bootstrap_type': 'Bayesian'}
#Best Cross-Validation Score: 0.805
#random-Accuracy for best CatBoost model: 0.8100172711571675

In [None]:
# CatBoost Grid Search
# Grid around the randomized-search optimum: iterations is kept fixed and
# depth is tried at best-1, best and best+1.
# NOTE(review): the other tuned parameters (learning_rate, l2_leaf_reg, ...) are
# not carried over; catboost_model's own settings are used for them.
catboost_grid_params = {
    'iterations': [catboost_random.best_params_['iterations']],
    'depth': np.arange(catboost_random.best_params_['depth']-1, catboost_random.best_params_['depth']+2, 1)
}

catboost_grid = GridSearchCV(catboost_model, catboost_grid_params, cv=5, n_jobs=-1, verbose=1)
catboost_grid.fit(X_train, y_train)

print("Best hyperparameters: ", catboost_grid.best_params_)
print(f"Best Cross-Validation Score: {catboost_grid.best_score_:.3f}")

# Check the refitted best model on the held-out validation split.
best_cat = catboost_grid.best_estimator_
y_pred = best_cat.predict(X_val)
print(f"grid-Accuracy for best CatBoost model: {accuracy_score(y_val, y_pred)}")

# NOTE(review): the results that were pasted here listed MLP hyperparameters
# (max_iter, learning_rate_init, hidden_layer_sizes, alpha) and an "mlp model"
# accuracy, so they were copied from the MLP section and are not the output of
# this CatBoost grid search. Re-run the cell to record the real values.

### mlp

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP Randomized Search

# Settings shared by every candidate. random_state pins the weight
# initialisation and the early-stopping validation split, so runs repeat.
mlp_params = {
    'activation': 'relu',
    'solver': 'adam',
    'early_stopping': True,
    'random_state': 42,
}

mlp_model = MLPClassifier(**mlp_params)

# Search space. learning_rate_init used to list 0.001 twice; the middle value
# is restored to 0.0005, the value reported as best in the recorded output below.
mlp_params_tuned = {
    'hidden_layer_sizes': [(50,),(64,),(64, 32)],
    'alpha': np.arange(0.0001, 0.01, 0.001),
    'learning_rate_init': [0.0003, 0.0005, 0.001],
    'max_iter': [200, 500],
}

# The MLP is trained on the standardised features.
mlp_random = RandomizedSearchCV(mlp_model, mlp_params_tuned, cv=10, n_jobs=-1, verbose=1, random_state=42)
mlp_random.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", mlp_random.best_params_)
print(f"Best Cross-Validation Score: {mlp_random.best_score_:.3f}")

# Check the refitted best model on the held-out validation split.
best_mlp = mlp_random.best_estimator_
y_pred = best_mlp.predict(X_val_scaled)
print(f"random-Accuracy for best mlp model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier, unseeded run:
#Best hyperparameters:  {'max_iter': 200, 'learning_rate_init': 0.0005, 'hidden_layer_sizes': (50,), 'alpha': 0.0031}
#Best Cross-Validation Score: 0.779
#random-Accuracy for best mlp model: 0.7800805987334485

In [None]:
# MLP Grid Search
# Refine alpha on a fine grid spanning +/-20 % around the randomized-search
# optimum; the layer layout found there is kept fixed.
best_mlp_params = mlp_random.best_params_
best_alpha = best_mlp_params['alpha']
mlp_grid_params = {
    'hidden_layer_sizes': [best_mlp_params['hidden_layer_sizes']],
    'alpha': np.arange(best_alpha*0.8, best_alpha*1.2, 0.0001)
}

mlp_grid = GridSearchCV(
    estimator=mlp_model,
    param_grid=mlp_grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
mlp_grid.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", mlp_grid.best_params_)
print(f"Best Cross-Validation Score: {mlp_grid.best_score_:.3f}")

# Check the refitted best model on the held-out validation split.
best_mlp = mlp_grid.best_estimator_
y_pred = best_mlp.predict(X_val_scaled)
grid_accuracy = accuracy_score(y_val, y_pred)
print(f"grid-Accuracy for best mlp model: {grid_accuracy}")

# Recorded output of a previous run:
#Best hyperparameters:  {'alpha': 0.0034799999999999983, 'hidden_layer_sizes': (50,)}
#Best Cross-Validation Score: 0.784
#grid-Accuracy for best mlp model: 0.7875647668393783

### XGBoost

In [None]:
from xgboost import XGBClassifier

# XGBoost Randomized Search
# Coarse random search (10 sampled candidates, 10-fold CV) over tree count,
# depth, shrinkage, row/column subsampling and regularisation.

xgb_params_tuned = {
    'n_estimators': np.arange(50, 800, 50),
    'max_depth': np.arange(3, 15, 2),
    # Float-step arange accumulates rounding error (e.g. 0.7000000000000001,
    # 0.30000000000000004 in the recorded output below); round the grids so the
    # candidates are the intended decimal values.
    'learning_rate': np.round(np.arange(0.01, 0.05, 0.004), 3),
    'subsample': np.round(np.arange(0.1, 1.1, 0.1), 1),
    'colsample_bytree': np.round(np.arange(0.1, 1.1, 0.1), 1),
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 1.0, 10],
    'gamma': [0, 0.1, 0.5, 1.0]
}

# NOTE(review): 'gpu_hist' is deprecated from XGBoost 2.0 in favour of
# tree_method='hist' with device='cuda'; left unchanged because the installed
# XGBoost version is not visible here - confirm before upgrading.
xgb_model = XGBClassifier(tree_method='gpu_hist')

# NOTE(review): n_jobs=-1 starts several worker processes that all train on the
# same GPU; if this cell runs out of GPU memory, try n_jobs=1.
xgb_randomized = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params_tuned,
    n_iter=10,
    cv=10,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# This search is fitted on the unscaled feature matrix.
xgb_randomized.fit(X_train, y_train)

print("Best hyperparameters: ", xgb_randomized.best_params_)
print("Best mean cross-validation score: {:.3f}".format(xgb_randomized.best_score_))

# Score the refitted winner on the held-out validation split.
best_xgb = xgb_randomized.best_estimator_
y_pred = best_xgb.predict(X_val)
print(f"random-Accuracy for best XGBoost model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier run:
#Best hyperparameters:  {'subsample': 0.7000000000000001, 'reg_lambda': 0, 'reg_alpha': 0.5, 'n_estimators': 450, 'max_depth': 3, 'learning_rate': 0.034, 'gamma': 0, 'colsample_bytree': 0.30000000000000004}
#Best mean cross-validation score: 0.806
#random-Accuracy for best XGBoost model: 0.8002302820955671

In [None]:
# XGBoost Grid Search
# Refine tree count, depth and learning rate around the randomized-search
# winner, keeping the remaining tuned hyperparameters fixed at their winners.

best_random_xgb = xgb_randomized.best_params_
best_n_estimators = best_random_xgb['n_estimators']
best_max_depth = best_random_xgb['max_depth']
best_learning_rate = best_random_xgb['learning_rate']

# Neighbours that fall to zero or below are dropped: at the lower edge of the
# random-search space, n_estimators - 50 reaches 0 and learning_rate - 0.01
# reaches 0.0, neither of which is a usable setting.
grid_params = {
    'n_estimators': [
        n for n in (best_n_estimators - 50, best_n_estimators, best_n_estimators + 50)
        if n > 0
    ],
    'max_depth': [
        d for d in (best_max_depth - 2, best_max_depth, best_max_depth + 2)
        if d > 0
    ],
    # Rounded so neighbours read 0.044 rather than 0.044000000000000004.
    'learning_rate': [
        lr for lr in (
            round(best_learning_rate - 0.01, 4),
            best_learning_rate,
            round(best_learning_rate + 0.01, 4),
        )
        if lr > 0
    ]
}

# Hyperparameters tuned by the randomized search but not refined here
# (subsample, colsample_bytree, reg_alpha, reg_lambda, gamma) are carried over;
# otherwise the grid search would silently evaluate XGBoost defaults for them.
xgb_fixed_params = {
    name: value for name, value in best_random_xgb.items() if name not in grid_params
}

xgb_grid = GridSearchCV(
    estimator=XGBClassifier(tree_method='gpu_hist', **xgb_fixed_params),
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

xgb_grid.fit(X_train, y_train)

print("Best hyperparameters : ", xgb_grid.best_params_)
print("Best mean cross-validation score: {:.3f}".format(xgb_grid.best_score_))

# Score the refitted winner on the held-out validation split.
best_xgb = xgb_grid.best_estimator_
y_pred = best_xgb.predict(X_val)
print(f"grid-Accuracy for best XGBoost model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier run (numbers will differ on re-run):
#Best hyperparameters :  {'learning_rate': 0.044000000000000004, 'max_depth': 5, 'n_estimators': 500}
#Best mean cross-validation score: 0.806
#grid-Accuracy for best XGBoost model: 0.8088658606793322

### Stacking and voting ensembles

In [None]:
from sklearn.ensemble import StackingClassifier

# Stacked ensemble: the five tuned models act as base learners and a logistic
# regression is fitted on top of their outputs as the final estimator.
base_learners = [
    (label, search.best_estimator_)
    for label, search in (
        ('log_reg', log_reg_grid),
        ('rf', rf_grid),
        ('catboost', catboost_grid),
        ('mlp', mlp_grid),
        ('xgb', xgb_grid),
    )
]

stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)

# Train on the scaled training features, then score on the scaled validation split.
stacking_clf.fit(X_train_scaled, y_train)
y_pred = stacking_clf.predict(X_val_scaled)

stacking_val_accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy for best stacking model: {stacking_val_accuracy}")

# Output recorded from an earlier run:
#Accuracy for best stacking model: 0.8105929763960852

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble: average the predicted class probabilities of the five
# tuned models.
#
# Each search's best_estimator_ is passed instead of the search object itself:
# VotingClassifier clones and refits whatever it is given, so handing it a
# GridSearchCV re-runs the whole hyperparameter search inside fit().
voting_clf = VotingClassifier(estimators=[
    ('mlp', mlp_grid.best_estimator_),
    ('xgb', xgb_grid.best_estimator_),
    ('cat', catboost_grid.best_estimator_),
    ('log_reg', log_reg_grid.best_estimator_),
    ('rf', rf_grid.best_estimator_)
], voting='soft')

# Fit and evaluate on the scaled features: the MLP was tuned on X_train_scaled,
# and the stacking ensemble is trained on the scaled matrices too. Feeding the
# unscaled matrix here would give the MLP inputs on a different scale from the
# one its hyperparameters were selected for.
# NOTE: the fitted ensemble therefore expects scaled inputs at predict time.
voting_clf.fit(X_train_scaled, y_train)
y_pred = voting_clf.predict(X_val_scaled)
print(f"Accuracy for best ensemble model: {accuracy_score(y_val, y_pred)}")

# Output recorded from an earlier run (numbers will differ on re-run):
#Accuracy for best ensemble model: 0.8048359240069085

In [None]:
# Predict the test set with the tuned CatBoost search and write the submission file.
predictions = catboost_grid.predict(X_test)

# Map the 0/1 class labels to the 'False'/'True' strings written to the CSV.
label_to_text = {0: 'False', 1: 'True'}

submission = pd.DataFrame({'PassengerId': df_test.index, 'Transported': predictions})
submission['Transported'] = submission['Transported'].replace(label_to_text)

submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
# Sanity check before uploading: row count, column dtypes and non-null counts.
submission.info()

In [None]:
import joblib

# Persist every fitted search / ensemble so it can be reloaded without retraining.
# All artefacts share one suffix describing the feature-engineering variant.
# The stacking file used to be spelled "deckOrdinalp", which broke the naming
# pattern used by the other five files; it now matches.
MODEL_FILE_SUFFIX = "deckOrdinal_totalspend.pkl"

models_to_save = {
    "rf": rf_grid,
    "cat": catboost_grid,
    "mlp": mlp_grid,
    "xgb": xgb_grid,
    "vote": voting_clf,
    "stack": stacking_clf,
}

for model_tag, fitted_model in models_to_save.items():
    joblib.dump(fitted_model, f"spaceship_{model_tag}_{MODEL_FILE_SUFFIX}")