In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the three CSV inputs from the working directory: the training frame,
# the test frame, and the frame later used as the submission template.
df_train, df_test, df_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [None]:
# Encode cabin presence as 1 (value recorded) / 0 (missing), then discard the
# raw Cabin column from the training frame.
cabin_known_train = df_train['Cabin'].notna()
df_train['HasCabin'] = cabin_known_train.astype(int)
df_train = df_train.drop(columns='Cabin')

In [None]:
# Same cabin-presence flag for the test frame: 1 when a Cabin value exists,
# 0 when it is missing; the raw column is dropped afterwards.
cabin_known_test = df_test['Cabin'].notna()
df_test['HasCabin'] = cabin_known_test.astype(int)
df_test = df_test.drop(columns='Cabin')

In [None]:
# Remove the PassengerId column from both frames so it is not used as a
# model feature (df_sub, loaded separately, is left untouched).
df_train = df_train.drop(columns='PassengerId')
df_test = df_test.drop(columns='PassengerId')

In [None]:
# Pull the title out of each name: the run of letters that follows a space and
# is terminated by a period. The pattern is a raw string so that `\.` reaches
# the regex engine as an escaped dot instead of being an invalid Python string
# escape (which emits a SyntaxWarning on recent interpreters).
df_train['Title'] = df_train['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Inspect how often each extracted title occurs in the training set.
df_train['Title'].value_counts()

In [None]:
# Extract the title (letters between a space and a period) from each test-set
# name. The pattern is a raw string so that `\.` is passed to the regex engine
# as an escaped dot rather than being an invalid Python string escape.
df_test['Title'] = df_test['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Inspect how often each extracted title occurs in the test set.
df_test['Title'].value_counts()

In [None]:
# Titles kept as their own category; every other value (including names where
# no title was extracted) is collapsed into 'Other'.
common = ['Mr', 'Miss', 'Mrs', 'Master']

for frame in (df_train, df_test):
    titles = frame['Title']
    # Keep the title where it is one of the common ones, otherwise substitute.
    frame['Title'] = titles.where(titles.isin(common), 'Other')

In [None]:
# The Title feature has been derived, so the raw Name column is removed from
# both frames.
df_train = df_train.drop(columns='Name')
df_test = df_test.drop(columns='Name')

In [None]:
# Summarise the test frame: columns, dtypes and non-null counts.
df_test.info()

In [None]:
# Summarise the training frame: columns, dtypes and non-null counts.
df_train.info()

In [None]:
# Number of missing values in each column of the test frame.
df_test.isna().sum()

In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

In [None]:
# Impute missing ages and bucket them into coarse age groups.
#
# The imputation statistics are learned from the training frame only and then
# applied to both frames, so train and test are preprocessed identically.
# Ages are filled with the per-title training median, falling back to the
# overall training median when a title has no observed age. This keeps NaN
# ages (and therefore the 'Unknown' bucket) from appearing only in the test
# frame, where an encoder fitted on the training labels would not know it.
AGE_BINS = [-np.inf, 12, 19, 40, 60, np.inf]   # right-inclusive edges
AGE_LABELS = ['Child', 'Adolescent', 'Young Adult', 'Adult', 'Senior']

# Must be computed before the loop, because the loop drops 'Age'.
title_median_age = df_train.groupby('Title')['Age'].median()
overall_median_age = df_train['Age'].median()

for df in [df_train, df_test]:
    age = (
        df['Age']
        .fillna(df['Title'].map(title_median_age))
        .fillna(overall_median_age)
    )

    # (-inf, 12] Child, (12, 19] Adolescent, (19, 40] Young Adult,
    # (40, 60] Adult, (60, inf) Senior; anything still missing -> 'Unknown'.
    df['Age_Group'] = (
        pd.cut(age, bins=AGE_BINS, labels=AGE_LABELS)
        .astype(object)
        .fillna('Unknown')
    )

    # Only the bucketed feature is kept for modelling.
    df.drop('Age', axis=1, inplace=True)

In [None]:
# Replace missing Embarked values in the training frame with 'S'.
embarked_missing = df_train['Embarked'].isna()
df_train.loc[embarked_missing, 'Embarked'] = 'S'

In [None]:
# Fill missing test-set fares with a numeric 7.75 rather than the string
# '7.7500', so the column stays numeric instead of being upcast to object.
# NOTE(review): 7.75 is a hard-coded fill value — confirm where it comes from.
df_test['Fare'] = df_test['Fare'].fillna(7.75)

In [None]:
# Frequency of each derived age bucket in the training set.
df_train['Age_Group'].value_counts()

In [None]:
# Frequency of each Pclass value in the training set.
df_train['Pclass'].value_counts()

In [None]:
# Summarise SibSp by value frequency (matching the neighbouring inspection
# cells) instead of dumping the raw Series.
df_train['SibSp'].value_counts()

In [None]:
# Frequency of each distinct Fare value in the training set.
df_train['Fare'].value_counts()

In [None]:
# Derive IsAlone (1 when SibSp + Parch is zero, else 0) and then drop the
# source columns together with Ticket. The column-presence guard makes the
# cell a no-op when it is run again after the columns are already gone.
if 'SibSp' in df_train.columns:
    train_is_alone = df_train['SibSp'].add(df_train['Parch']).eq(0)
    df_train['IsAlone'] = train_is_alone.astype(int)
    df_train = df_train.drop(columns=['SibSp', 'Parch', 'Ticket'])


if 'SibSp' in df_test.columns:
    test_is_alone = df_test['SibSp'].add(df_test['Parch']).eq(0)
    df_test['IsAlone'] = test_is_alone.astype(int)
    df_test = df_test.drop(columns=['SibSp', 'Parch', 'Ticket'])

In [None]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance. The following cells re-fit it on each
# categorical column in turn, so after every fit it only holds the classes of
# the most recently fitted column.
label = LabelEncoder()

In [None]:
# Learn the Sex categories from the training frame, then map both frames to
# the resulting integer codes.
label.fit(df_train['Sex'])
df_train['Sex'] = label.transform(df_train['Sex'])
df_test['Sex'] = label.transform(df_test['Sex'])

In [None]:
# Learn the Embarked categories from the training frame, then map both frames
# to the resulting integer codes.
label.fit(df_train['Embarked'])
df_train['Embarked'] = label.transform(df_train['Embarked'])
df_test['Embarked'] = label.transform(df_test['Embarked'])

In [None]:
# Learn the Title categories from the training frame, then map both frames to
# the resulting integer codes.
label.fit(df_train['Title'])
df_train['Title'] = label.transform(df_train['Title'])
df_test['Title'] = label.transform(df_test['Title'])

In [None]:
# Learn the Age_Group categories from the training frame, then map both
# frames to the resulting integer codes.
label.fit(df_train['Age_Group'])
df_train['Age_Group'] = label.transform(df_train['Age_Group'])
df_test['Age_Group'] = label.transform(df_test['Age_Group'])

In [None]:
# Check the Fare column's dtype and non-null count before it is cast to float.
# The method must be called: without the parentheses the cell only displays
# the bound-method object.
df_test['Fare'].info()

In [None]:
# Make sure the test-set Fare column is float before it is fed to the models.
df_test = df_test.astype({'Fare': float})

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

# Random forest base learner: 100 trees, depth capped at 5, at least 3 samples
# per leaf, fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=5, min_samples_leaf=3, random_state=42
)

# Gradient-boosted base learner: 100 depth-3 trees with a 0.05 learning rate,
# 80% row and column subsampling, fixed seed.
# NOTE(review): `use_label_encoder` is reported as deprecated by newer XGBoost
# releases — confirm against the installed version before removing it.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.05,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xgb_model = XGBClassifier(**xgb_params)

# Soft voting over the two base learners.
base_estimators = [('rf', rf_model), ('xgb', xgb_model)]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

In [None]:
# Separate the target from the training features.
X_train = df_train.drop(columns='Survived')
y_train = df_train['Survived']

# Select the test features by the training column names, in the training
# order, instead of relying on the two frames happening to line up
# positionally. A missing feature raises a KeyError here rather than
# producing a mis-aligned matrix at predict time.
X_test = df_test[X_train.columns].copy()

In [None]:
# Fit the voting ensemble on the training features and target; as the cell's
# last expression, the fitted estimator is also displayed.
ensemble_model.fit(X_train, y_train)


In [None]:
# Class predictions for the test features from the fitted soft-voting ensemble.
y_pred = ensemble_model.predict(X_test)

In [None]:
# Store the predictions in the submission frame's 'Survived' column.
# NOTE(review): assumes df_sub rows are in the same order as df_test — confirm.
df_sub['Survived'] = y_pred

In [None]:
# Write the submission file without the index column (index=False already
# suppresses the index, so no index label option is needed).
df_sub.to_csv('sub1.csv', index=False)