# Practicing Using the Following Kaggle Notebook
Boosting, Bagging, Stacking <br />
https://www.kaggle.com/code/arthurtok/introduction-to-ensembling-stacking-in-python <br />
https://www.kaggle.com/code/vbmokin/autoselection-from-20-classifier-models-l-curves#7.-Prediction-

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, StackingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load data
train = pd.read_csv('./data/train.csv')
test  = pd.read_csv('./data/test.csv')

train['Source'] = 'train'
test['Source']   = 'test'
df = pd.concat([train, test], sort=False).reset_index(drop=True)


In [None]:
# Exploratory Data Analysis (EDA)
print("Missing values per column:")
print(df.isnull().sum())
print("\nNumeric summary statistics:")
print(df.describe())

# Identify
numeric_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"\nNumeric features: {numeric_cols}")
print(f"Categorical features: {categorical_cols}")

Missing values per column:
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Source            0
dtype: int64

Numeric summary statistics:
       PassengerId    Survived       Pclass          Age        SibSp  \
count  1309.000000  891.000000  1309.000000  1046.000000  1309.000000   
mean    655.000000    0.383838     2.294882    29.881138     0.498854   
std     378.020061    0.486592     0.837836    14.413493     1.041658   
min       1.000000    0.000000     1.000000     0.170000     0.000000   
25%     328.000000    0.000000     2.000000    21.000000     0.000000   
50%     655.000000    0.000000     3.000000    28.000000     0.000000   
75%     982.000000    1.000000     3.000000    39.000000     1.000000   
max    1309.000000    1.000000     3.000000    80.000000     8.000000   

             

In [None]:
# Feature Engineering (FE)

# Extract Title and LastName
df['Title']    = df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
df['LastName'] = df['Name'].str.split(',').str[0]

# IsWomanOrBoy flag
df['IsWomanOrBoy'] = ((df['Title'] == 'Master') | (df['Sex'] == 'female')).astype(int)

# Group‐wise Age imputation (Sex × Survived from train)
age_means = train.groupby(['Sex', 'Survived'])['Age'].mean()
overall_sex_means = train.groupby('Sex')['Age'].mean()

def fill_age(row):
    if pd.isnull(row['Age']):
        if row['Source'] == 'train':
            return age_means.loc[(row['Sex'], row['Survived'])]
        else:
            return overall_sex_means.loc[row['Sex']]
    return row['Age']

df['Age'] = df.apply(fill_age, axis=1)

# Create decade‐based buckets
df['Age2']  = (df['Age'] // 10).astype(int)
df['Fare2'] = (df['Fare'].fillna(0) // 10).astype(int)

# Family size & alone flags
df['FamilySize'] = df['SibSp'] + df['Parch']
df['Alone'] = (df['FamilySize'] == 0).astype(int)

# Cabin‐related features
df['Deck'] = df['Cabin'].str[0].fillna('M')
df['HasCabin'] = df['Cabin'].notnull().astype(int)

In [None]:
# Label‐encode categoricals
to_encode = ['Sex','Embarked','Title','LastName','Deck']
label_encoders = {}
for col in to_encode:
    le = LabelEncoder()
    df[col] = df[col].fillna('Missing').astype(str)
    le.fit(df[col])
    df[col] = le.transform(df[col])
    label_encoders[col] = le

In [None]:
# Split train and test
train_df = df[df['Source']=='train'].drop(['Source','Name','Ticket','Cabin'], axis=1)
test_df  = df[df['Source']=='test'].drop(['Source','Name','Ticket','Cabin','Survived'], axis=1)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Modeling Pipelines

# Preprocessor
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, X_train.columns)
])

# Bagging
bagging = Pipeline([
    ('prep', preprocessor),
    ('clf', BaggingClassifier(estimator=DecisionTreeClassifier(),
                              n_estimators=100, random_state=42))
])
bagging.fit(X_train, y_train)
y_bag = bagging.predict(X_test)

# AdaBoost
boosting = Pipeline([
    ('prep', preprocessor),
    ('clf', AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                               n_estimators=100, learning_rate=1.0, random_state=42))
])
boosting.fit(X_train, y_train)
y_boost = boosting.predict(X_test)

# Stacking
estimators = [
    ('lr', LogisticRegression(max_iter=2000, solver='liblinear')),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
]
stacking = Pipeline([
    ('prep', preprocessor),
    ('clf', StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=2000, solver='liblinear'),
        cv=5
    ))
])
stacking.fit(X_train, y_train)
y_stack = stacking.predict(X_test)

In [None]:
# Results
results = pd.DataFrame({
    'Model': ['Bagging', 'Boosting', 'Stacking'],
    'Accuracy': [
        accuracy_score(y_test, y_bag),
        accuracy_score(y_test, y_boost),
        accuracy_score(y_test, y_stack)
    ]
})

print("\nBagging Report:\n", classification_report(y_test, y_bag))
print("Boosting Report:\n", classification_report(y_test, y_boost))
print("Stacking Report:\n", classification_report(y_test, y_stack))


Bagging Report:
               precision    recall  f1-score   support

         0.0       0.87      0.89      0.88       105
         1.0       0.83      0.81      0.82        74

    accuracy                           0.85       179
   macro avg       0.85      0.85      0.85       179
weighted avg       0.85      0.85      0.85       179

Boosting Report:
               precision    recall  f1-score   support

         0.0       0.83      0.88      0.85       105
         1.0       0.81      0.74      0.77        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Stacking Report:
               precision    recall  f1-score   support

         0.0       0.87      0.89      0.88       105
         1.0       0.83      0.81      0.82        74

    accuracy                           0.85       179
   macro avg       0.85      0.85      0.85       179
weighted avg       0