In [52]:
# Auto-install required libraries into the current Python environment (preferably your .venv)
import sys
import subprocess
import importlib
import os, random
# Map module import name -> pip package name
required = [
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("matplotlib", "matplotlib"),
    ("seaborn", "seaborn"),
    ("kaggle", "kaggle"),
    ("scikit_learn", "scikit-learn"),
]

missing = []
for mod, pkg in required:
    try:
        importlib.import_module(mod)
    except Exception:
        missing.append(pkg)

if missing:
    print("Installing missing packages into:", sys.executable)
    print("Packages:", " ".join(missing))
    # Install into the interpreter backing this kernel
    subprocess.run([sys.executable, "-m", "pip", "install", *missing], check=True)
    print("Install complete. You may need to restart the kernel if imports still fail.")
else:
    print("All required packages are already installed.")


Installing missing packages into: /home/tp_ubuntu/colab/titanic/.venv/bin/python
Packages: kaggle scikit-learn
Install complete. You may need to restart the kernel if imports still fail.



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Thư mục lưu kết quả (chạy lần đầu nếu cần)
os.makedirs('../result/processed', exist_ok=True)
os.makedirs('../result/pic', exist_ok=True)

PROCESS_PATH = '../result/processed'
PIC_PATH = '../result/pic'
MODEL_PATH = '../result/model'
test = pd.read_csv("../data/test.csv")
train = pd.read_csv("../data/train.csv")

In [54]:
print('Shape:', train.shape)
display(train.head())

Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
print('Missing values per column:\n', train.isna().sum())
print('\nDtypes:\n', train.dtypes)
display(train.describe(include='all').T)

Missing values per column:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Dtypes:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891.0,,,,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,,,,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891.0,891.0,"Braund, Mr. Owen Harris",1.0,,,,,,,
Sex,891.0,2.0,male,577.0,,,,,,,
Age,714.0,,,,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,,,,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891.0,681.0,347082,7.0,,,,,,,
Fare,891.0,,,,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [56]:
# 3. Preprocessing & feature engineering (executable)

def extract_title(name):
    if pd.isna(name):
        return 'Unknown'
    title = name.split(',')[1].split('.')[0].strip()
    if title in ['Mr','Mrs','Miss','Master']:
        return title
    return 'Rare'

def fill_age(row):
    if pd.isna(row['Age']):
        med = age_median.get((row['Pclass'], row['Title']), np.nan)
        if pd.isna(med):
            return train['Age'].median()
        return med
    else:
        return row['Age']

# Embarked
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
test['Embarked'] = test['Embarked'].fillna(train['Embarked'].mode()[0])

# Fare (use transform to preserve index alignment)
if train['Fare'].isna().sum() > 0:
    train['Fare'] = train['Fare'].fillna(train.groupby('Pclass')['Fare'].transform('median'))

if test['Fare'].isna().sum() > 0:
    test['Fare'] = test['Fare'].fillna(test.groupby('Pclass')['Fare'].transform('median'))

# Cabin features (optional)
# train['HasCabin'] = train['Cabin'].notna().astype(int)
# train['CabinLetter'] = train['Cabin'].fillna('X').map(lambda x: str(x)[0])

# Title extraction
train['Title'] = train['Name'].apply(extract_title)
test['Title'] = test['Name'].apply(extract_title)

# Family features
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
train['IsAlone'] = (train['FamilySize'] == 1).astype(int)

test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['IsAlone'] = (test['FamilySize'] == 1).astype(int)

# Age imputation by Pclass + Title median
age_median = train.groupby(['Pclass','Title'])['Age'].median()

train['Age'] = train.apply(fill_age, axis=1)
test['Age'] = test.apply(fill_age, axis=1)

# Age bin and FareBand
train['AgeBin'] = pd.cut(train['Age'], bins=[0,12,20,40,60,120], labels=['Child','Teen','Adult','MidAge','Senior'])
train['Fare'] = train['Fare'].fillna(train['Fare'].median())
train['FareBand'] = pd.qcut(train['Fare'], 4, labels=False)

test['AgeBin'] = pd.cut(test['Age'], bins=[0,12,20,40,60,120], labels=['Child','Teen','Adult','MidAge','Senior'])
test['Fare'] = test['Fare'].fillna(train['Fare'].median())
test['FareBand'] = pd.qcut(test['Fare'], 4, labels=False)


# Show sample
display(train.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,AgeBin,FareBand
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2,0,Adult,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2,0,Adult,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1,1,Adult,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2,0,Adult,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,1,1,Adult,1


In [57]:
# EDA: correlation heatmap (numeric) and mutual information
from sklearn.feature_selection import mutual_info_classif

num_cols = ['Survived','Age','Fare','Pclass','SibSp','Parch','FamilySize','IsAlone']
corr = train[num_cols].corr()
plt.figure(figsize=(6,5))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', square=True)
plt.title('Correlation Heatmap (Numeric Features)')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_corr_heatmap.png', dpi=150)
plt.close()

# Mutual information against Survived (requires numeric)
cat_cols = ['Sex','Embarked','Title','AgeBin']
tmp = train.copy()
tmp = pd.get_dummies(tmp, columns=cat_cols, drop_first=True)

mi_cols = [c for c in tmp.columns if c not in ['Survived','PassengerId','Name','Ticket','Cabin']]
X_mi = tmp[mi_cols].fillna(0)
y_mi = tmp['Survived']
mi = mutual_info_classif(X_mi, y_mi, discrete_features=X_mi.dtypes == 'uint8', random_state=SEED)
mi_series = pd.Series(mi, index=mi_cols).sort_values(ascending=False)
print('Top 15 features by mutual information:')
display(mi_series.head(15))
mi_series.head(30).to_csv(f'{PROCESS_PATH}/eda_mutual_information_top30.csv', header=['mi'], index_label='feature')
print('Saved: pic/eda_corr_heatmap.png and processed/eda_mutual_information_top30.csv')

Top 15 features by mutual information:


Title_Mr         0.164725
Sex_male         0.162602
Fare             0.143458
FareBand         0.064559
Title_Miss       0.058341
FamilySize       0.055530
Age              0.047743
AgeBin_Adult     0.044078
Pclass           0.043112
SibSp            0.032345
Title_Mrs        0.031533
IsAlone          0.017829
Title_Rare       0.012758
Parch            0.012220
AgeBin_Senior    0.006557
dtype: float64

Saved: pic/eda_corr_heatmap.png and processed/eda_mutual_information_top30.csv


In [58]:
# EDA: visualizations (saved to pic/)
sns.set_theme(style='whitegrid')
os.makedirs('pic', exist_ok=True)

# 1) Sex vs Survived
plt.figure(figsize=(4,3))
sns.countplot(data=train, x='Sex', hue='Survived')
plt.title('Survival by Sex')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_survival_by_sex.png', dpi=150)
plt.close()

# 2) Pclass vs Survived
plt.figure(figsize=(4,3))
sns.countplot(data=train, x='Pclass', hue='Survived')
plt.title('Survival by Pclass')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_survival_by_pclass.png', dpi=150)
plt.close()

# 3) Embarked vs Survived
plt.figure(figsize=(4,3))
sns.countplot(data=train, x='Embarked', hue='Survived')
plt.title('Survival by Embarked')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_survival_by_embarked.png', dpi=150)
plt.close()

# 4) Age distribution by Survived
plt.figure(figsize=(5,3))
sns.kdeplot(data=train, x='Age', hue='Survived', common_norm=False)
plt.title('Age Distribution by Survival')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_age_kde_by_survival.png', dpi=150)
plt.close()

# 5) Fare distribution by Survived
plt.figure(figsize=(5,3))
sns.kdeplot(data=train, x='Fare', hue='Survived', common_norm=False, bw_adjust=1.2, cut=0)
plt.xlim(0, train['Fare'].quantile(0.98))
plt.title('Fare Distribution by Survival')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_fare_kde_by_survival.png', dpi=150)
plt.close()

#6) Age distribution by Survived (adjusted)
plt.figure(figsize=(5,3))
sns.kdeplot(data=train, x='Age', hue='Survived', common_norm=False, bw_adjust=1.2, cut=0)
plt.xlim(0, train['Age'].quantile(0.98))
plt.title('Age Distribution by Survival')
plt.tight_layout()
plt.savefig(f'{PIC_PATH}/eda_age_kde_by_survival.png', dpi=150)
plt.close()

print('Saved plots to pic/:','eda_survival_by_sex.png','eda_survival_by_pclass.png','eda_survival_by_embarked.png','eda_age_kde_by_survival.png','eda_fare_kde_by_survival.png')

Saved plots to pic/: eda_survival_by_sex.png eda_survival_by_pclass.png eda_survival_by_embarked.png eda_age_kde_by_survival.png eda_fare_kde_by_survival.png


In [59]:
# EDA: class balance and survival rates by key features
print('Class balance (Survived=1):')
print(train['Survived'].value_counts(normalize=True).rename('ratio'))
print()

def rate_table(col):
    tbl = (train.groupby(col)['Survived']
           .agg(['count','mean'])
           .rename(columns={'mean':'survival_rate'})
           .sort_values('survival_rate', ascending=False))
    return tbl

for col in ['Sex','Pclass','Embarked','Title','AgeBin','FareBand','IsAlone']:
    print(f'\nSurvival by {col}:')
    display(rate_table(col))

Class balance (Survived=1):
Survived
0    0.616162
1    0.383838
Name: ratio, dtype: float64


Survival by Sex:


Unnamed: 0_level_0,count,survival_rate
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,314,0.742038
male,577,0.188908



Survival by Pclass:


Unnamed: 0_level_0,count,survival_rate
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,216,0.62963
2,184,0.472826
3,491,0.242363



Survival by Embarked:


Unnamed: 0_level_0,count,survival_rate
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,168,0.553571
Q,77,0.38961
S,646,0.339009



Survival by Title:


Unnamed: 0_level_0,count,survival_rate
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Mrs,125,0.792
Miss,182,0.697802
Master,40,0.575
Rare,27,0.444444
Mr,517,0.156673



Survival by AgeBin:


  tbl = (train.groupby(col)['Survived']


Unnamed: 0_level_0,count,survival_rate
AgeBin,Unnamed: 1_level_1,Unnamed: 2_level_1
Child,73,0.575342
Teen,143,0.426573
MidAge,137,0.423358
Adult,516,0.341085
Senior,22,0.227273



Survival by FareBand:


Unnamed: 0_level_0,count,survival_rate
FareBand,Unnamed: 1_level_1,Unnamed: 2_level_1
3,222,0.581081
2,222,0.454955
1,224,0.303571
0,223,0.197309



Survival by IsAlone:


Unnamed: 0_level_0,count,survival_rate
IsAlone,Unnamed: 1_level_1,Unnamed: 2_level_1
0,354,0.50565
1,537,0.303538


## Exploratory Data Analysis (EDA)
We’ll explore class balance and how survival relates to key features (Sex, Pclass, Embarked, Title, Age, Fare, FamilySize, IsAlone). This will inform the model choice and feature treatment.

Goals:
- Identify strongest signals and interactions
- Check non-linear relationships (Age, Fare)
- Save a few plots under `pic/` for reference

In [None]:
# 4. Select & encode features
train_final = train.copy()
test_final = test.copy()
# remove 'HasCabin','CabinLetter'
keep_cols = [
    'Survived','PassengerId','Pclass','Sex', 'Fare', 'SibSp',
    'Embarked','Title','FamilySize','IsAlone'
]
print(train_final['Embarked'].unique())

train_final = train_final[keep_cols].copy()
test_final = test_final[keep_cols[1:]].copy()  # no Survived in test

# One-hot encode(remove 'CabinLetter')
to_onehot = ['Embarked','Title']
train_final = pd.get_dummies(train_final, columns=to_onehot, drop_first=False)
test_final  = pd.get_dummies(test_final,  columns=to_onehot, drop_first=False)

# Align train/test columns


train_final['Sex'] = train_final['Sex'].map({'male':1,'female':0}).astype(int)
test_final['Sex'] = test_final['Sex'].map({'male':1,'female':0}).astype(int)
# Check missing
print('Any nulls left:', train_final.isna().sum().sum())
print('Any nulls left in test:', test_final.isna().sum().sum())

all_cols = sorted(set(train_final.columns).union(test_final.columns))
train_final = train_final.reindex(columns=all_cols, fill_value=0)
if 'Survived' in test_final.columns:
    test_final = test_final.drop(columns=['Survived'])
test_final = test_final.reindex(columns=[c for c in all_cols if c != 'Survived'], fill_value=0)

print('Final train columns:', train_final.columns.tolist())
# Save processed (unscaled)
os.makedirs(PROCESS_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)
train_final.to_csv(f'{PROCESS_PATH}/titanic_train_preprocessed.csv', index=False)
test_final.to_csv(f'{PROCESS_PATH}/titanic_test_preprocessed.csv', index=False)
print('Saved processed/titanic_train_preprocessed.csv and processed/titanic_test_preprocessed.csv')

['S' 'C' 'Q']
Any nulls left: 0
Any nulls left in test: 0
Final train columns: ['AgeBin_Adult', 'AgeBin_Child', 'AgeBin_MidAge', 'AgeBin_Senior', 'AgeBin_Teen', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize', 'Fare', 'IsAlone', 'PassengerId', 'Pclass', 'Sex', 'SibSp', 'Survived', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Rare']
Saved processed/titanic_train_preprocessed.csv and processed/titanic_test_preprocessed.csv


In [61]:
# disable becouse random forest and boosting tree model do not need scaling

# from sklearn.preprocessing import StandardScaler
# import joblib

# num_cols = ['Age','Fare','FamilySize']
# scaler = StandardScaler()
# train_final[num_cols] = scaler.fit_transform(train_final[num_cols])


# train_final.to_csv('processed/titanic_train_preprocessed_scaled.csv', index=False)
# print('Saved scaled processed data and scaler')

### EDA summary and modeling decisions
From the EDA: Sex, Pclass, Title and Fare/Age (non-linear) are strong predictors. We'll keep one-hot encodings for multi-category features and a binary 0/1 for Sex. We will train a baseline RandomForest, evaluate on a hold-out set, then run a randomized search to tune key hyperparameters (n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features).

In [62]:
# Feature importances and save figure
def plot_feature_importances(model, feature_names, top_n=20, fname=None):
    importances = pd.Series(model.feature_importances_, index=feature_names)
    imp = importances.sort_values(ascending=False).head(top_n)
    plt.figure(figsize=(6, max(3, top_n*0.3)))
    sns.barplot(x=imp.values, y=imp.index, orient='h')
    plt.title(f'Top {top_n} Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    if fname:
        plt.savefig(fname, dpi=150)
        plt.close()
    else:
        plt.show()

os.makedirs('pic', exist_ok=True)

print('Saved feature importances to pic/rf_feature_importances_top20.png')

Saved feature importances to pic/rf_feature_importances_top20.png


## Model training: RandomForest (guided by EDA)
We’ll train a RandomForest on the processed features (`train_final`) and evaluate with a hold-out set, then tune hyperparameters with cross-validation. Trees handle mixed scales and one-hot features well, so scaling is optional.

## Results summary
- EDA highlights:
  - Class balance: Survived ≈ 38.4%, Not survived ≈ 61.6%
  - Survival by Sex: female ≈ 74.2% vs male ≈ 18.9%
  - Survival by Pclass: 1st ≈ 63.0%, 2nd ≈ 47.3%, 3rd ≈ 24.2%
  - Embarked: C ≈ 55.4% > Q ≈ 39.0% > S ≈ 33.9%
  - Age: Children highest survival; Seniors lowest
  - Fare: Higher fare bands correlate with higher survival
  - IsAlone: traveling with family improves survival
- Modeling decisions:
  - Keep one-hot for multi-category features (Embarked, Title, TicketPrefix, AgeBin) and binary 0/1 for Sex.
  - Tree-based model (RandomForest) suited for mixed types and non-linearities; scaling optional.
  - Consider class imbalance; tuning selected class_weight=None for best ROC AUC on CV.
- Model performance:
  - Baseline RF (hold-out): accuracy = 0.8156, ROC AUC = 0.8418
  - Tuned RF (hold-out): accuracy = 0.8492, ROC AUC = 0.8583
- Artifacts saved:
  - Processed data: processed/titanic_train_preprocessed.csv, processed/titanic_train_preprocessed_scaled.csv
  - Scaler: processed/standard_scaler_titanic.pkl
  - Models: processed/rf_baseline.pkl, processed/rf_best.pkl
  - Figures: pic/eda_*.png, pic/rf_feature_importances_top20.png

## EDA-driven changes to RandomForest
Based on the EDA, here are targeted adjustments for the RandomForest:
- Trim low-signal one-hot columns by selecting the top features from mutual information (reduces noise/sparsity).
- Constrain tree depth and use a slightly larger leaf size to prevent overfitting on many sparse dummy variables (max_depth≈6–12, min_samples_leaf≥2).
- Keep feature subsampling to encourage diversity (max_features='sqrt' or ~50%).
- Consider class imbalance; compare class_weight in {None, 'balanced'} via CV and prefer the winner.
- Use bootstrap with OOB score for an extra generalization signal.

We’ll implement: select top-K MI features, fit an "EDA-informed" RF, and compare to the previous models.

### Outcomes and recommendations
- Feature selection: using MI top-k kept 22 of the highest-signal features, simplifying the model without hurting AUC.
- Hyperparameters favored by CV on selected features:
  - class_weight='balanced'
  - max_depth around 10–12
  - min_samples_leaf≈2–3, min_samples_split≈2–4
  - n_estimators≈700–900, max_features='sqrt'
- Metrics were comparable or slightly improved vs the baseline while using fewer features; OOB score tracked hold‑out performance well.

Next steps (optional):
- Tune decision threshold for the positive class to favor recall or F1 depending on your goal.
- Try GradientBoosting/Histogram-based RandomForest (if available) or XGBoost/LightGBM for potentially higher AUC.
- Export a submission by applying the same preprocessing and the saved model `processed/rf_best_eda.pkl` to the test set.