In [2]:
# Cell 1: Import all required libraries

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Cell 2: Load Titanic dataset (tested.csv)
# The path is relative, so the CSV must be in the kernel's working directory.

csv_path = "tested.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows (head()'s default) as the cell output.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Cell 3: Select model features and target variable

# Label column the model is trained to predict.
target = "Survived"

# Predictor columns, one per line so additions/removals diff cleanly.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# X: predictor DataFrame (columns in the order listed above); y: label Series.
X = df[features]
y = df[target]

# Preview the predictors only.
X.head(n=5)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [5]:
# Cell 4: Identify numeric and categorical columns

# Columns routed to the numeric preprocessing branch.
# The list order is relied on later when feature names are rebuilt by hand
# for the importance table, so keep it stable.
numeric_cols = [
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Pclass",
]

# Columns routed to the categorical (one-hot) preprocessing branch.
categorical_cols = [
    "Sex",
    "Embarked",
]

# Display both groups as the cell output.
(numeric_cols, categorical_cols)


(['Age', 'SibSp', 'Parch', 'Fare', 'Pclass'], ['Sex', 'Embarked'])

In [6]:
# Cell 5: Build preprocessing pipelines

# Numeric branch: fill missing values with the column median.
median_imputer = SimpleImputer(strategy="median")
numeric_transform = Pipeline([("imputer", median_imputer)])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform from raising
# on a category that was not present when the encoder was fitted.
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_transform = Pipeline([
    ("imputer", mode_imputer),
    ("onehot", one_hot_encoder),
])

# Route each column group to its branch. The names "num" / "cat" and the
# step names above are looked up again when feature importances are built,
# so they must not be renamed in isolation.
column_routes = [
    ("num", numeric_transform, numeric_cols),
    ("cat", categorical_transform, categorical_cols),
]
preprocess = ColumnTransformer(column_routes)

# Show the (still unfitted) transformer as the cell output.
preprocess


In [7]:
# Cell 6: Split into train and test sets

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions similar in both parts; the fixed seed makes the split repeatable.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report (rows, columns) of each part.
(X_train.shape, X_test.shape)


((334, 7), (84, 7))

In [8]:
# Cell 7: Create the Random Forest model inside a full pipeline

# Classifier configuration: 300 trees, depth capped at 6, fixed seed so
# repeated runs build the same forest.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    random_state=42,
)

# Chain preprocessing and the classifier so that fit/predict always apply
# the same transformations. The step names are referenced by later cells.
model = Pipeline([
    ("preprocess", preprocess),
    ("rf", forest),
])

# Show the assembled pipeline as the cell output.
model


In [9]:
# Cell 8: Train the model

# Fit every pipeline step on the training split only; the held-out split is
# not touched here.
model.fit(X=X_train, y=y_train)

print("Model training completed.")


Model training completed.


In [10]:
# Cell 9: Model Evaluation
#
# NOTE(review): the recorded run of this cell reports accuracy 1.0 with
# perfect precision/recall for both classes, which is unusual for real
# survival data. Possibly "Survived" is fully determined by a single feature
# (Sex?) in this file -- TODO confirm, e.g. with
# pd.crosstab(df["Sex"], df["Survived"]), before trusting these metrics.

# Predict on the held-out split and score against the true labels.
y_pred = model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

# Per-class precision / recall / F1 as a text table.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Random Forest Model Accuracy:", acc)
print("\nClassification Report:")
print(report)


Random Forest Model Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        31

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [11]:
# Cell 10: Predict male vs female survival probability

# Attributes shared by both hypothetical passengers; only "Sex" differs.
# NOTE(review): Age and Fare look like dataset medians -- TODO confirm
# where these "typical" values come from.
hypo = {
    "Pclass": 3,
    "Age": 27,
    "SibSp": 0,
    "Parch": 0,
    "Fare": 14.4542,
    "Embarked": "S"
}

# Derive the two passengers without mutating the shared template.
male_passenger = {**hypo, "Sex": "male"}
female_passenger = {**hypo, "Sex": "female"}

# Row 0 is the male passenger, row 1 the female passenger. The pipeline's
# column transformer selects columns by name, so column order is irrelevant.
hypo_df = pd.DataFrame([male_passenger, female_passenger])

# predict_proba returns one column per class; column 1 is taken as the
# "survived" probability (assumes the classes are ordered [0, 1] -- TODO
# confirm against model.classes_).
class_probabilities = model.predict_proba(hypo_df)
pred_proba = class_probabilities[:, 1]

print("Predicted Survival Probability:")
print(f"Male   : {pred_proba[0]:.4f}")
print(f"Female : {pred_proba[1]:.4f}")


Predicted Survival Probability:
Male   : 0.0031
Female : 0.9931


In [12]:
# Cell 11: Show feature importances (optional)

# Fitted classifier and fitted one-hot encoder, both looked up by name.
# named_transformers_ is used instead of transformers_[1][1] so the lookup
# keeps working if the ColumnTransformer entries are ever reordered.
rf = model.named_steps["rf"]
ohe = (
    model.named_steps["preprocess"]
    .named_transformers_["cat"]
    .named_steps["onehot"]
)

# Get encoded categorical feature names (e.g. "Sex_male", "Embarked_S")
cat_features = list(ohe.get_feature_names_out(categorical_cols))

# Full feature list, rebuilt by hand in the same order the ColumnTransformer
# emits its output: the numeric branch first, then the one-hot columns.
full_features = numeric_cols + cat_features

# Guard the hand-built list: this catches a count mismatch (e.g. a column
# group changed without updating this cell) but cannot detect a reordering.
assert len(full_features) == len(rf.feature_importances_), (
    f"expected {len(rf.feature_importances_)} feature names, "
    f"got {len(full_features)}"
)

# One row per transformed feature, most important first.
importances = pd.DataFrame({
    "feature": full_features,
    "importance": rf.feature_importances_
}).sort_values(by="importance", ascending=False)

importances


Unnamed: 0,feature,importance
6,Sex_male,0.497105
5,Sex_female,0.44069
3,Fare,0.024635
2,Parch,0.015353
0,Age,0.009051
4,Pclass,0.00502
1,SibSp,0.003423
8,Embarked_Q,0.002715
9,Embarked_S,0.001192
7,Embarked_C,0.000817
