<a href="https://colab.research.google.com/github/thanhvietnguyen/Titanic-ML-project/blob/main/src/training_evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Training

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

import joblib

In [2]:
# ==== Reload and prepare the data ====
# Read the training CSV, drop the columns that are not used as features, and
# derive FamilySize = SibSp + Parch + 1 (the +1 presumably counts the
# passenger themself).
df = (
    pd.read_csv("https://raw.githubusercontent.com/thanhvietnguyen/Titanic-ML-project/refs/heads/main/data/train.csv")
    .drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
    .assign(FamilySize=lambda frame: frame["SibSp"] + frame["Parch"] + 1)
)

# Separate the target column from the feature matrix.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [3]:
# ==== Rebuild the preprocessor ====
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Column groups; each group is routed to its own sub-pipeline below.
numeric_features = ["Age", "Fare", "FamilySize"]
categorical_features = ["Sex", "Embarked", "Pclass"]

# Numeric columns: fill missing values with the median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown="ignore" keeps transform from failing on
# categories that were not seen during fit.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Only the columns named in the two lists are transformed. With
# ColumnTransformer's default `remainder` setting the other columns
# (e.g. SibSp, Parch) should be dropped -- verify against the sklearn docs.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_pipeline, categorical_features),
])


In [4]:
# ==== Split into train/test sets ====
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# ==== Train Logistic Regression ====
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Give this pipeline its own unfitted copy of the preprocessor. A Pipeline
# fits its steps in place (no cloning unless `memory` is set), so passing the
# shared `preprocessor` object directly would let any other pipeline built
# from it refit -- and silently change -- this model's preprocessing.
logreg_model = make_pipeline(clone(preprocessor), LogisticRegression(max_iter=1000))
logreg_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cell.
y_pred_lr = logreg_model.predict(X_test)

In [6]:
# ==== Train Random Forest ====
from sklearn.base import clone

# Use an independent, unfitted copy of the preprocessor: a Pipeline fits its
# steps in place, so reusing the module-level `preprocessor` instance would
# tie this model's fitted transformer to every other pipeline that holds the
# same object. The saved pipeline then owns its preprocessing outright.
rf_model = make_pipeline(
    clone(preprocessor),
    RandomForestClassifier(n_estimators=100, random_state=42),
)
rf_model.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cell.
y_pred_rf = rf_model.predict(X_test)

# Model Evaluation

In [7]:
# ==== Evaluate the two models ====
def evaluate_model(name, y_true, y_pred):
    """Print classification metrics for one model's predictions.

    Args:
        name: Label shown in the report header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels scored against ``y_true``.

    Returns:
        None. Accuracy, precision, recall, F1 score and the confusion matrix
        are written to stdout, followed by a 50-dash separator line.
    """
    print(f"🔍 Đánh giá mô hình: {name}")

    # Scalar metrics share the same (y_true, y_pred) call shape, so they are
    # reported from a single table in a fixed order.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric_fn in scalar_metrics:
        print(label, metric_fn(y_true, y_pred))

    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("-" * 50)

# Report both models on the same held-out labels, Logistic Regression first.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate_model(model_name, y_test, predictions)

🔍 Đánh giá mô hình: Logistic Regression
Accuracy: 0.7988826815642458
Precision: 0.7794117647058824
Recall: 0.7162162162162162
F1 Score: 0.7464788732394366
Confusion Matrix:
 [[90 15]
 [21 53]]
--------------------------------------------------
🔍 Đánh giá mô hình: Random Forest
Accuracy: 0.8100558659217877
Precision: 0.7777777777777778
Recall: 0.7567567567567568
F1 Score: 0.7671232876712328
Confusion Matrix:
 [[89 16]
 [18 56]]
--------------------------------------------------


In [20]:
# ==== Save the best model ====
# rf_model is the full pipeline (preprocessor + Random Forest), so the pickle
# carries the preprocessing steps together with the classifier.
joblib.dump(value=rf_model, filename="best_model.pkl")
print("Đã lưu mô hình tốt nhất vào best_model.pkl")

Đã lưu mô hình tốt nhất vào best_model.pkl
