In [2]:
# ============================
# 00. Import libraries
# ============================
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ============================
# 01. Load the processed file (original numeric columns kept, already one-hot encoded)
# ============================
path = "../data/processed/BankChurners_processed.csv"

rows = []
# newline="" is what the csv module expects from its file object, and the
# explicit encoding keeps the read independent of the machine's locale.
with open(path, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    # The first record is treated as the header: kept in `header`, never parsed as data.
    header = next(reader, None)
    if header is None:
        raise ValueError(f"{path} is empty: expected a header row followed by numeric rows")
    for row in reader:
        # csv.reader yields an empty list for a blank line; skip it instead of
        # appending a zero-length row that would make the array ragged.
        if not row:
            continue
        if rows and len(row) != len(rows[0]):
            raise ValueError(
                f"{path}, line {reader.line_num}: expected {len(rows[0])} fields, got {len(row)}"
            )
        try:
            # Convert every field to float
            values = [float(x) for x in row]
        except ValueError as exc:
            raise ValueError(
                f"{path}, line {reader.line_num}: non-numeric value in {row!r}"
            ) from exc
        rows.append(values)

if not rows:
    raise ValueError(f"{path} contains a header but no data rows")

data = np.array(rows)
# Every column except the last is used as a feature; the last column is used as the label.
# NOTE(review): assumes the preprocessing step wrote the target as the final column — confirm.
X = data[:, :-1]
y = data[:, -1].astype(int)
# astype(int) truncates silently, so verify the label column really held integer values.
if not np.array_equal(y, data[:, -1]):
    raise ValueError(f"{path}: the last column (label) contains non-integer values")

print("Shape X:", X.shape)
print("Shape y:", y.shape)

Shape X: (10127, 34)
Shape y: (10127,)


In [3]:
# ============================
# 02. Train/test split
# ============================
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# ============================
# 03. Standardize features for Logistic Regression
# ============================
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# ============================
# 04. Logistic Regression
# ============================
# Train on the standardized training features and predict the standardized test features.
log_model = LogisticRegression(max_iter=2000).fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

# Compute the evaluation metrics first, then print them in a fixed order.
log_accuracy = accuracy_score(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)
log_confusion = confusion_matrix(y_test, y_pred_log)

print("\n===== Logistic Regression =====")
print("Accuracy:", log_accuracy)
print(log_report)
print("Confusion Matrix:\n", log_confusion)


===== Logistic Regression =====
Accuracy: 0.8998025666337611
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      1701
           1       0.77      0.54      0.63       325

    accuracy                           0.90      2026
   macro avg       0.84      0.75      0.79      2026
weighted avg       0.89      0.90      0.89      2026

Confusion Matrix:
 [[1648   53]
 [ 150  175]]


In [6]:
# ============================
# 05. Random Forest (no scaling needed)
# ============================
# Train on the unscaled training features and predict the unscaled test features.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Compute the evaluation metrics first, then print them in a fixed order.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("\n===== Random Forest =====")
print("Accuracy:", rf_accuracy)
print(rf_report)
print("Confusion Matrix:\n", rf_confusion)


===== Random Forest =====
Accuracy: 0.9545903257650543
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1701
           1       0.94      0.77      0.84       325

    accuracy                           0.95      2026
   macro avg       0.95      0.88      0.91      2026
weighted avg       0.95      0.95      0.95      2026

Confusion Matrix:
 [[1685   16]
 [  76  249]]


In [7]:
# ============================
# 06. Label distribution
# ============================
# Print the per-class row counts, first for the full label vector and then
# for the training labels.
for heading, labels in (
    ("\nFull dataset label distribution:", y),
    ("\nTrain distribution:", y_train),
):
    unique, counts = np.unique(labels, return_counts=True)
    print(heading)
    for u, c in zip(unique, counts):
        print(u, c)


Full dataset label distribution:
0 8500
1 1627

Train distribution:
0 6799
1 1302
