In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the later
# cells read their CSV inputs from, and write the submission to, paths under
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# STEP 1: LOAD FEATURES
# Every input lives under one project folder on the mounted Drive; keeping the
# prefix in one place avoids repeating (and mistyping) it for each file.
DATA_DIR = '/content/drive/My Drive/group6_Final_DP/data'
RAW_DIR = f'{DATA_DIR}/raw_data'
PROCESSED_DIR = f'{DATA_DIR}/processed_data'

print("Loading feature files...")
application_train = pd.read_csv(f'{RAW_DIR}/application_train.csv')
application_test = pd.read_csv(f'{RAW_DIR}/application_test.csv')
bureau_features = pd.read_csv(f'{PROCESSED_DIR}/bureau_feature.csv')
credit_card_features = pd.read_csv(f'{PROCESSED_DIR}/ccb_feature.csv')
installments_features = pd.read_csv(f'{PROCESSED_DIR}/ip_feature.csv')
pos_cash_features = pd.read_csv(f'{PROCESSED_DIR}/pcb_feature.csv')
previous_application_features = pd.read_csv(f'{PROCESSED_DIR}/pa_feature.csv')


# STEP 2: MERGE FEATURES
def merge_feature_tables(applications, feature_tables, key='SK_ID_CURR'):
    """Left-join each table in `feature_tables` onto `applications` by `key`.

    Parameters
    ----------
    applications : pandas.DataFrame
        Base table; every one of its rows is kept (left join).
    feature_tables : iterable of pandas.DataFrame
        Tables merged in the given order, each on `key`.
    key : str
        Name of the join column shared by all tables.

    Returns
    -------
    pandas.DataFrame
        `applications` with the columns of every feature table appended.
    """
    # NOTE(review): a left join keeps exactly one row per application only if
    # `key` is unique in each feature table -- confirm for the processed files.
    merged = applications
    for table in feature_tables:
        merged = merged.merge(table, on=key, how='left')
    return merged


print("Merging features...")
# The same feature tables, in the same order, are attached to train and test.
feature_tables = [
    bureau_features,
    credit_card_features,
    installments_features,
    pos_cash_features,
    previous_application_features,
]

# Merge train data
data_train = merge_feature_tables(application_train, feature_tables)

# Merge test data
data_test = merge_feature_tables(application_test, feature_tables)


Loading feature files...
Merging features...


In [None]:
# STEP 3: IMPUTE MISSING VALUES
print("Imputing missing values...")

# Split the columns into numeric and non-numeric groups.
# TARGET is the label, so it is kept out of the imputed feature columns.
numeric_cols = data_train.select_dtypes(include=[np.number]).columns.drop('TARGET', errors='ignore')
non_numeric_cols = data_train.select_dtypes(exclude=[np.number]).columns

# Fit each imputer on the training data only and reuse the learned fill values
# for the test data. Fitting a second imputer on test would fill the two sets
# with different statistics, and a column that is entirely missing in test
# would be dropped by the imputer and break the column assignment below.
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Numeric columns: fill missing values with the training median
data_train_numeric = pd.DataFrame(
    numeric_imputer.fit_transform(data_train[numeric_cols]),
    columns=numeric_cols,
    index=data_train.index,
)
data_test_numeric = pd.DataFrame(
    numeric_imputer.transform(data_test[numeric_cols]),
    columns=numeric_cols,
    index=data_test.index,
)

# Non-numeric columns: fill missing values with the most frequent training value
data_train_non_numeric = pd.DataFrame(
    categorical_imputer.fit_transform(data_train[non_numeric_cols]),
    columns=non_numeric_cols,
    index=data_train.index,
)
data_test_non_numeric = pd.DataFrame(
    categorical_imputer.transform(data_test[non_numeric_cols]),
    columns=non_numeric_cols,
    index=data_test.index,
)

# Reassemble the imputed frames. The original index is preserved above so the
# column-wise concat lines up row by row with TARGET.
data_train = pd.concat([data_train_numeric, data_train_non_numeric, data_train[['TARGET']]], axis=1)  # put TARGET back on train
data_test = pd.concat([data_test_numeric, data_test_non_numeric], axis=1)  # test has no TARGET


Imputing missing values...


In [None]:
# STEP 4: FEATURE SELECTION
print("Selecting important features...")

# Number of features kept by the univariate filter below
N_SELECTED_FEATURES = 50

# Separate the label and drop the identifier column from the features
X = data_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = data_train['TARGET']

# One-hot encode the non-numeric columns. The training frame defines the
# feature space; drop_first removes one level per encoded column there.
X = pd.get_dummies(X, drop_first=True)

# Encode test with every level kept, then project it onto the training columns.
# Encoding test with drop_first=True could drop a different level than train
# did whenever the two sets do not contain the same categories, and the
# zero-filled replacement column would then mis-encode those rows.
# After reindexing: levels dropped or never seen in train are discarded, and
# levels absent from test become all-zero columns.
X_test = pd.get_dummies(data_test.drop(columns=['SK_ID_CURR']), drop_first=False)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Keep the top-scoring features (capped at the number of available columns)
# NOTE(review): f_regression is a regression scorer while TARGET is a class
# label; f_classif is the conventional choice -- confirm whether to switch.
# NOTE(review): the selector and scaler are fitted on all training rows before
# the validation split in the next step, so validation scores may be optimistic.
selector = SelectKBest(score_func=f_regression, k=min(N_SELECTED_FEATURES, X.shape[1]))
X_selected = selector.fit_transform(X, y)

# Recover the names of the selected features
selected_features = X.columns[selector.get_support()]
X = pd.DataFrame(X_selected, columns=selected_features)

# Apply the same column filter to the test set
X_test = X_test[selected_features]

# STEP 5: SCALING DATA
print("Scaling features...")
# The scaler is fitted on train and only applied to test
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

Selecting important features...
Scaling features...


In [None]:
# STEP 6: TRAIN AND STACK
# Hold out 20% of the training rows for validation. The positive class is rare,
# so the split is stratified to keep the same class ratio on both sides.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Base models (only configured here; they are fitted inside the stacking classifier)
print("Training base models...")
lr_model = LogisticRegression(max_iter=1000, random_state=42)
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# Stacking Classifier: a logistic regression combines the base-model outputs
print("Training stacking classifier...")
stacking_classifier = StackingClassifier(
    estimators=[('lr', lr_model), ('dt', dt_model)],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42)
)
stacking_classifier.fit(X_train, y_train)

# Validation predictions: hard labels for the report, and class-1 probabilities
# because the submission cell also outputs probabilities rather than labels
y_val_pred = stacking_classifier.predict(X_val)
y_val_proba = stacking_classifier.predict_proba(X_val)[:, 1]

# Evaluate model
print("Accuracy on validation set:", accuracy_score(y_val, y_val_pred))
# Accuracy alone is dominated by the majority class; ROC AUC scores the ranking
# quality of the predicted probabilities.
print("ROC AUC on validation set:", roc_auc_score(y_val, y_val_proba))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Training base models...
Training stacking classifier...
Accuracy on validation set: 0.9176862729157351

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.99      0.96     45215
           1       0.43      0.05      0.09      3987

    accuracy                           0.92     49202
   macro avg       0.68      0.52      0.52     49202
weighted avg       0.88      0.92      0.89     49202



In [None]:
# STEP 7: EVALUATE MODEL

def evaluate_model(y_true, y_pred, model_name, average='weighted'):
    """Print summary classification metrics and the per-class report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with `y_true`.
    model_name : str
        Label printed in the report header.
    average : str, default 'weighted'
        Averaging mode passed to precision/recall/F1. With 'weighted', recall
        is the support-weighted mean of per-class recall, which equals
        accuracy; pass 'binary' to score only the positive class.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the value the default produces for a class with no
    # predictions, without emitting an undefined-metric warning.
    precision = precision_score(y_true, y_pred, average=average, zero_division=0)
    recall = recall_score(y_true, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=average, zero_division=0)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 30)
    print("Classification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

evaluate_model(y_val, y_val_pred, "Stacking Classifier")


Model: Stacking Classifier
Accuracy: 0.9177
Precision: 0.8822
Recall: 0.9177
F1 Score: 0.8863
------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     45215
           1       0.43      0.05      0.09      3987

    accuracy                           0.92     49202
   macro avg       0.68      0.52      0.52     49202
weighted avg       0.88      0.92      0.89     49202



In [None]:
# STEP 8: PREDICT AND SAVE SUBMISSION
print("Generating predictions for test set...")

# Probability of the positive class (class 1) for every test row
y_test_proba = stacking_classifier.predict_proba(X_test)[:, 1]

# Build the submission table: SK_ID_CURR from the raw test file alongside the
# predicted probability, then write it to the project's output folder
submission = application_test[['SK_ID_CURR']].assign(TARGET=y_test_proba)
submission_path = '/content/drive/My Drive/group6_Final_DP/data/output/submission.csv'
submission.to_csv(submission_path, index=False)
print("Submission file created: submission.csv")


Generating predictions for test set...
Submission file created: submission.csv
