In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.svm import SVC

In [11]:
# !conda install imbalanced-learn
import pickle

test_data

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,15000,130.0,D-penicillamine,16944.0,F,Y,Y,Y,N,17.4,,3.00,182.0,559.0,119.35,,401.0,11.0,4.0
1,15001,2574.0,D-penicillamine,17664.0,F,N,Y,N,N,0.9,242.0,3.65,108.0,1040.0,108.50,118.0,344.0,11.0,3.0
2,15002,3853.0,Placebo,13736.0,F,N,N,Y,N,1.6,354.0,3.80,44.0,1584.0,111.60,108.0,277.0,10.3,4.0
3,15003,2249.0,,23011.0,F,,,,N,0.9,,3.06,,,,,190.0,11.5,4.0
4,15004,1150.0,,17046.0,F,,,,N,0.7,,3.66,,,,,350.0,10.3,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,24995,2713.0,,17532.0,F,,,,N,1.1,,3.75,,,,,330.0,9.9,3.0
9996,24996,2580.0,D-penicillamine,25569.0,F,N,N,N,N,0.4,,4.01,20.0,666.0,54.25,,277.0,10.0,3.0
9997,24997,186.0,Placebo,21483.0,F,N,Y,Y,S,6.6,1000.0,3.50,188.0,944.0,130.20,133.0,265.0,11.0,4.0
9998,24998,2221.0,Placebo,16728.0,F,N,Y,N,N,0.9,434.0,3.36,161.0,1523.0,117.80,166.0,381.0,9.9,2.0


In [31]:
# Load datasets

# #only given train_data
# X_simple_xgb=np.save('X_train_simple_xgb.npy',X_simple_xgb)
# train_label=np.save('X_train_label.npy',train_label)

#pseudo label 0.8 ver train data

X_simple_xgb = np.load('X_pseudo_simple_xgb.npy')
train_label=np.load("pseudo_train_label.npy")

test_data = pd.read_csv('test.csv')
test_id=test_data['id']
X_test_simple_xgb=np.load("X_test_simple_xgb.npy")

# # SMOTE for handling class imbalance
# smote = SMOTE()
# X_resampled, y_resampled = smote.fit_resample(X_simple_xgb, train_label)

#smote 없이 하기!
X_resampled=X_simple_xgb
y_resampled=train_label

dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test_simple_xgb)


# Model definitions
xgb_model = xgb.XGBClassifier(objective='multi:softprob', num_class=3, tree_method='hist', device='cuda', random_state=42)
rf = RandomForestClassifier()
cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, loss_function='MultiClass', random_seed=42, verbose=200)
lgb_model = lgb.LGBMClassifier(objective='multiclass', num_class=3, random_state=42)

# Load the best params from the pickle file
with open("best_params.pkl", "rb") as file:
    best_params = pickle.load(file)

# You can now use the loaded parameters
print(best_params)

best_params = best_params

# Ensemble model using stacking
estimators = [
    ('xgb', xgb.XGBClassifier(**best_params, objective='multi:softprob', num_class=3, tree_method='hist', device='cuda', random_state=42)),
    ('catboost', cat_model),
    ('lgb', lgb_model)
]
stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=10000))

# Cross-validation
stack_scores = cross_val_score(stack_model, X_resampled, y_resampled, cv=5, scoring='accuracy')
print(f'Stacked model accuracy: {stack_scores.mean()}') #no Pseudo 이용시

#Stacked model accuracy: 0.9379823641166685 이거 나옴..처음에 -->random search는 내가 초반에 이미함. (using pseudo)

# Train the stacked model and predict
stack_model.fit(X_resampled, y_resampled)
test_preds = stack_model.predict_proba(X_test_simple_xgb)
print(f"Stacking Model Log Loss: {log_loss_stack}")

# Export submission
test_id = test_data['id']
output_df = pd.DataFrame(test_preds, columns=['Status_C', 'Status_CL', 'Status_D'])
output_df.insert(0, 'id', test_id)
output_df.to_csv('final_submission_no_smote_ensemble.csv', index=False)

# CatBoost and LightGBM for additional predictions
# cat_model.fit(X_resampled, y_resampled)
# lgb_model.fit(X_resampled, y_resampled)

# # Predict using CatBoost and LightGBM
# cat_preds = cat_model.predict_proba(X_test_simple_xgb)
# lgb_preds = lgb_model.predict_proba(X_test_simple_xgb)

# # Export CatBoost predictions
# output_cat = pd.DataFrame(cat_preds, columns=['Status_C', 'Status_CL', 'Status_D'])
# output_cat.insert(0, 'id', test_id)
# output_cat.to_csv('final_submission_catboost.csv', index=False)

# # Export LightGBM predictions
# output_lgb = pd.DataFrame(lgb_preds, columns=['Status_C', 'Status_CL', 'Status_D'])
# output_lgb.insert(0, 'id', test_id)
# output_lgb.to_csv('final_submission_lightgbm.csv', index=False)


{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 0.8}
0:	learn: 1.0274453	total: 87.4ms	remaining: 1m 27s
200:	learn: 0.2422674	total: 3.55s	remaining: 14.1s
400:	learn: 0.2182612	total: 6.86s	remaining: 10.3s
600:	learn: 0.2013990	total: 10.6s	remaining: 7.04s
800:	learn: 0.1873033	total: 14.5s	remaining: 3.6s
999:	learn: 0.1749912	total: 17.6s	remaining: 0us
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2318
[LightGBM] [Info] Number of data points in the train set: 17607, number of used features: 18
[LightGBM] [Info] Start training from score -0.362444
[LightGBM] [Info] Start training from score -4.123563
[LightGBM] [Info] Start training from score -1.245350
0:	learn: 1.0252476	total: 14ms	remaining: 13.9s
200:	learn: 0.215542

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2315
[LightGBM] [Info] Number of data points in the train set: 14086, number of used features: 18
[LightGBM] [Info] Start training from score -0.362493
[LightGBM] [Info] Start training from score -4.123591
[LightGBM] [Info] Start training from score -1.245231
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2314
[LightGBM] [Info] Number of data points in the train set: 14086, number of used features: 18
[LightGBM] [Info] Start training from score -0.362391
[LightGBM] [Info] Start training from score -4.123591
[LightGBM] [Info] Start 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2314
[LightGBM] [Info] Number of data points in the train set: 14085, number of used features: 18
[LightGBM] [Info] Start training from score -0.362320
[LightGBM] [Info] Start training from score -4.127916
[LightGBM] [Info] Start training from score -1.245406
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2316
[LightGBM] [Info] Number of data points in the train set: 14086, number of used features: 18
[LightGBM] [Info] Start training from score -0.362391
[LightGBM] [Info] Start training from score -4.123591
[LightGBM] [Info] Start training from score -1.245477
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003809 sec

400:	learn: 0.2550011	total: 6.41s	remaining: 9.57s
600:	learn: 0.2362081	total: 9.65s	remaining: 6.4s
800:	learn: 0.2201670	total: 13.1s	remaining: 3.25s
999:	learn: 0.2066961	total: 16.4s	remaining: 0us
0:	learn: 1.0340089	total: 12.9ms	remaining: 12.9s
200:	learn: 0.3063576	total: 3.29s	remaining: 13.1s
400:	learn: 0.2788275	total: 6.58s	remaining: 9.82s
600:	learn: 0.2585538	total: 9.85s	remaining: 6.54s
800:	learn: 0.2423294	total: 13.1s	remaining: 3.25s
999:	learn: 0.2279014	total: 16.5s	remaining: 0us
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2318
[LightGBM] [Info] Number of data points in the train set: 17607, number of used features: 18
[LightGBM] [Info] Start training from score -0.362444
[LightGBM] [Info] Start training from score -4.123563
[LightGBM] [Info] Start training from score -1.245350
[LightGBM] [Info] Auto-choosing col

In [18]:
# 방법 2:

# #only given train_data
# X_simple_xgb=np.save('X_train_simple_xgb.npy',X_simple_xgb)
# train_label=np.save('X_train_label.npy',train_label)

#pseudo label 0.8 ver train data

X_simple_xgb = np.load('X_pseudo_simple_xgb.npy')
train_label=np.load("pseudo_train_label.npy")


X_test_simple_xgb = np.load('X_test_simple_xgb.npy')

# SMOTE for handling class imbalance
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_simple_xgb, train_label)

#put gpu
dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test_simple_xgb)

# Hyperparameter Optimization for XGBoost


# Support Vector Machine (SVM)
svm_model = SVC(probability=True, kernel='rbf', C=1, gamma='scale')

# CatBoost Model
cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=6, loss_function='MultiClass', random_seed=42, verbose=200)

# LightGBM Model
lgb_model = lgb.LGBMClassifier(objective='multiclass', num_class=3, random_state=42)

# Ensemble with Meta-Learning (Stacking) Including SVM
estimators = [
    ('xgb', xgb.XGBClassifier(**best_params, objective='multi:softprob', num_class=3, tree_method='hist', device='cuda', random_state=42)),
    ('catboost', cat_model),
    ('lgb', lgb_model),
    ('svm', svm_model)
]

# Final Estimator with Logistic Regression (Meta-Learning)
stack_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=10000))

# Cross-validation for Log Loss
stack_scores = cross_val_score(stack_model, X_resampled, y_resampled, cv=5, scoring='neg_log_loss')
print(f'Stacked model log loss: {stack_scores.mean()}')


# Train the final stacking model
stack_model.fit(X_resampled, y_resampled)

# Predict with the Stacking Model
test_preds = stack_model.predict_proba(X_test_simple_xgb)

# Export submission
test_id = pd.read_csv('test.csv')['id']
output_df = pd.DataFrame(test_preds, columns=['Status_C', 'Status_CL', 'Status_D'])
output_df.insert(0, 'id', test_id)
output_df.to_csv('final_submission_ensemble_with_svm.csv', index=False)

# Log Loss for different models
log_loss_stack = log_loss(y_resampled, stack_model.predict_proba(X_resampled))
# log_loss_xgb = log_loss(y_resampled, random_search_xgb.predict_proba(X_resampled))
log_loss_catboost = log_loss(y_resampled, cat_model.predict_proba(X_resampled))
log_loss_lgb = log_loss(y_resampled, lgb_model.predict_proba(X_resampled))

print(f"Stacking Model Log Loss: {log_loss_stack}")
# print(f"XGBoost Model Log Loss: {log_loss_xgb}")
print(f"CatBoost Model Log Loss: {log_loss_catboost}")
print(f"LightGBM Model Log Loss: {log_loss_lgb}")


0:	learn: 1.0509979	total: 32.7ms	remaining: 32.7s
200:	learn: 0.2908337	total: 6.83s	remaining: 27.2s
400:	learn: 0.2132922	total: 13.4s	remaining: 20.1s
600:	learn: 0.1732974	total: 19.7s	remaining: 13.1s
800:	learn: 0.1482512	total: 25.6s	remaining: 6.37s
999:	learn: 0.1311558	total: 31.5s	remaining: 0us
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 36763, number of used features: 18
[LightGBM] [Info] Start training from score -1.098639
[LightGBM] [Info] Start training from score -1.098558
[LightGBM] [Info] Start training from score -1.098639
0:	learn: 1.0517277	total: 95.2ms	remaining: 1m 35s
200:	learn: 0.2739379	total: 5.97s	remaining: 23.7s
400:	learn: 0.1889324	total: 12s	remaining: 18s
600:	learn: 0.1450955	total: 18.3s	remaining: 12.1s
800:	learn: 0.1186649	total: 25.2s	r

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4589
[LightGBM] [Info] Number of data points in the train set: 29411, number of used features: 18
[LightGBM] [Info] Start training from score -1.098680
[LightGBM] [Info] Start training from score -1.098578
[LightGBM] [Info] Start training from score -1.098578
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4588
[LightGBM] [Info] Number of data points in the train set: 29411, number of used features: 18
[LightGBM] [Info] Start training from score -1.098578
[LightGBM] [Info] Start training from score -1.098680
[LightGBM] [Info] Start training from score -1.098578
0:	learn: 1.0518812	total: 20.3ms	remaining: 20.3s
200:	learn: 0.3301621	total: 6.01s	remaining: 2

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4589
[LightGBM] [Info] Number of data points in the train set: 29410, number of used features: 18
[LightGBM] [Info] Start training from score -1.098544
[LightGBM] [Info] Start training from score -1.098646
[LightGBM] [Info] Start training from score -1.098646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4589
[LightGBM] [Info] Number of data points in the train set: 29410, number of used features: 18
[LightGBM] [Info] Start training from score -1.098544
[LightGBM] [Info] Start training from score -1.098646
[LightGBM] [Info] Start training from score -1.098646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012084 sec

800:	learn: 0.1963424	total: 23.5s	remaining: 5.83s
999:	learn: 0.1782656	total: 29.1s	remaining: 0us
0:	learn: 1.0528252	total: 44.4ms	remaining: 44.4s
200:	learn: 0.3390919	total: 5.81s	remaining: 23.1s
400:	learn: 0.2647243	total: 11.2s	remaining: 16.8s
600:	learn: 0.2285576	total: 16.9s	remaining: 11.2s
800:	learn: 0.2047301	total: 22.5s	remaining: 5.6s
999:	learn: 0.1873764	total: 28.2s	remaining: 0us
0:	learn: 1.0539266	total: 39.1ms	remaining: 39s
200:	learn: 0.3456762	total: 5.98s	remaining: 23.8s
400:	learn: 0.2724805	total: 11.7s	remaining: 17.5s
600:	learn: 0.2353168	total: 17.7s	remaining: 11.7s
800:	learn: 0.2103557	total: 23.1s	remaining: 5.73s
999:	learn: 0.1930034	total: 28.8s	remaining: 0us
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Numbe

NameError: name 'random_search_xgb' is not defined

In [32]:
train_score = stack_model.score(X_simple_xgb, train_label) #- SVM 추가 시 결과!
print(train_score) #0.9239856422372665 (svm추가, pseudo data, smote 이용시)
# no svm, pseudo data , smote 이용 x 0.9186241991912399

0.9186241991912399


In [26]:
#save svm+ 모델

# Save the trained model
# with open('final_svm_ensemble_stack_model.pkl', 'wb') as f:
#     pickle.dump(stack_model, f)
# print("Model saved successfully.")

#load
# Load the saved model
with open('final_svm_ensemble_stack_model.pkl', 'rb') as f:
    loaded_stack_model = pickle.load(f)
print("Model loaded successfully.")


Model saved successfully.
Model loaded successfully.


In [30]:
cat_model
lgb_model
stack_model


In [29]:
# Log Loss for different models
log_loss_stack = log_loss(y_resampled, stack_model.predict_proba(X_resampled))
# log_loss_xgb = log_loss(y_resampled, random_search_xgb.predict_proba(X_resampled))
# log_loss_catboost = log_loss(y_resampled, cat_model.predict_proba(X_resampled))
# log_loss_lgb = log_loss(y_resampled, lgb_model.predict_proba(X_resampled))

print(f"Stacking Model Log Loss: {log_loss_stack}") #--> #Stacking Model Log Loss: 0.1282131042831433 (svm꺼)
#final tuning 이후 확인 필요
# print(f"CatBoost Model Log Loss: {log_loss_catboost}")
# print(f"LightGBM Model Log Loss: {log_loss_lgb}")

Stacking Model Log Loss: 0.1282131042831433


In [27]:
#find best parameter for stacked model.

# Base classifiers (your previous models: RandomForest, SVM, XGBoost, DecisionTree)
# Assuming `rf`, `svm`, `xg`, and `dt` are already defined and hyperparameters are optimized
# Example:

# 
#기존 pre-trained 된 모델들 불러오기!

# # Create the StackingClassifier with the base classifiers
# estimator_list = [
#     ('svm', svm),
#     ('xgb', xgb_model),
#     ('catboost', cat_model),
#     ('lgb',lgb_model)
    
# ]

# stack_model = StackingClassifier(
#     estimators=estimator_list,
#     final_estimator=LogisticRegression(max_iter=100000)
# )

# Define hyperparameter grid for the final estimator (Logistic Regression)
final_estimator_params = {
    'final_estimator__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'final_estimator__C': np.logspace(-4, 4, 20),
    'final_estimator__solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
    'final_estimator__max_iter': [50, 100, 250, 500]
}

# Perform GridSearchCV to optimize the final estimator in the stacking model
final_estimator_search = GridSearchCV(
    stack_model, #기존의 stack model의 base model은 그대로 사용!ㅏ
    
    param_grid=final_estimator_params,
    cv=5,
    return_train_score=False,
    n_jobs=-1
)

# Fit the model
final_estimator_search.fit(X_resampled, y_resampled)

# Print the results for the final estimator
print({
    'model': 'logistic_regression',
    'best_score': final_estimator_search.best_score_,
    'best_params': final_estimator_search.best_params_
})

# Predict with the best stacking model
test_preds = final_estimator_search.best_estimator_.predict_proba(X_test_simple_xgb)

# Export the results
test_id = test_data['id']
output_df = pd.DataFrame(test_preds, columns=['Status_C', 'Status_CL', 'Status_D'])
output_df.insert(0, 'id', test_id)
output_df.to_csv('final_best_param_submission_stacking.csv', index=False)

KeyboardInterrupt: 