# Library

In [None]:
# Mount Google Drive (Colab only) at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Install libraries
# NOTE: only kss is pinned to an exact version (5.2.0); every other package
# resolves to whatever pip selects at install time.
!pip install --upgrade --no-cache-dir numpy seaborn
!pip install ydata_profiling
!pip install missingno
!pip install tqdm

!pip install -U kss==5.2.0
!pip install kiwipiepy
!pip install soynlp
!pip install keybert
!pip install keybert[gensim]
!pip install sentence_transformers

!pip install nltk
!pip install konlpy
!pip install gensim
!pip install bertopic -U
!pip install bertopic[visualization] -U
!pip install -U accelerate
!pip install -U transformers
!pip install datasets

!pip install catboost
!pip install shap

In [2]:
# Auto reload of library: re-import edited modules without restarting the kernel
%load_ext autoreload
%autoreload 2

# System related and data input controls
import os

# Ignore the warnings
# NOTE: the first filter has no category argument, so it already applies to all
# warnings; the three category-specific filters below are kept for explicitness.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Visualization
import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
## Install Korean fonts (shell commands)
## NOTE(review): the recorded output of this cell shows both apt-get commands
## failing with permission errors ("are you root?") - confirm the font file
## below actually exists in the environment where this notebook is run.
!apt-get update -qq
!apt-get install fonts-nanum* -qq
## Register the NanumGothic font file and build FontProperties from its path
fm.fontManager.addfont('/usr/share/fonts/truetype/nanum/NanumGothic.ttf')
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font_prop = fm.FontProperties(fname=font_path)
## Set the Korean font for matplotlib and seaborn
matplotlib.rcParams['font.family'] = font_prop.get_name()
plt.rc('font', family='NanumGothic')
sns.set(font=font_prop.get_name())
## Minus sign display setting (turn off the Unicode minus glyph)
plt.rcParams['axes.unicode_minus'] = False

# Understanding of Data
from ydata_profiling import ProfileReport
import missingno as msno

# Custom
## Set this to your actual working directory!
# os.chdir('/content/drive/MyDrive/Research/Analysis/Lecture/특강_20250412_한국지능정보사회진흥원_빅데이터센터')
# !ls
## NOTE(review): later cells use names that are not imported in this notebook
## (e.g. pd, MinMaxScaler, LogisticRegression, prediction_class); presumably
## they all arrive through this star import - confirm against module_KK.
from module_KK import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
E: 잠금 파일 /var/lib/apt/lists/lock 파일을 열 수 없습니다 - open (13: 허가 거부)
E: /var/lib/apt/lists/ 디렉터리를 잠글 수 없습니다
W: /var/cache/apt/pkgcache.bin 파일을 삭제하는데 문제가 있습니다 - RemoveCaches (13: 허가 거부)
W: /var/cache/apt/srcpkgcache.bin 파일을 삭제하는데 문제가 있습니다 - RemoveCaches (13: 허가 거부)
E: 잠금 파일 /var/lib/dpkg/lock-frontend 파일을 열 수 없습니다 - open (13: 허가 거부)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


2025-04-06 22:53:28.866694: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-06 22:53:28.879100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743947608.894154    2080 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743947608.899070    2080 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-06 22:53:28.917270: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Hyperparameter

In [None]:
# Data Preprocessing
## Input CSV path, relative to the current working directory
## (a single os.path.join call; the former outer join with one argument was a no-op)
file_location = os.path.join('.', 'Data', 'df_concat_BA1.csv')
## Target column; every other column of the preprocessed frame becomes a feature
Y_colname = '기부여부'
TEST_SIZE = 0.2
## Shared seed; the model cells should reference this instead of a literal
RANDOM_STATE = 123
SAMPLING_METHOD = 'RandomUnderSampler'
SAMPLING_STRATEGY = 'auto'
SCALER = MinMaxScaler()
## Display labels for the two classes, passed to prediction_class
LABEL_LIST = ['Non-donation', 'Donation']

# Modeling AI
## OUTPUT_TYPE is passed as `link` and MAX_DISPLAY as `MAX_DISPLAY` to explanation_SHAP_KK
OUTPUT_TYPE = 'logit'
MAX_DISPLAY = 30
DEPENDENCY = True

# BA Process Summary

In [None]:
# Data analysis process
## Load the data
df = pd.read_csv(file_location, encoding='utf-8-sig')
## Preprocess the data
X_train, X_test, Y_train, Y_test, df_prep = preprocessing_MDIS_KK(df)
X_colname = [name for name in df_prep.columns if name != Y_colname]
## Check the results: shapes plus per-split min/max of the features
for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_X.shape, split_Y.shape, split_X.min(), split_X.max())
print('Complete!')

# Logistic Regression
model = LogisticRegression(class_weight='balanced', fit_intercept=True)
model.fit(X_train, Y_train)

# Explanation
explanation_SHAP_KK(
    model, X_train, X_test, X_colname,
    MAX_DISPLAY=MAX_DISPLAY,
    model_type='linear',
    link=OUTPUT_TYPE,
    sample_size=1,
    sample_size_1000=1000,
    plot_interaction=True,
)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred, P_tepred = (
    pd.DataFrame(model.predict_proba(features)[:, -1],
                 index=target.index, columns=['Pred'])
    for features, target in ((X_train, Y_train), (X_test, Y_test))
)
## Binarize the probabilities at the 0.5 cut-off
Y_trpred, Y_tepred = (
    (proba >= 0.5).astype(int) for proba in (P_trpred, P_tepred)
)

# Evaluation
Score_te, Score_trte = prediction_class(
    model, X_train, Y_train, X_test, Y_test,
    LABEL_LIST=LABEL_LIST, ALGO_NAME='Logistic',
)
display(Score_te, Score_trte)

# Prediction Explanation

In [None]:
# Data analysis process
## Load the data
df = pd.read_csv(file_location, encoding='utf-8-sig')
## Preprocess the data
X_train, X_test, Y_train, Y_test, df_prep = preprocessing_MDIS_KK(df)
## Feature names: every preprocessed column except the target
X_colname = [name for name in df_prep.columns if name != Y_colname]
## Check the results: shapes plus per-split min/max of the features
for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_X.shape, split_Y.shape, split_X.min(), split_X.max())
print('Complete!')

## 1) Logistic Regression

In [None]:
# Logistic Regression
## Fit a class-weighted logistic regression on the training split
model = LogisticRegression(class_weight='balanced', fit_intercept=True)
model.fit(X_train, Y_train)

# # Explanation (disabled in this cell; uncomment to run it)
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='linear',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred, P_tepred = (
    pd.DataFrame(model.predict_proba(features)[:, -1],
                 index=target.index, columns=['Pred'])
    for features, target in ((X_train, Y_train), (X_test, Y_test))
)
## Binarize the probabilities at the 0.5 cut-off
Y_trpred, Y_tepred = (
    (proba >= 0.5).astype(int) for proba in (P_trpred, P_tepred)
)

# Evaluation
Score_te, Score_trte = prediction_class(
    model, X_train, Y_train, X_test, Y_test,
    LABEL_LIST=LABEL_LIST, ALGO_NAME='Logistic',
)
display(Score_te, Score_trte)

## 2) Random Forest

In [None]:
# Random Forest
## Class-weighted forest; the seed comes from the shared RANDOM_STATE constant
## defined in the hyperparameter cell (same value as the former literal 123).
model = RandomForestClassifier(n_estimators=100,
                               class_weight='balanced',   # 'balanced_subsample'
                               random_state=RANDOM_STATE)
model.fit(X_train, Y_train)

# # Explanation (disabled in this cell; uncomment to run it)
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred = pd.DataFrame(model.predict_proba(X_train)[:,-1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(model.predict_proba(X_test)[:,-1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the probabilities at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## The label was 'Logistic' (copied from the logistic-regression cell); it now
## uses the name listed for this model in the performance-comparison cell.
## NOTE(review): confirm how prediction_class / prediction_summary consume ALGO_NAME.
Score_te, Score_trte = prediction_class(model, X_train, Y_train, X_test, Y_test,
                                        LABEL_LIST=LABEL_LIST, ALGO_NAME='Random Forest')
display(Score_te, Score_trte)

## 3) XGBoost

In [None]:
# XGBoost
## scale_pos_weight = (count of label 0) / (count of label 1); the lookups are
## label-based, so this requires a binary target encoded as 0 and 1.
class_counts = Y_train.value_counts()   # computed once instead of twice
model = XGBClassifier(n_estimators=100,
                      scale_pos_weight=class_counts[0] / class_counts[1],   # binary
                      random_state=RANDOM_STATE)   # shared seed from the hyperparameter cell
model.fit(X_train, Y_train)

# # Explanation (disabled in this cell; uncomment to run it)
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred = pd.DataFrame(model.predict_proba(X_train)[:,-1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(model.predict_proba(X_test)[:,-1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the probabilities at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## The label was 'Logistic' (copied from the logistic-regression cell); it now
## uses the name listed for this model in the performance-comparison cell.
## NOTE(review): confirm how prediction_class / prediction_summary consume ALGO_NAME.
Score_te, Score_trte = prediction_class(model, X_train, Y_train, X_test, Y_test,
                                        LABEL_LIST=LABEL_LIST, ALGO_NAME='XGBoost')
display(Score_te, Score_trte)

## 4) LightGBM

In [None]:
# LGBM
## Class-weighted LightGBM; the seed comes from the shared RANDOM_STATE constant
## defined in the hyperparameter cell (same value as the former literal 123).
model = LGBMClassifier(n_estimators=100,
                       class_weight='balanced',
                       random_state=RANDOM_STATE)
model.fit(X_train, Y_train)

# # Explanation (disabled in this cell; uncomment to run it)
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred = pd.DataFrame(model.predict_proba(X_train)[:,-1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(model.predict_proba(X_test)[:,-1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the probabilities at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## The label was 'Logistic' (copied from the logistic-regression cell); it now
## uses the name listed for this model in the performance-comparison cell.
## NOTE(review): confirm how prediction_class / prediction_summary consume ALGO_NAME.
Score_te, Score_trte = prediction_class(model, X_train, Y_train, X_test, Y_test,
                                        LABEL_LIST=LABEL_LIST, ALGO_NAME='LGBM')
display(Score_te, Score_trte)

## 5) CatBoost

In [None]:
# CatBoost
## Class-balanced CatBoost that writes no training files to disk; the seed comes
## from the shared RANDOM_STATE constant (same value as the former literal 123).
model = CatBoostClassifier(n_estimators=100,
                           auto_class_weights='Balanced',
                           allow_writing_files=False,
                           random_state=RANDOM_STATE)
model.fit(X_train, Y_train)

# # Explanation (disabled in this cell; uncomment to run it)
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Last predict_proba column for each split, indexed like the matching target
P_trpred = pd.DataFrame(model.predict_proba(X_train)[:,-1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(model.predict_proba(X_test)[:,-1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the probabilities at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## The label was 'Logistic' (copied from the logistic-regression cell); it now
## uses the name listed for this model in the performance-comparison cell.
## NOTE(review): confirm how prediction_class / prediction_summary consume ALGO_NAME.
Score_te, Score_trte = prediction_class(model, X_train, Y_train, X_test, Y_test,
                                        LABEL_LIST=LABEL_LIST, ALGO_NAME='CatBoost')
display(Score_te, Score_trte)

## 6) MLP

In [None]:
# MLP
## Reshape: the target is passed through reshape_YtoOneHot only when a
## WEIGHT_METHOD is configured; features are used as-is (copied).
if WEIGHT_METHOD is not None:
    Y_train_dl, Y_test_dl = reshape_YtoOneHot(Y_train, Y_test)
else:
    Y_train_dl, Y_test_dl = Y_train.copy(), Y_test.copy()
X_train_dl, X_test_dl = X_train.copy(), X_test.copy()

## Modeling
ALGO_NAME = 'MLP'
model = modeling_MLP(X_train_dl, Y_train_dl,
                     node_MLP=NODE_MLP,
                     HIDDEN_ACTIVATION=HIDDEN_ACTIVATION, OUTPUT_ACTIVATION=OUTPUT_ACTIVATION,
                     REGULARIZER=REGULARIZER, DROPOUT_RATIO=DROPOUT_RATIO,
                     LOSS=LOSS, OPTIMIZER=OPTIMIZER, LEARNING_RATE=LEARNING_RATE)
model, FILENAME = learning(model, X_train_dl, X_test_dl, Y_train_dl,
                           WEIGHT_METHOD=WEIGHT_METHOD,
                           VALIDATION_SPLIT=VALIDATION_SPLIT, VALIDATION_DATA=VALIDATION_DATA,
                           BATCH_SIZE=BATCH_SIZE, EPOCHS=EPOCHS, VERBOSE=VERBOSE,
                           MONITOR=MONITOR, MONITOR_MODE=MONITOR_MODE, EARLYSTOP_PATIENT=EARLYSTOP_PATIENT,
                           shap=False, X_colname=X_colname, X_top_display=X_TOP_DISPLAY)
## Scores of the model as returned by learning()
Score_te_mlp, Score_trte_mlp = prediction_class(model, X_train_dl, Y_train_dl, X_test_dl, Y_test_dl,
                                                LABEL_LIST=LABEL_LIST, ALGO_NAME=ALGO_NAME)
display(Score_te_mlp, Score_trte_mlp)

## Load the best model
## Evaluate the reloaded model (model_mlp); the previous code re-scored `model`,
## so the loaded checkpoint was never actually evaluated.
model_mlp = load_model(FILENAME)
Score_te_mlp, Score_trte_mlp = prediction_class(model_mlp, X_train_dl, Y_train_dl, X_test_dl, Y_test_dl,
                                                LABEL_LIST=LABEL_LIST, ALGO_NAME=ALGO_NAME)
display(model_mlp, Score_te_mlp, Score_trte_mlp)

# # Explanation (disabled in this cell)
# # NOTE(review): model_type='tree' was carried over from the tree-model cells;
# # choose a model_type suited to a neural network before enabling this.
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Use predict_proba when the loaded model offers it, otherwise fall back to
## predict(): models restored with load_model are presumably Keras models, which
## have no predict_proba in current TensorFlow releases.
## NOTE(review): assumes predict() returns a 2D array of class scores whose
## last column is the positive class - confirm against modeling_MLP's output layer.
predict_scores = getattr(model_mlp, 'predict_proba', model_mlp.predict)
P_trpred = pd.DataFrame(predict_scores(X_train_dl)[:, -1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(predict_scores(X_test_dl)[:, -1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the scores at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## Keep the generic Score_te / Score_trte names defined, pointing at the
## best-model scores computed above. The former third prediction_class call
## used the non-reshaped targets and the label 'Logistic'.
Score_te, Score_trte = Score_te_mlp, Score_trte_mlp

## 7) CNN

In [None]:
# CNN
## Reshape: the target is passed through reshape_YtoOneHot only when a
## WEIGHT_METHOD is configured; features go through reshape_X2Dto3D.
if WEIGHT_METHOD is not None:
    Y_train_dl, Y_test_dl = reshape_YtoOneHot(Y_train, Y_test)
else:
    Y_train_dl, Y_test_dl = Y_train.copy(), Y_test.copy()
X_train_dl, X_test_dl = reshape_X2Dto3D(X_train, X_test)

## Modeling
ALGO_NAME = 'CNN'
model = modeling_CNN1D(X_train_dl, Y_train_dl,
                       node_CNN1=NODE_CNN1,
                       node_CNN2=NODE_CNN2,
                       HIDDEN_ACTIVATION=HIDDEN_ACTIVATION, OUTPUT_ACTIVATION=OUTPUT_ACTIVATION,
                       KERNEL_SIZE=KERNEL_SIZE, STRIDE=STRIDE, PADDING=PADDING,
                       POOL_SIZE=POOL_SIZE, POOL_STRIDE=POOL_STRIDE,
                       REGULARIZER=REGULARIZER, DROPOUT_RATIO=DROPOUT_RATIO,
                       LOSS=LOSS, OPTIMIZER=OPTIMIZER, LEARNING_RATE=LEARNING_RATE)
model, FILENAME = learning(model, X_train_dl, X_test_dl, Y_train_dl,
                           WEIGHT_METHOD=WEIGHT_METHOD,
                           VALIDATION_SPLIT=VALIDATION_SPLIT, VALIDATION_DATA=VALIDATION_DATA,
                           BATCH_SIZE=BATCH_SIZE, EPOCHS=EPOCHS, VERBOSE=VERBOSE,
                           MONITOR=MONITOR, MONITOR_MODE=MONITOR_MODE, EARLYSTOP_PATIENT=EARLYSTOP_PATIENT,
                           shap=False, X_colname=X_colname, X_top_display=X_TOP_DISPLAY)
## Scores of the model as returned by learning()
Score_te_cnn, Score_trte_cnn = prediction_class(model, X_train_dl, Y_train_dl, X_test_dl, Y_test_dl,
                                                LABEL_LIST=LABEL_LIST, ALGO_NAME=ALGO_NAME)
display(Score_te_cnn, Score_trte_cnn)

## Load the best model
## Stored under CNN-specific names: the previous code reused model_mlp and
## Score_*_mlp here, overwriting the MLP cell's objects with CNN results, and
## re-scored `model` instead of the reloaded checkpoint.
model_cnn = load_model(FILENAME)
Score_te_cnn, Score_trte_cnn = prediction_class(model_cnn, X_train_dl, Y_train_dl, X_test_dl, Y_test_dl,
                                                LABEL_LIST=LABEL_LIST, ALGO_NAME=ALGO_NAME)
display(model_cnn, Score_te_cnn, Score_trte_cnn)

# # Explanation (disabled in this cell)
# # NOTE(review): model_type='tree' was carried over from the tree-model cells;
# # choose a model_type suited to a neural network before enabling this.
# explanation_SHAP_KK(model, X_train, X_test, X_colname,
#                     MAX_DISPLAY=MAX_DISPLAY, model_type='tree',
#                     link=OUTPUT_TYPE, sample_size=1,
#                     sample_size_1000=1000,
#                     plot_interaction=True)

# Prediction
## Feed the reshaped (X_*_dl) inputs the network was trained on, not the
## original X_train / X_test. Use predict_proba when the loaded model offers it,
## otherwise fall back to predict(): models restored with load_model are
## presumably Keras models, which have no predict_proba in current TensorFlow.
## NOTE(review): assumes predict() returns a 2D array of class scores whose
## last column is the positive class - confirm against modeling_CNN1D's output layer.
predict_scores = getattr(model_cnn, 'predict_proba', model_cnn.predict)
P_trpred = pd.DataFrame(predict_scores(X_train_dl)[:, -1],
                        index=Y_train.index, columns=['Pred'])
P_tepred = pd.DataFrame(predict_scores(X_test_dl)[:, -1],
                        index=Y_test.index, columns=['Pred'])
## Binarize the scores at the 0.5 cut-off
Y_trpred = (P_trpred >= 0.5).astype(int)
Y_tepred = (P_tepred >= 0.5).astype(int)

# Evaluation
## Keep the generic Score_te / Score_trte names defined, pointing at the
## best-model scores computed above. The former third prediction_class call
## used the non-reshaped inputs and the label 'Logistic'.
Score_te, Score_trte = Score_te_cnn, Score_trte_cnn

## Performance Comparison

In [None]:
# Summarize the results found under <current working directory>/Result,
# passing one display name per algorithm in the order the models were run.
folder_location = os.path.join(os.getcwd(), 'Result')
prediction_summary(
    folder_location=folder_location,
    algonames=[
        'Logistic Regression',
        'Random Forest',
        'XGBoost',
        'LGBM',
        'CatBoost',
        'MLP',
        'CNN',
    ],
)


# Feature Explanation