## 1. import dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import ast 
import matplotlib
import joblib
import warnings 
warnings.filterwarnings(action = 'ignore')

# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RepeatedStratifiedKFold

# xgboost 관련
from xgboost import XGBClassifier
from xgboost import plot_importance

# lightgbm 관련
from lightgbm import LGBMClassifier
from lightgbm import plot_importance

In [None]:
# Load the combined dataset from disk; the file is decoded as cp949 (a Korean code page).
csv_path = 'combined_data.csv'
dataset = pd.read_csv(csv_path, encoding="cp949")

# Preview the first rows.
dataset.head()

In [None]:
# dataset = dataset.astype(str)
# dataset.dtypes

In [None]:
# Convert the listed columns to pandas' categorical dtype (the equivalent of an R factor).
# The columns are processed one by one, in the same order as the original assignments.
categorical_columns = [
    "age",
    "twin",
    "PIH임신중고혈압",
    "bmi",
    "전자간증",
    "고혈압",
    "산과력_출산력P",
    "산과력_출산력A",
    "수축억제제",
    "저체중아",
    "태아성장지연",
    "태반조기박리",
    "부인과수술력",
    "자궁봉축술",
    "입원총기간",
    "입원횟수",
    "첫투약시기",
    "outcome",
]

for column_name in categorical_columns:
    dataset[column_name] = dataset[column_name].astype("category")

In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report itself and
# returns None, so the print call additionally emits "None".
info_result = dataset.info()
print(info_result)

# Descriptive statistics per column, used here as a quick look for unusual values.
description = dataset.describe()
print(description)

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
# Show the dtype of every column.
dataset.dtypes

# one-hot-encoding

In [None]:
# Columns that are replaced by their one-hot (dummy) encoding.
var = ['twin','전자간증','PIH임신중고혈압','고혈압','산과력_출산력P', '산과력_출산력A','수축억제제','저체중아','태아성장지연',
       '태반조기박리','부인과수술력','자궁봉축술','입원횟수','첫투약시기','age','bmi','입원총기간']

encoder = OneHotEncoder()
# fit_transform returns a sparse matrix; densify it so it can back a DataFrame.
encoded = encoder.fit_transform(dataset[var]).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and fall back only on older installations.
# Passing `var` produces readable names ("<column>_<category>") instead of the
# opaque "x0_<category>" defaults, which keeps the feature-importance plots legible.
if hasattr(encoder, "get_feature_names_out"):
    onehot_columns = encoder.get_feature_names_out(var)
else:
    onehot_columns = encoder.get_feature_names(var)

# Reuse the dataset's index so the column-wise concat aligns row by row even if
# the dataset does not carry a default 0..n-1 index.
onehot = pd.DataFrame(encoded, columns = onehot_columns, index = dataset.index)

# Put the dummy columns first, keep the remaining original columns after them,
# and drop the raw categorical columns that were just encoded.
df = pd.concat([onehot, dataset], axis = 1).drop(columns = var)
df


# 2. split train / test set 

In [None]:
# Seed NumPy's global RNG; the split itself is made reproducible by random_state.
np.random.seed(0)

# Hold out 25% of the rows for testing and keep 75% for training.
df_train, df_test = train_test_split(
    df,
    train_size=0.75,
    test_size=0.25,
    random_state=100,
)


In [None]:
# Separate features from the target: the last column is used as the label and
# every preceding column as a feature.
# NOTE(review): this assumes the target (`outcome`) is the final column of `df` —
# confirm against the column order of the CSV.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Print the four shapes on one line (original author's note on row counts: 12081, 4028).
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

# 3. multinomial logistic regression

In [None]:
# Multinomial logistic regression fitted with the L-BFGS solver.
model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=100,
)

# Accuracy estimated with 10-fold stratified cross-validation repeated 3 times
# on the training split, using all available cores.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Report the mean and standard deviation over the 30 folds.
accuracy_summary = (mean(n_scores), std(n_scores))
print('Mean Accuracy: %.3f (%.3f)' % accuracy_summary)

# Refit on the whole training split and evaluate on the held-out test split.
y_hat = model.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_hat))

# 4. xgboost model 

In [None]:
# Baseline XGBoost classifier; the wall-clock time of build + fit + predict + report is measured.
tic = time.time()  # start time

xgb_settings = dict(
    n_estimators=500,
    random_state=156,
    objective="multi:softmax",
    num_class=3,
    enable_categorical=True,
)
xgb_model = XGBClassifier(**xgb_settings)

# Train on the training split, then predict the held-out test split.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

toc = time.time()  # end time
print('time elapsed:', toc - tic)


In [None]:
# tic = time.time()
# #-------------------
# # Grid Search
# xgb = XGBClassifier(n_estimators = 100, 
#                     random_state = 156,
#                     objective = "multi:softmax", 
#                     num_class = 3, 
#                     enable_categorical=True)

# # 하이퍼파라미터 후보
# params = {'max_depth':[3,10], 
#           'min_child_weight':[1,3],
#           'colsample_bytree':[0.25, 0.5, 0.75]}

# # folds = 3
# gridcv = GridSearchCV(xgb, param_grid = params, cv = 3)
# gridcv.fit(X_train, y_train, early_stopping_rounds = 30, eval_metric = "merror",
#           eval_set = [(X_train, y_train), (X_test, y_test)])

# y_pred_1 = gridcv.predict(X_test)
# #-------------------
# toc = time.time() 
# print(classification_report(y_test, y_pred_1))
# print('Optimized hyperparameters', gridcv.best_params_) 
# print('time elapsed:', toc - tic) 

In [None]:
# Final XGBoost model built with fixed hyperparameters; build + fit + predict is timed.
tic = time.time()  # start time

final_settings = dict(
    colsample_bytree=0.25,
    max_depth=10,
    min_child_weight=1,
    n_estimators=1000,
    random_state=156,
    learning_rate=0.02,
    reg_alpha=0.03,
    objective="multi:softmax",
    num_class=3,
    enable_categorical=True,
)
xgb_final = XGBClassifier(**final_settings)

# Train on the training split, then predict the held-out test split.
xgb_final.fit(X_train, y_train)
y_pred_2 = xgb_final.predict(X_test)

toc = time.time()  # end time
print('time elapsed:', toc - tic)
print(classification_report(y_test, y_pred_2))

# Persist the fitted model to disk so it can be reloaded with joblib.load.
joblib.dump(xgb_final, 'trained_model_full')

In [None]:
# Show the per-feature importance scores of the fitted final model.
xgb_final.feature_importances_

In [None]:
import seaborn as sns

# Use the NanumBarunGothic font for all plot text (presumably so Korean column names render).
plt.rc('font', family='NanumBarunGothic')

# Importance scores of the final model, returned as an array.
ft_importance_values = xgb_final.feature_importances_

# Label the scores with the feature names so they can be sorted and plotted,
# then keep the 20 highest.
ft_series = pd.Series(data=ft_importance_values, index=X_train.columns)
ft_sorted = ft_series.sort_values(ascending=False)
ft_top20 = ft_sorted.iloc[:20]

# Horizontal bar chart: importance on the x axis, feature name on the y axis.
plt.figure(figsize=(10, 8))
plt.title('Feature Importance Top 20')
sns.barplot(x=ft_top20, y=ft_top20.index)
plt.show()

In [None]:
# Use the NanumBarunGothic font for all plot text (presumably so Korean column names render).
plt.rc('font', family='NanumBarunGothic')

# Label the importance scores with the feature names and order them from
# most to least important; every feature is kept this time.
ft_series = pd.Series(data=ft_importance_values, index=X_train.columns)
ft_all = ft_series.sort_values(ascending=False)

# Horizontal bar chart of all features on a larger canvas.
plt.figure(figsize=(20, 15))
plt.title('Feature Importance ALL')
sns.barplot(x=ft_all, y=ft_all.index)
plt.show()