# 1. dataset 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time 

In [None]:
# Load the modeling dataset; the CSV is read with the EUC-KR encoding.
csv_path = '/content/drive/MyDrive/OBGY_modeling/project1_model_dataset.csv'
dataset = pd.read_csv(csv_path, encoding="euc-kr")
dataset.head()

In [None]:
# Check for null values. DataFrame.info() writes its report to stdout and
# returns None, so it is called directly; wrapping it in print() would emit
# an extra "None" line after the report.
dataset.info()

# Check for outliers via the summary statistics.
print(dataset.describe())

In [None]:
# Convert the categorical variables to pandas 'category' dtype (factor-like).
# NOTE(review): the original note also listed 성별, 응급실_횟수, 입원_횟수 and
# outcome as variables to convert, but only the columns below are cast —
# confirm which list is the intended one.
categorical_columns = [
    'twin',
    '전자간증',
    'PIH임신중고혈압',
    '고혈압',
    '산과력_출산력A',
    '산과력_출산력P',
    '수축억제제',
    '저체중아',
    '태아성장지연',
    '태반조기박리',
    '부인과수술력',
    '자궁봉축술',
    '입원횟수',
    '첫투약시기',
]

for column_name in categorical_columns:
    dataset[column_name] = dataset[column_name].astype('category')

In [None]:
# Display the dtype of every column (lets the category casts be verified).
dataset.dtypes

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

# 2. 데이터 전처리 

In [None]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG, then make a 75/25 train/test split.
# The split itself is made reproducible by random_state=100.
np.random.seed(0)
df_train, df_test = train_test_split(
    dataset,
    train_size=0.75,
    test_size=0.25,
    random_state=100,
)

In [None]:
# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale the numeric variables.
sc = StandardScaler()  # zero mean, unit variance (instantiated but not used below)
scaler = MinMaxScaler()

# NOTE(review): 'outcome' is scaled together with the predictors — confirm
# that rescaling the target is intended.
num_vars = ['age','입원총기간','bmi','outcome']

# Work on explicit copies so the column assignments below modify independent
# frames and do not raise pandas' SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()

# Fit the scaler on the training split only and reuse its min/max for the test
# split. Re-fitting on the test split would leak test-set statistics and put
# the two splits on different scales. Test values outside the training range
# may therefore fall slightly outside [0, 1].
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_test[num_vars] = scaler.transform(df_test[num_vars])

df_train

In [None]:
# Separate predictors (every column except the last) from the target
# (the last column) for both splits.
X_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
X_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# 3. regression - benchmark model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import ARDRegression,BayesianRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import ast #convert string to function
import statsmodels.api as sm 

In [None]:
# Baseline OLS on all predictors. statsmodels does not add an intercept by
# itself, so a constant column is added to the design matrix first.
X_train_lm = sm.add_constant(X_train)
baseline_ols = sm.OLS(y_train, X_train_lm)
lr_1 = baseline_ols.fit()
print(lr_1.summary())

# Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Recursive feature elimination with a linear model.
# step=10 is the number of features removed per iteration; it is NOT the
# number of features kept. n_features_to_select is left at its default, so
# RFE keeps half of the input features.
if __name__ == "__main__":

    lr = LinearRegression()
    rfe = RFE(estimator=lr, step=10)
    fit = rfe.fit(X_train, y_train)

    print("Features: {features}".format(features=X_train.columns))
    print("Num Features: {number_features}".format(number_features=fit.n_features_))
    print("Selected Features: {support}".format(support=fit.support_))
    print("Feature Ranking: {ranking}".format(ranking=fit.ranking_))

    # Names of the columns whose support mask entry is True.
    selected_columns = list(X_train.columns[fit.support_])
    print("Selected columns: {selected}".format(selected = selected_columns))


In [None]:
# Refit OLS on the training data restricted to the RFE-selected variables.
col = ['twin', 'bmi', '전자간증', '태아성장지연', '태반조기박리', '자궁봉축술', '입원총기간', '첫투약시기']

# Subset the training predictors and add the intercept column in one step.
X_train_rfe = sm.add_constant(X_train[col])

# Fit the linear model and show its summary table.
lm = sm.OLS(y_train, X_train_rfe).fit()
print(lm.summary())

In [None]:
# Remove 자궁봉축술 from the design matrix and refit the linear model.
X_train_new = X_train_rfe.drop(columns=["자궁봉축술"])

# X_train_new still carries the 'const' column inherited from X_train_rfe;
# add_constant with its default has_constant='skip' leaves such a frame as is.
X_train_lm = sm.add_constant(X_train_new)

lm = sm.OLS(y_train, X_train_lm).fit()
print(lm.summary())

In [None]:
# VIF: check for multicollinearity among the predictors of the new model.
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Drop the intercept column so only the predictors enter the VIF computation.
X_train_new = X_train_new.drop(columns=['const'])

X = X_train_new
vif_values = [variance_inflation_factor(X.values, idx) for idx in range(X.shape[1])]

# One row per feature, rounded to two decimals, largest VIF first.
vif = pd.DataFrame({'Features': X.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Predict on the test data with the reduced linear model.

# Restrict the test predictors to the columns kept in X_train_new.
X_test_new = X_test[X_train_new.columns]

# Add the intercept column. has_constant='add' forces the column to be added
# even if one of the test columns happens to be constant; the default 'skip'
# would silently omit it in that case and leave the design matrix one column
# short of what `lm` was fitted on.
X_test_new = sm.add_constant(X_test_new, has_constant='add')

# Make predictions and score them against the held-out target.
y_pred = lm.predict(X_test_new)

r2_score(y_true = y_test, y_pred = y_pred) #0.17

In [None]:
import ast 
# model
def my_regressor(df_X, df_y):
    """Rank a fixed set of benchmark regressors by cross-validated RMSE.

    Every model is scored with ``cross_val_score`` using the
    ``neg_mean_squared_error`` scorer and the default ``cv`` setting. The
    per-fold scores are converted to RMSE and averaged.

    Args:
        df_X: Feature matrix passed to ``cross_val_score``.
        df_y: Target values passed to ``cross_val_score``.

    Returns:
        pandas.DataFrame with the columns ``model`` (benchmark name) and
        ``rmse`` (mean cross-validated RMSE), sorted by ascending RMSE.
    """
    # Map names straight to estimator instances; this avoids resolving local
    # variable names through eval().
    models = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso(),
        'elasticnet': ElasticNet(),
        'ardr_linear': ARDRegression(),
        'baysian_ridge': BayesianRidge(),
        'xgboost': XGBRegressor(enable_categorical=True),
        'svc': SVR(kernel='rbf'),
        'random': RandomForestRegressor(n_estimators=10, random_state=0),
        'decision': DecisionTreeRegressor(random_state=0),
    }

    score_dic = {}
    for model_nm, model in models.items():
        # The scorer returns negated MSE per fold; negate back before the root.
        scores = cross_val_score(model, df_X, df_y, scoring="neg_mean_squared_error")
        score_dic[model_nm] = np.sqrt(-scores).mean()

    # Best (lowest RMSE) model first.
    ranked = sorted(score_dic.items(), key=lambda item: item[1])

    # Build the result from the ranked scores. Previously an empty DataFrame
    # overwrote them, so the function always returned nothing useful.
    return pd.DataFrame(ranked, columns=['model', 'rmse'])


# Run the benchmark model comparison on the training split.
my_regressor(X_train, y_train)

# Machine Learning Modeling

In [None]:
# Function to fit the regressor and record performance metrics
def pipeline(reg, X_train, y_train, X_test, y_test, **kwargs):
    """Fit one regressor and record its timing and performance metrics.

    Args:
        reg: Estimator class (not an instance); instantiated as ``reg(**kwargs)``.
        X_train: Training features passed to ``fit`` and ``score``.
        y_train: Training target passed to ``fit`` and ``score``.
        X_test: Test features passed to ``score`` and ``predict``.
        y_test: Test target used for the test score, RMSE and MAE.
        **kwargs: Keyword arguments forwarded to the estimator constructor.

    Returns:
        dict with the keys ``name`` (class name), ``train_time`` (difference
        of two ``time()`` readings around ``fit``), ``train_score`` and
        ``test_score`` (the estimator's own ``score``), ``rmse`` and ``MAE``
        (both computed on the test predictions).
    """
    # Initialize the regressor and time the fit.
    regressor = reg(**kwargs)
    start = time()
    regressor.fit(X_train, y_train)
    end = time()

    # Predict once and reuse the result for both error metrics instead of
    # running the model on X_test twice.
    y_pred = regressor.predict(X_test)

    # Collect the metrics for this regressor.
    reg_props = {
        "name": reg.__name__,
        "train_time": end - start,
        "train_score": regressor.score(X_train, y_train),
        "test_score": regressor.score(X_test, y_test),
        "rmse": np.sqrt(mean_squared_error(y_test, y_pred)),
        "MAE": mean_absolute_error(y_test, y_pred),
    }

    return reg_props

In [None]:
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# Function to execute each algorithm through the pipeline
def execute_pipeline():
    """Run every candidate regressor class through ``pipeline``.

    The module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test`` are
    used, and no keyword overrides are passed to the estimators.

    Returns:
        list of the property dicts returned by ``pipeline``, in the order the
        regressor classes are listed below.
    """
    # Algorithms to compare.
    candidate_classes = (
        LinearRegression,
        Ridge,
        Lasso,
        ElasticNet,
        SVR,
        KNeighborsRegressor,
        RandomForestRegressor,
        GradientBoostingRegressor,
        ExtraTreesRegressor,
    )

    # Pass each regressor through the pipeline and keep its properties.
    return [
        pipeline(candidate, X_train, y_train, X_test, y_test)
        for candidate in candidate_classes
    ]

In [None]:
def get_properties():
    """Execute the pipeline and tabulate the per-regressor properties.

    Returns:
        pandas.DataFrame indexed by regressor name with the columns
        "Training Times", "Training Scores", "Testing Scores", "RMSE" and
        "MAE", one row per regressor returned by ``execute_pipeline``.
    """
    records = execute_pipeline()

    # Property key in each record -> column label in the resulting table.
    label_for_key = {
        "train_time": "Training Times",
        "train_score": "Training Scores",
        "test_score": "Testing Scores",
        "rmse": "RMSE",
        "MAE": "MAE",
    }

    columns = {
        label: [record[key] for record in records]
        for key, label in label_for_key.items()
    }
    row_names = [record["name"] for record in records]

    return pd.DataFrame(index=row_names, data=columns)

# Obtain the properties in a structured DataFrame after executing the pipeline
properties = get_properties()
properties 