### 1. Importing Packages and Loading Data

In [1]:
import pandas as pd
from zipfile import ZipFile
import os
 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import math

In [2]:
file_path = './iranian+churn+dataset.zip'

with ZipFile(file_path) as z:
    print(z.namelist())

['Customer Churn.csv']


In [3]:
filename = z.namelist()[0]
with ZipFile(file_path).open(filename) as f:
    df = pd.read_csv(f)

In [4]:
df.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [5]:
df.drop(columns=['Age'], inplace=True) # Drop duplicate feature

### 2. Splitting Data into Train and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Use stratified split
X = df.drop(columns = ['Churn']) # dataframe
y = df['Churn'] # series

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=312, stratify=y)

# Reset Index
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print(f'# of train samples: {len(X_train)}')
print(f'# of test samples: {len(X_test)}')

# of train samples: 2520
# of test samples: 630


In [7]:
X_train.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Customer Value
0,17,0,43,1,4683,80,19,42,1,1,1,366.465
1,7,0,40,0,1015,20,20,10,3,1,2,121.4
2,3,0,20,0,740,18,1,6,2,1,1,38.61
3,0,0,23,0,3165,58,3,21,3,1,1,140.92
4,0,0,35,0,0,0,0,0,2,1,2,0.0


In [8]:
y_train.head()

0    0
1    0
2    0
3    0
4    1
Name: Churn, dtype: int64

In [9]:
categorical_features = [c for c in X_train.columns if X_train[c].nunique() < 12]
categorical_features

['Complains', 'Charge  Amount', 'Age Group', 'Tariff Plan', 'Status']

In [10]:
continuous_features = [c for c in X_train.columns if c not in categorical_features]
continuous_features

['Call  Failure',
 'Subscription  Length',
 'Seconds of Use',
 'Frequency of use',
 'Frequency of SMS',
 'Distinct Called Numbers',
 'Customer Value']

### 3. Baseline Logistic Regression and Feature Importance

#### 3.1 Setting Custom Cross-Validation Methods

In [11]:
metric_list = ['accuracy', 'precision', 'recall', 'f1_score']

def matrix_to_metrics(mat):
    TP = mat[1, 1]
    TN = mat[0, 0]
    FP = mat[0, 1]
    FN = mat[1, 0]
    
    acc = (TP + TN) / (TP + TN + FP + FN)
    prec = TP / (TP + FP) 
    rec = TP / (TP + FN)
    f1_score = 2 * (prec * rec) / (prec + rec)
    
    return [acc, prec, rec, f1_score]

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import time
import datetime
from sklearn.metrics import confusion_matrix

In [13]:
def cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set
        
        # scaling
        if scaler is not None:   
            # previous parameters not retained
            X_trn = scaler.fit_transform(X_trn)  
            X_val = scaler.transform(X_val) 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [14]:
def get_summary(result_dict):
    cols = []
    avgs = []
    for key in result_dict:
        cols.append(key)
        avgs.append(result_dict[key][-1]['average'])
    return pd.DataFrame({'Metric': cols, 'Average': avgs}) 

In [15]:
def compare_models(df_dict): 
    n = len(df_dict)
    for idx, item in enumerate(df_dict.items()):
        model_name, result = item
        if idx == 0:
            merged_df = result.rename(columns={'Average': model_name})
        else:
            merged_df = pd.merge(merged_df, result.rename(columns={'Average': model_name}), on='Metric')

    merged_df = merged_df.set_index('Metric')
    merged_df['best_model'] = merged_df.idxmax(axis=1)
    return merged_df

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
def run_lr(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [18]:
from sklearn.compose import ColumnTransformer

def customize_scaler(to_scale, to_skip):
    scaler = ColumnTransformer(
                transformers=[('process', StandardScaler(), to_scale), 
                    ('skip', 'passthrough', to_skip)])
    return scaler

#### 3.2 Preliminary Results

In [19]:
skips = ['Complains', 'Tariff Plan', 'Status']
uses = [c for c in X_train.columns if c not in skips]
uses

['Call  Failure',
 'Subscription  Length',
 'Charge  Amount',
 'Seconds of Use',
 'Frequency of use',
 'Frequency of SMS',
 'Distinct Called Numbers',
 'Age Group',
 'Customer Value']

In [20]:
base_scaler = customize_scaler(uses, skips)
base_scaler

In [21]:
vanilla_dict = {}
vanilla_dict['data'] = [X_train, y_train]
vanilla_dict['class'] = LogisticRegression
vanilla_dict['param'] = {}
vanilla_dict['drop'] = []
vanilla_dict['scaler'] = base_scaler

In [22]:
wrapper = {'vanilla': vanilla_dict}
vanilla_lr = run_lr(wrapper)

0.045 seconds


In [23]:
vanilla_lr['vanilla']

Unnamed: 0,Metric,Average
0,accuracy,0.89
1,precision,0.786
2,recall,0.418
3,f1_score,0.539


#### 3.3 Feature Importance

In [24]:
import statsmodels.api as sm

In [25]:
def show_coef(data=[X_train, y_train], scaler=base_scaler):
    
    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    lr_coef, stat_coef = [], []
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set
        
        X_trn = scaler.fit_transform(X_trn)  
        X_val = scaler.transform(X_val) 
        
        # sklearn
        lr = LogisticRegression(penalty='none')
        lr.fit(X_trn, y_trn)
        lr_coef.append(abs(lr.coef_[0]))

        # stat
        X_trn_with_const = sm.add_constant(X_trn)  # 절편 추가
        stat_lr = sm.Logit(y_trn, X_trn_with_const).fit(disp=0)      
        stat_coef.append(np.abs(stat_lr.params[1:]))

    mean_lr_coef = np.mean(np.array(lr_coef), axis=0)
    mean_stat_coef = np.mean(np.array(stat_coef), axis=0)
    
    # 결과 출력
    coef_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Sklearn': mean_lr_coef,
        'Stat': mean_stat_coef
    })
    
    return coef_df

In [26]:
coef_df = show_coef()

In [27]:
coef_df.sort_values(by='Sklearn', ascending=False)

Unnamed: 0,Feature,Sklearn,Stat
5,Frequency of use,4.279884,4.281035
9,Tariff Plan,4.078896,4.078946
8,Age Group,3.208922,3.210262
4,Seconds of Use,2.874512,2.87479
11,Customer Value,1.343939,1.344012
0,Call Failure,0.86819,0.868236
3,Charge Amount,0.656174,0.655975
2,Subscription Length,0.637828,0.637782
10,Status,0.364513,0.363998
6,Frequency of SMS,0.330085,0.330061


In [28]:
drop_candidates = coef_df.sort_values(by='Sklearn', ascending=False)['Feature'][-5:].values.tolist()
drop_candidates

['Subscription  Length',
 'Status',
 'Frequency of SMS',
 'Complains',
 'Distinct Called Numbers']

#### 3.4 Feature Selection Based on CV Performance

In [29]:
drop_dicts = {}

for i in range(1, len(drop_candidates)+1):
    cur_drop = f'drop_{i}cols'
    drop_dicts[cur_drop] = {}
    cur_dict = drop_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    cur_dict['param'] = {'penalty':'none'}
    cur_dict['drop'] = drop_candidates[-i:]
    binaries = ['Complains', 'Tariff Plan', 'Status']
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [30]:
drop_result = run_lr(drop_dicts)

0.054 seconds
0.042 seconds
0.033 seconds
0.029 seconds
0.029 seconds


In [31]:
compare_models(drop_result)

Unnamed: 0_level_0,drop_1cols,drop_2cols,drop_3cols,drop_4cols,drop_5cols,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
accuracy,0.889,0.858,0.851,0.858,0.858,drop_1cols
precision,0.776,0.583,0.552,0.627,0.632,drop_1cols
recall,0.426,0.361,0.338,0.261,0.262,drop_1cols
f1_score,0.54,0.44,0.412,0.362,0.364,drop_1cols


In [32]:
drop_l2_dicts = {}

for i in range(0, 3):
    cur_drop = f'drop_{i}cols'
    drop_l2_dicts[cur_drop] = {}
    cur_dict = drop_l2_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    cur_dict['param'] = {}
    if i != 0:
        cur_dict['drop'] = drop_candidates[-i:]
    else:
        cur_dict['drop'] = []
    binaries = ['Complains', 'Tariff Plan', 'Status']
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [33]:
drop_l2_result = run_lr(drop_l2_dicts)

0.041 seconds
0.041 seconds
0.036 seconds


In [34]:
compare_models(drop_l2_result)

Unnamed: 0_level_0,drop_0cols,drop_1cols,drop_2cols,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,0.89,0.891,0.852,drop_1cols
precision,0.786,0.808,0.56,drop_1cols
recall,0.418,0.415,0.311,drop_0cols
f1_score,0.539,0.541,0.393,drop_1cols


### 4. PCA Feature Generation
	• EDA identified variables with linear relationships.
	• Applied PCA to these variables and combined the resulting feature with the original dataset.

In [35]:
from sklearn.decomposition import PCA

In [36]:
def apply_PCA(data, num_comp):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    pca = PCA(n_components=num_comp)
    pcs = pca.fit_transform(scaled_data)
    cols = [f'PC{i}' for i in range(1, num_comp+1)]
    pca_df = pd.DataFrame(data=pcs, columns=cols)
    return pca_df

In [37]:
corr_cols = ['Seconds of Use', 'Frequency of use', 'Frequency of SMS', 'Customer Value']
pca_cols = apply_PCA(X_train[corr_cols], 2)
other_cols = X_train[[col for col in X_train.columns if col not in corr_cols]]

PCA_X_train = pd.concat([pca_cols, other_cols], axis=1)
PCA_X_train.head()

Unnamed: 0,PC1,PC2,Call Failure,Complains,Subscription Length,Charge Amount,Distinct Called Numbers,Age Group,Tariff Plan,Status
0,-0.198412,-0.493741,17,0,43,1,42,1,1,1
1,-1.425834,0.216111,7,0,40,0,10,3,1,2
2,-1.633379,0.093897,3,0,20,0,6,2,1,1
3,-0.87182,-0.41057,0,0,23,0,21,3,1,1
4,-1.926092,0.287207,0,0,35,0,0,2,1,2


In [38]:
pca_wrapper = {}
pca_wrapper['pca'] = {}
cur_dict = pca_wrapper['pca'] 
cur_dict['data'] = [PCA_X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {}
binaries = ['Complains', 'Tariff Plan', 'Status']
skips = binaries + pca_cols.columns.tolist()
uses = [c for c in PCA_X_train.columns if c not in skips]
cur_dict['scaler'] = customize_scaler(uses, skips)

In [39]:
cur_dict['scaler']

In [40]:
pca_result = run_lr(pca_wrapper)

0.032 seconds


In [41]:
pca_result['pca']

Unnamed: 0,Metric,Average
0,accuracy,0.891
1,precision,0.787
2,recall,0.43
3,f1_score,0.551


### 5. Spline Transformation

In [42]:
from sklearn.preprocessing import SplineTransformer

In [43]:
def spline_cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set

        # spline & scaler transformation
        X_trn = scaler.fit_transform(X_trn) 
        X_val = scaler.transform(X_val) 

        # 원래 변환 전 칼럼을 드랍해야 함 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [44]:
def run_spline(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = spline_cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [45]:
binaries = ['Complains', 'Tariff Plan', 'Status']
discretes = [c for c in X_train.columns if c not in binaries and c not in continuous_features]

In [46]:
from sklearn.pipeline import Pipeline
# spline + scaler
spline_scaler_pipeline = Pipeline([
    ('spline', SplineTransformer(degree=3, n_knots=4)),  # Spline 변환
    ('scaler', StandardScaler()) 
])

spline_scaler = ColumnTransformer(
                transformers=[('both', spline_scaler_pipeline, continuous_features), 
                              ('scaler_only', StandardScaler(), discretes),
                              ('skip', 'passthrough', binaries)])

In [47]:
spline_scaler

In [48]:
sample_X = X_train.sample(10, random_state=42)
sample_X.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Customer Value
196,3,1,39,1,60,3,5,2,2,1,1,25.335
986,2,1,33,0,1960,48,52,33,3,1,2,288.32
1090,2,0,26,0,1323,36,28,68,4,1,1,103.975
387,11,0,40,0,15665,190,161,35,2,1,1,1437.975
2221,0,0,3,0,1103,11,0,1,2,1,1,50.13


In [49]:
spline_sample_X = spline_scaler.fit_transform(sample_X)

In [50]:
sample_X.shape

(10, 12)

In [51]:
len(continuous_features)

7

In [52]:
spline_sample_X.shape

(10, 47)

In [53]:
pd.DataFrame(spline_sample_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,-0.635823,0.227795,1.040435,-0.538567,-0.440303,-0.333424,-0.333333,-0.333562,-0.553495,-0.417461,...,0.738015,-0.507647,-0.459447,-0.440907,-0.334035,0.5,-0.654654,1.0,1.0,1.0
1,-0.262911,0.638359,0.444685,-0.632513,-0.440303,-0.333424,-0.333333,-0.333562,-0.182001,1.27272,...,-0.13904,1.662688,-0.309798,-0.440907,-0.334035,-0.75,0.436436,1.0,1.0,2.0
2,-0.262911,0.638359,0.444685,-0.632513,-0.440303,-0.333424,-0.333333,-0.331505,2.405941,1.979277,...,0.638292,0.085398,-0.455446,-0.440907,-0.334035,-0.75,1.527525,0.0,1.0,1.0
3,-0.830509,-1.365887,-0.860671,2.311302,0.550959,-0.33261,-0.333333,-0.333562,-0.554581,-0.655062,...,-1.862796,-1.546766,0.399587,2.818153,2.999995,-0.75,-0.654654,0.0,1.0,1.0
4,1.494372,1.052908,-0.713821,-0.672069,-0.440303,-0.333424,3.0,2.999999,1.483242,-1.564134,...,0.727483,-0.335309,-0.459322,-0.440907,-0.334035,-0.75,-0.654654,0.0,1.0,1.0
5,-0.830509,-1.2903,0.850406,1.48447,-0.336916,-0.333424,-0.333333,-0.333562,-0.554581,-0.655062,...,0.692409,-0.126515,-0.458264,-0.440907,-0.334035,1.75,1.527525,1.0,1.0,1.0
6,-0.830509,-1.365887,-1.548076,0.078032,2.868077,3.0,-0.333333,-0.333562,-0.554581,-0.655062,...,-0.326658,1.919246,-0.243057,-0.440907,-0.334035,1.75,0.436436,0.0,1.0,1.0
7,1.494372,1.052908,-0.713821,-0.672069,-0.440303,-0.333424,-0.333333,-0.333562,-0.418801,0.71708,...,0.691867,-0.124003,-0.458242,-0.440907,-0.334035,-0.75,-1.745743,0.0,1.0,1.0
8,-0.829942,-0.641164,1.77,-0.054005,-0.440303,-0.333424,-0.333333,-0.333562,-0.525252,0.130427,...,-1.862796,-0.847617,2.902656,0.7091,-0.327718,0.5,0.436436,0.0,1.0,1.0
9,1.494372,1.052908,-0.713821,-0.672069,-0.440303,-0.333424,-0.333333,-0.333562,-0.545891,-0.152724,...,0.703223,-0.179475,-0.458668,-0.440907,-0.334035,-0.75,-0.654654,0.0,1.0,1.0


In [54]:
spline_wrapper = {}
spline_wrapper['spline'] = {}
cur_dict = spline_wrapper['spline'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'max_iter': 300}
cur_dict['scaler'] = spline_scaler

In [55]:
spline_result = run_spline(spline_wrapper)

0.107 seconds


In [56]:
spline_result['spline']

Unnamed: 0,Metric,Average
0,accuracy,0.922
1,precision,0.817
2,recall,0.653
3,f1_score,0.723


### 6. Penalty and Solver Comparison

In [57]:
comp_wrapper = {}

In [58]:
comp_wrapper['saga_spline'] = {}    
cur_dict = comp_wrapper['saga_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [59]:
comp_wrapper['l1_saga_spline'] = {}    
cur_dict = comp_wrapper['l1_saga_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'penalty':'l1', 'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [60]:
comp_results = run_spline(comp_wrapper) # l1_saga_fold3_convergence warning

5.953 seconds
7.925 seconds


In [61]:
compare_models(comp_results)

Unnamed: 0_level_0,saga_spline,l1_saga_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.921,0.921,saga_spline
precision,0.815,0.808,saga_spline
recall,0.648,0.653,l1_saga_spline
f1_score,0.719,0.719,saga_spline


### 7. Handling Class Imbalance

#### 7.1 SMOTE

In [62]:
# Pipeline-Wrapped Code for Reference
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import cross_validate as sk_cv

pipe = Pipeline(steps = [('smote', SMOTENC(categorical_features, random_state=42)), 
                      ('base_scaler', base_scaler),
                      ('logisticregression', LogisticRegression())])
pipe.fit(X_train, y_train)

# cross validation using intra-fold sampling
sk_cv(pipe, X_train, y_train)

{'fit_time': array([0.06233716, 0.06191421, 0.06287599, 0.06179571, 0.06166506]),
 'score_time': array([0.00101185, 0.0010159 , 0.00101924, 0.00101519, 0.00102496]),
 'test_score': array([0.83333333, 0.84722222, 0.85119048, 0.83531746, 0.84920635])}

In [63]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [64]:
np.mean(np.array([0.83168317, 0.82630273, 0.88337469, 0.85111663, 0.85111663]))

0.84871877

In [65]:
def smote_cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set

        # smote applying on only training set
        smote_nc = SMOTENC(categorical_features, random_state=42)
        X_trn, y_trn = smote_nc.fit_resample(X_trn, y_trn)
        
        # scaling
        # previous parameters not retained
        X_trn = scaler.fit_transform(X_trn) 
        X_val = scaler.transform(X_val) 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [66]:
def run_smote(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = smote_cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [67]:
smote_wrapper = {'smote': {}}
cur_dict = smote_wrapper['smote']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['drop'] = []
cur_dict['scaler'] = base_scaler 

In [68]:
smote_wrapper['smote_spline'] = {}
cur_dict = smote_wrapper['smote_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['drop'] = []
cur_dict['scaler'] = spline_scaler

In [69]:
smote_result = run_smote(smote_wrapper) 
compare_models(smote_result)

0.506 seconds
7.907 seconds


Unnamed: 0_level_0,smote,smote_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.846,0.867,smote_spline
precision,0.511,0.549,smote_spline
recall,0.856,0.881,smote_spline
f1_score,0.638,0.674,smote_spline


#### 7.2 Model Weight Adjustment

In [70]:
weight_wrapper = {}
weight_wrapper['weight'] = {}
cur_dict = weight_wrapper['weight'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'}
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict['scaler'] = base_scaler

In [71]:
weight_smote_wrapper = {}
weight_smote_wrapper['smote'] = {}
cur_dict = weight_smote_wrapper['smote'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'}
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict['scaler'] = base_scaler

In [72]:
weight_only = run_lr(weight_wrapper)
weight_only['weight']

0.109 seconds


Unnamed: 0,Metric,Average
0,accuracy,0.843
1,precision,0.503
2,recall,0.886
3,f1_score,0.64


In [73]:
weight_smote = run_smote(weight_smote_wrapper)
weight_smote['smote']

0.449 seconds


Unnamed: 0,Metric,Average
0,accuracy,0.846
1,precision,0.511
2,recall,0.856
3,f1_score,0.638


### 8. Final Results 

#### 8.1 Summary of CV Results

In [74]:
drops = compare_models(drop_l2_result).drop(columns=['best_model'])
drops

Unnamed: 0_level_0,drop_0cols,drop_1cols,drop_2cols
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.89,0.891,0.852
precision,0.786,0.808,0.56
recall,0.418,0.415,0.311
f1_score,0.539,0.541,0.393


In [75]:
model_adj = compare_models(comp_results).drop(columns=['best_model'])
model_adj

Unnamed: 0_level_0,saga_spline,l1_saga_spline
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.921,0.921
precision,0.815,0.808
recall,0.648,0.653
f1_score,0.719,0.719


In [76]:
smotes = compare_models(smote_result).drop(columns=['best_model'])
smotes

Unnamed: 0_level_0,smote,smote_spline
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.846,0.867
precision,0.511,0.549
recall,0.856,0.881
f1_score,0.638,0.674


In [77]:
df_dict = {'vanilla': vanilla_lr['vanilla'], 
          'pca': pca_result['pca'],
          'spline': spline_result['spline'], 
         'weight': weight_only['weight'], 'smote_weight': weight_smote['smote']}

In [78]:
merged_df = pd.DataFrame()
for key, df in df_dict.items():
    df_renamed = df.rename(columns={'Average': key})
    if merged_df.empty:
        merged_df = df_renamed
    else:
        merged_df = pd.merge(merged_df, df_renamed, on='Metric')

In [79]:
merged_df

Unnamed: 0,Metric,vanilla,pca,spline,weight,smote_weight
0,accuracy,0.89,0.891,0.922,0.843,0.846
1,precision,0.786,0.787,0.817,0.503,0.511
2,recall,0.418,0.43,0.653,0.886,0.856
3,f1_score,0.539,0.551,0.723,0.64,0.638


In [80]:
merged_df

Unnamed: 0,Metric,vanilla,pca,spline,weight,smote_weight
0,accuracy,0.89,0.891,0.922,0.843,0.846
1,precision,0.786,0.787,0.817,0.503,0.511
2,recall,0.418,0.43,0.653,0.886,0.856
3,f1_score,0.539,0.551,0.723,0.64,0.638


In [81]:
from functools import reduce

dfs = [merged_df, drops, model_adj, smotes]  # 합치려는 데이터프레임 리스트
merged_df_final = reduce(lambda left, right: pd.merge(left, right, on='Metric'), dfs)

In [82]:
merged_df_final.set_index('Metric', inplace=True)
merged_df_final['best_model'] = merged_df_final.idxmax(axis=1)

In [83]:
merged_df_final

Unnamed: 0_level_0,vanilla,pca,spline,weight,smote_weight,drop_0cols,drop_1cols,drop_2cols,saga_spline,l1_saga_spline,smote,smote_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
accuracy,0.89,0.891,0.922,0.843,0.846,0.89,0.891,0.852,0.921,0.921,0.846,0.867,spline
precision,0.786,0.787,0.817,0.503,0.511,0.786,0.808,0.56,0.815,0.808,0.511,0.549,spline
recall,0.418,0.43,0.653,0.886,0.856,0.418,0.415,0.311,0.648,0.653,0.856,0.881,weight
f1_score,0.539,0.551,0.723,0.64,0.638,0.539,0.541,0.393,0.719,0.719,0.638,0.674,spline


#### 8.2 Test Set Logistic Regression Results

In [84]:
final_wrapper = {'saga_spline': {}}
cur_dict = final_wrapper['saga_spline']
cur_dict['data'] = [X_train, y_train, X_test, y_test]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [85]:
def train(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    [X_train, y_train, X_test, y_test] = data
    
    start_time = time.time()
    
    X_train = scaler.fit_transform(X_train)  
    X_test = scaler.transform(X_test)

    model = model_class(**model_params)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    final_result = matrix_to_metrics(conf_mat)
    for i, val in enumerate(final_result):
        result_dict[metrics[i]].append(round(final_result[i], 4))
        
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    return result_dict

In [86]:
def run(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = train(data, model_class, model_params, drops, scaler)
        result_dict[name] = single_result 
    return result_dict

In [87]:
final_result = run(final_wrapper)['saga_spline']

1.416 seconds


In [88]:
pd.DataFrame(final_result)

Unnamed: 0,accuracy,precision,recall,f1_score
0,0.9413,0.8523,0.7576,0.8021
