### 1. Importing Packages and Loading Data

In [1]:
import pandas as pd
from zipfile import ZipFile
import os
 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import math

In [2]:
file_path = './iranian+churn+dataset.zip'

with ZipFile(file_path) as z:
    print(z.namelist())

['Customer Churn.csv']


In [3]:
filename = z.namelist()[0]
with ZipFile(file_path).open(filename) as f:
    df = pd.read_csv(f)

In [4]:
df.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0


In [5]:
df.drop(columns=['Age'], inplace=True) # Drop duplicate feature

### 2. Splitting Data into Train and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Use stratified split
X = df.drop(columns = ['Churn']) # dataframe
y = df['Churn'] # series

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Reset Index
X_train.reset_index(drop=True, inplace=True)
X_valid.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

y_train.reset_index(drop=True, inplace=True)
y_valid.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

print(f'# of train samples: {len(X_train)}')
print(f'# of valid samples: {len(X_valid)}')
print(f'# of test samples: {len(X_test)}')

# of train samples: 2016
# of valid samples: 504
# of test samples: 630


In [7]:
X_train.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Customer Value
0,18,0,22,2,6392,147,93,43,2,2,1,712.755
1,5,0,22,0,3530,61,7,24,3,1,1,171.64
2,6,0,35,0,2665,47,143,19,2,1,1,765.54
3,0,0,22,0,210,15,0,11,2,1,1,10.125
4,14,0,38,2,7428,86,116,37,2,1,1,860.13


In [8]:
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: Churn, dtype: int64

In [9]:
categorical_features = [c for c in X_train.columns if X_train[c].nunique() < 12]
categorical_features

['Complains', 'Charge  Amount', 'Age Group', 'Tariff Plan', 'Status']

In [10]:
continuous_features = [c for c in X_train.columns if c not in categorical_features]
continuous_features

['Call  Failure',
 'Subscription  Length',
 'Seconds of Use',
 'Frequency of use',
 'Frequency of SMS',
 'Distinct Called Numbers',
 'Customer Value']

### 3. Baseline Logistic Regression and Feature Importance

#### 3.1 Setting Custom Cross-Validation Methods

In [11]:
metric_list = ['accuracy', 'precision', 'recall', 'f1_score']

def matrix_to_metrics(mat):
    TP = mat[1, 1]
    TN = mat[0, 0]
    FP = mat[0, 1]
    FN = mat[1, 0]
    
    acc = (TP + TN) / (TP + TN + FP + FN)
    prec = TP / (TP + FP) 
    rec = TP / (TP + FN)
    f1_score = 2 * (prec * rec) / (prec + rec)
    
    return [acc, prec, rec, f1_score]

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import time
import datetime
from sklearn.metrics import confusion_matrix

In [13]:
def cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set
        
        # scaling
        if scaler is not None:   
            # previous parameters not retained
            X_trn = scaler.fit_transform(X_trn)  
            X_val = scaler.transform(X_val) 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [14]:
def get_summary(result_dict):
    cols = []
    avgs = []
    for key in result_dict:
        cols.append(key)
        avgs.append(result_dict[key][-1]['average'])
    return pd.DataFrame({'Metric': cols, 'Average': avgs}) 

In [15]:
def compare_models(df_dict): 
    n = len(df_dict)
    for idx, item in enumerate(df_dict.items()):
        model_name, result = item
        if idx == 0:
            merged_df = result.rename(columns={'Average': model_name})
        else:
            merged_df = pd.merge(merged_df, result.rename(columns={'Average': model_name}), on='Metric')

    merged_df = merged_df.set_index('Metric')
    merged_df['best_model'] = merged_df.idxmax(axis=1)
    return merged_df

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
def run_lr(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [18]:
from sklearn.compose import ColumnTransformer

def customize_scaler(to_scale, to_skip):
    scaler = ColumnTransformer(
                transformers=[('process', StandardScaler(), to_scale), 
                    ('skip', 'passthrough', to_skip)])
    return scaler

#### 3.2 Preliminary Results

In [19]:
skips = ['Complains', 'Tariff Plan', 'Status']
uses = [c for c in X_train.columns if c not in skips]
uses

['Call  Failure',
 'Subscription  Length',
 'Charge  Amount',
 'Seconds of Use',
 'Frequency of use',
 'Frequency of SMS',
 'Distinct Called Numbers',
 'Age Group',
 'Customer Value']

In [20]:
base_scaler = customize_scaler(uses, skips)
base_scaler

In [21]:
vanilla_dict = {}
vanilla_dict['data'] = [X_train, y_train]
vanilla_dict['class'] = LogisticRegression
vanilla_dict['param'] = {}
vanilla_dict['drop'] = []
vanilla_dict['scaler'] = base_scaler

In [22]:
wrapper = {'vanilla': vanilla_dict}
vanilla_lr = run_lr(wrapper)

0.041 seconds


In [23]:
vanilla_lr['vanilla']

Unnamed: 0,Metric,Average
0,accuracy,0.891
1,precision,0.756
2,recall,0.455
3,f1_score,0.562


#### 3.3 Feature Importance

In [24]:
import statsmodels.api as sm

In [25]:
def show_coef(data=[X_train, y_train], scaler=base_scaler):
    
    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    lr_coef, stat_coef = [], []
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set
        
        X_trn = scaler.fit_transform(X_trn)  
        X_val = scaler.transform(X_val) 
        
        # sklearn
        lr = LogisticRegression(penalty='none')
        lr.fit(X_trn, y_trn)
        lr_coef.append(abs(lr.coef_[0]))

        # stat
        X_trn_with_const = sm.add_constant(X_trn)  # 절편 추가
        stat_lr = sm.Logit(y_trn, X_trn_with_const).fit(disp=0)      
        stat_coef.append(np.abs(stat_lr.params[1:]))

    mean_lr_coef = np.mean(np.array(lr_coef), axis=0)
    mean_stat_coef = np.mean(np.array(stat_coef), axis=0)
    
    # 결과 출력
    coef_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Sklearn': mean_lr_coef,
        'Stat': mean_stat_coef
    })
    
    return coef_df

In [26]:
coef_df = show_coef()

In [27]:
coef_df.sort_values(by='Sklearn', ascending=False)

Unnamed: 0,Feature,Sklearn,Stat
5,Frequency of use,5.828212,5.827944
8,Age Group,4.692406,4.692001
9,Tariff Plan,3.860397,3.860348
4,Seconds of Use,3.487916,3.487936
11,Customer Value,1.428817,1.428827
0,Call Failure,0.904517,0.904521
2,Subscription Length,0.58985,0.58988
10,Status,0.507591,0.507764
3,Charge Amount,0.413948,0.414167
1,Complains,0.378449,0.378465


In [28]:
drop_candidates = coef_df.sort_values(by='Sklearn', ascending=False)['Feature'][-5:].values.tolist()
drop_candidates

['Status',
 'Charge  Amount',
 'Complains',
 'Frequency of SMS',
 'Distinct Called Numbers']

#### 3.4 Feature Selection Based on CV Performance

In [29]:
drop_dicts = {}

for i in range(1, len(drop_candidates)+1):
    cur_drop = f'drop_{i}cols'
    drop_dicts[cur_drop] = {}
    cur_dict = drop_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    cur_dict['param'] = {'penalty':'none'}
    cur_dict['drop'] = drop_candidates[-i:]
    binaries = ['Complains', 'Tariff Plan', 'Status']
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [30]:
drop_result = run_lr(drop_dicts)

0.048 seconds
0.033 seconds
0.029 seconds
0.028 seconds
0.027 seconds


In [31]:
compare_models(drop_result)

Unnamed: 0_level_0,drop_1cols,drop_2cols,drop_3cols,drop_4cols,drop_5cols,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
accuracy,0.888,0.89,0.852,0.839,0.844,drop_2cols
precision,0.723,0.755,0.551,0.484,0.518,drop_2cols
recall,0.465,0.449,0.368,0.304,0.184,drop_1cols
f1_score,0.561,0.556,0.438,0.369,0.265,drop_1cols


In [32]:
drop_l2_dicts = {}

for i in range(0, 3):
    cur_drop = f'drop_{i}cols'
    drop_l2_dicts[cur_drop] = {}
    cur_dict = drop_l2_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    cur_dict['param'] = {}
    if i != 0:
        cur_dict['drop'] = drop_candidates[-i:]
    else:
        cur_dict['drop'] = []
    binaries = ['Complains', 'Tariff Plan', 'Status']
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [33]:
drop_l2_result = run_lr(drop_l2_dicts)

0.040 seconds
0.035 seconds
0.029 seconds


In [34]:
compare_models(drop_l2_result)

Unnamed: 0_level_0,drop_0cols,drop_1cols,drop_2cols,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,0.891,0.889,0.889,drop_0cols
precision,0.756,0.755,0.756,drop_0cols
recall,0.455,0.443,0.443,drop_0cols
f1_score,0.562,0.55,0.551,drop_0cols


### 4. PCA Feature Generation
	• EDA identified variables with linear relationships.
	• Applied PCA to these variables and combined the resulting feature with the original dataset.

In [35]:
from sklearn.decomposition import PCA

In [36]:
def apply_PCA(data, num_comp):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    pca = PCA(n_components=num_comp)
    pcs = pca.fit_transform(scaled_data)
    cols = [f'PC{i}' for i in range(1, num_comp+1)]
    pca_df = pd.DataFrame(data=pcs, columns=cols)
    return pca_df

In [37]:
corr_cols = ['Seconds of Use', 'Frequency of use', 'Frequency of SMS', 'Customer Value']
pca_cols = apply_PCA(X_train[corr_cols], 2)
other_cols = X_train[[col for col in X_train.columns if col not in corr_cols]]

PCA_X_train = pd.concat([pca_cols, other_cols], axis=1)
PCA_X_train.head()

Unnamed: 0,PC1,PC2,Call Failure,Complains,Subscription Length,Charge Amount,Distinct Called Numbers,Age Group,Tariff Plan,Status
0,1.278962,-0.564671,18,0,22,2,43,2,2,1
1,-0.739324,-0.43372,5,0,22,0,24,3,1,1
2,0.166651,1.006538,6,0,35,0,19,2,1,1
3,-1.758256,0.155644,0,0,22,0,11,2,1,1
4,1.103067,0.071642,14,0,38,2,37,2,1,1


In [38]:
pca_wrapper = {}
pca_wrapper['pca'] = {}
cur_dict = pca_wrapper['pca'] 
cur_dict['data'] = [PCA_X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {}
binaries = ['Complains', 'Tariff Plan', 'Status']
skips = binaries + pca_cols.columns.tolist()
uses = [c for c in PCA_X_train.columns if c not in skips]
cur_dict['scaler'] = customize_scaler(uses, skips)

In [39]:
cur_dict['scaler']

In [40]:
pca_result = run_lr(pca_wrapper)

0.032 seconds


In [41]:
pca_result['pca']

Unnamed: 0,Metric,Average
0,accuracy,0.89
1,precision,0.759
2,recall,0.452
3,f1_score,0.56


### 4. Spline Transformation

In [42]:
from sklearn.preprocessing import SplineTransformer

In [43]:
def spline_cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set

        # spline & scaler transformation
        X_trn = scaler.fit_transform(X_trn) 
        X_val = scaler.transform(X_val) 

        # 원래 변환 전 칼럼을 드랍해야 함 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [44]:
def run_spline(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = spline_cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [45]:
binaries = ['Complains', 'Tariff Plan', 'Status']
discretes = [c for c in X_train.columns if c not in binaries and c not in continuous_features]

In [46]:
from sklearn.pipeline import Pipeline
# spline + scaler
spline_scaler_pipeline = Pipeline([
    ('spline', SplineTransformer(degree=3, n_knots=4)),  # Spline 변환
    ('scaler', StandardScaler()) 
])

spline_scaler = ColumnTransformer(
                transformers=[('both', spline_scaler_pipeline, continuous_features), 
                              ('scaler_only', StandardScaler(), discretes),
                              ('skip', 'passthrough', binaries)])

In [47]:
spline_scaler

In [48]:
sample_X = X_train.sample(10, random_state=42)
sample_X.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Customer Value
1198,16,0,42,2,4735,80,17,21,3,1,1,260.6
526,2,0,36,0,6008,64,83,37,2,1,1,646.74
393,8,0,39,0,2583,58,7,31,1,1,1,183.755
1407,2,0,34,0,1773,37,18,26,3,1,2,144.4
433,29,0,36,3,15365,229,51,30,3,1,1,827.76


In [49]:
spline_sample_X = spline_scaler.fit_transform(sample_X)

In [50]:
sample_X.shape

(10, 12)

In [51]:
len(continuous_features)

7

In [52]:
spline_sample_X.shape

(10, 47)

In [53]:
pd.DataFrame(spline_sample_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,-0.83872,-1.383064,0.236899,2.073913,-0.137712,-0.333333,-0.450107,-0.681706,-1.395867,-0.52735,...,0.587494,0.108618,-0.679281,-0.337108,-0.333333,0.968496,0.267261,0.0,1.0,1.0
1,0.457527,0.911885,-0.26494,-0.663937,-0.37432,-0.333333,-0.450107,-0.507118,0.97316,0.519272,...,-0.687903,1.518582,0.167351,-0.337108,-0.333333,-0.792406,-1.069045,0.0,1.0,1.0
2,-0.825403,-0.429048,1.679742,-0.214737,-0.37432,-0.333333,-0.450107,-0.681706,-1.008643,1.669504,...,0.754919,-0.240256,-0.708318,-0.337108,-0.333333,-0.792406,-2.405351,0.0,1.0,1.0
3,0.457527,0.911885,-0.26494,-0.663937,-0.37432,-0.333333,-0.430653,0.420283,1.304612,-0.879737,...,0.816425,-0.410671,-0.71482,-0.337108,-0.333333,-0.792406,0.267261,0.0,1.0,2.0
4,-0.83872,-1.4083,-1.849156,0.134014,2.990974,3.0,-0.450107,-0.507118,0.97316,0.519272,...,-1.163526,1.504878,1.227545,-0.334892,-0.333333,1.848947,0.267261,0.0,1.0,1.0
5,-0.54638,0.321139,0.854016,-0.559658,-0.37432,-0.333333,-0.450107,-0.681706,-1.395867,-0.52735,...,-1.552705,-1.567671,0.60989,2.999866,3.0,0.968496,0.267261,0.0,1.0,1.0
6,-0.83872,-1.352857,0.613278,1.796133,-0.233024,-0.333333,-0.450107,-0.64913,0.260859,1.30424,...,-1.375219,1.142532,2.252076,-0.305218,-0.333333,0.968496,0.267261,0.0,1.0,1.0
7,1.759635,1.05361,-0.929458,-0.671067,-0.37432,-0.333333,0.794921,1.767101,0.004302,-1.290659,...,0.861142,-0.583779,-0.717766,-0.337108,-0.333333,-0.792406,0.267261,0.0,1.0,1.0
8,-0.54638,0.321139,0.854016,-0.559658,-0.37432,-0.333333,-0.450107,-0.507118,0.97316,0.519272,...,0.881831,-0.778402,-0.718366,-0.337108,-0.333333,-0.792406,1.603567,0.0,1.0,2.0
9,1.759635,1.05361,-0.929458,-0.671067,-0.37432,-0.333333,2.786479,2.028219,-0.688877,-1.306464,...,0.877544,-0.693832,-0.718311,-0.337108,-0.333333,-0.792406,0.267261,0.0,1.0,1.0


In [54]:
spline_wrapper = {}
spline_wrapper['spline'] = {}
cur_dict = spline_wrapper['spline'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'max_iter': 300}
cur_dict['scaler'] = spline_scaler

In [55]:
spline_result = run_spline(spline_wrapper)

0.096 seconds


In [56]:
spline_result['spline']

Unnamed: 0,Metric,Average
0,accuracy,0.92
1,precision,0.803
2,recall,0.66
3,f1_score,0.72


### 5. Penalty and Solver Comparison

In [116]:
comp_wrapper = {}

In [117]:
comp_wrapper['saga_spline'] = {}    
cur_dict = comp_wrapper['saga_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [118]:
comp_wrapper['l1_saga_spline'] = {}    
cur_dict = comp_wrapper['l1_saga_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'penalty':'l1', 'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [119]:
comp_results = run_spline(comp_wrapper) # l1_saga_fold3_convergence warning

4.634 seconds
4.726 seconds


In [120]:
compare_models(comp_results)

Unnamed: 0_level_0,saga_spline,l1_saga_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.921,0.92,saga_spline
precision,0.806,0.804,saga_spline
recall,0.66,0.659,saga_spline
f1_score,0.722,0.721,saga_spline


### 5. Handling Class Imbalance

#### 5.1 SMOTE

In [62]:
# Pipeline-Wrapped Code for Reference
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import cross_validate as sk_cv

pipe = Pipeline(steps = [('smote', SMOTENC(categorical_features, random_state=42)), 
                      ('base_scaler', base_scaler),
                      ('logisticregression', LogisticRegression())])
pipe.fit(X_train, y_train)

# cross validation using intra-fold sampling
sk_cv(pipe, X_train, y_train)

{'fit_time': array([0.05360794, 0.05216193, 0.05309272, 0.05246997, 0.05350494]),
 'score_time': array([0.00110531, 0.0019989 , 0.00121713, 0.00101495, 0.00100613]),
 'test_score': array([0.83168317, 0.82630273, 0.88337469, 0.85111663, 0.85111663])}

In [63]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [64]:
np.mean(np.array([0.83168317, 0.82630273, 0.88337469, 0.85111663, 0.85111663]))

0.84871877

In [65]:
def smote_cross_validate(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    start_time = time.time()
    
    for fold in kf.split(X_train): 
        
        trn_idx, val_idx = fold
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx] # analysis set 
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx] # assessment set

        # smote applying on only training set
        smote_nc = SMOTENC(categorical_features, random_state=42)
        X_trn, y_trn = smote_nc.fit_resample(X_trn, y_trn)
        
        # scaling
        # previous parameters not retained
        X_trn = scaler.fit_transform(X_trn) 
        X_val = scaler.transform(X_val) 

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)
        
        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for i, val in enumerate(fold_result):
            result_dict[metrics[i]].append(round(fold_result[i], 4))
            
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    for key in result_dict:
        result_dict[key].append({'average': round(sum(result_dict[key]) / len(result_dict[key]), 3)})
    
    return result_dict

In [66]:
def run_smote(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = smote_cross_validate(data, model_class, model_params, drops, scaler)
        result_dict[name] = get_summary(single_result)   
    return result_dict

In [81]:
smote_wrapper = {'smote': {}}
cur_dict = smote_wrapper['smote']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['drop'] = []
cur_dict['scaler'] = base_scaler 

In [82]:
smote_wrapper['smote_spline'] = {}
cur_dict = smote_wrapper['smote_spline']
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['drop'] = []
cur_dict['scaler'] = spline_scaler

In [83]:
smote_result = run_smote(smote_wrapper) 
compare_models(smote_result)

0.442 seconds
5.992 seconds


Unnamed: 0_level_0,smote,smote_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.844,0.871,smote_spline
precision,0.509,0.561,smote_spline
recall,0.852,0.87,smote_spline
f1_score,0.634,0.681,smote_spline


#### 5.2 Model Weight Adjustment

In [70]:
weight_wrapper = {}
weight_wrapper['weight'] = {}
cur_dict = weight_wrapper['weight'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'}
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict['scaler'] = base_scaler

In [71]:
weight_smote_wrapper = {}
weight_smote_wrapper['smote'] = {}
cur_dict = weight_smote_wrapper['smote'] 
cur_dict['data'] = [X_train, y_train]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'}
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict['scaler'] = base_scaler

In [72]:
weight_only = run_lr(weight_wrapper)
weight_only['weight']

0.105 seconds


Unnamed: 0,Metric,Average
0,accuracy,0.844
1,precision,0.506
2,recall,0.888
3,f1_score,0.643


In [76]:
weight_smote = run_smote(weight_smote_wrapper)
weight_smote['smote']

0.454 seconds


Unnamed: 0,Metric,Average
0,accuracy,0.844
1,precision,0.509
2,recall,0.852
3,f1_score,0.634


### 6. Final Results 

#### 6.1 Summary of CV Results

In [85]:
drops = compare_models(drop_l2_result).drop(columns=['best_model'])
drops

Unnamed: 0_level_0,drop_0cols,drop_1cols,drop_2cols
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
accuracy,0.891,0.889,0.889
precision,0.756,0.755,0.756
recall,0.455,0.443,0.443
f1_score,0.562,0.55,0.551


In [121]:
model_adj = compare_models(comp_results).drop(columns=['best_model'])
model_adj

Unnamed: 0_level_0,saga_spline,l1_saga_spline
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.921,0.92
precision,0.806,0.804
recall,0.66,0.659
f1_score,0.722,0.721


In [87]:
smotes = compare_models(smote_result).drop(columns=['best_model'])
smotes

Unnamed: 0_level_0,smote,smote_spline
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1
accuracy,0.844,0.871
precision,0.509,0.561
recall,0.852,0.87
f1_score,0.634,0.681


In [96]:
df_dict = {'vanilla': vanilla_lr['vanilla'], 
          'pca': pca_result['pca'],
          'spline': spline_result['spline'], 
         'weight': weight_only['weight'], 'smote_weight': weight_smote['smote']}

In [97]:
merged_df = pd.DataFrame()
for key, df in df_dict.items():
    df_renamed = df.rename(columns={'Average': key})
    if merged_df.empty:
        merged_df = df_renamed
    else:
        merged_df = pd.merge(merged_df, df_renamed, on='Metric')

In [98]:
merged_df

Unnamed: 0,Metric,vanilla,pca,spline,weight,smote_weight
0,accuracy,0.891,0.89,0.92,0.844,0.844
1,precision,0.756,0.759,0.803,0.506,0.509
2,recall,0.455,0.452,0.66,0.888,0.852
3,f1_score,0.562,0.56,0.72,0.643,0.634


In [91]:
merged_df

Unnamed: 0,Metric,Average
0,accuracy,0.891
1,precision,0.756
2,recall,0.455
3,f1_score,0.562


In [122]:
from functools import reduce

dfs = [merged_df, drops, model_adj, smotes]  # 합치려는 데이터프레임 리스트
merged_df_final = reduce(lambda left, right: pd.merge(left, right, on='Metric'), dfs)

In [123]:
merged_df_final.set_index('Metric', inplace=True)
merged_df_final['best_model'] = merged_df_final.idxmax(axis=1)

In [124]:
merged_df_final

Unnamed: 0_level_0,vanilla,pca,spline,weight,smote_weight,drop_0cols,drop_1cols,drop_2cols,saga_spline,l1_saga_spline,smote,smote_spline,best_model
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
accuracy,0.891,0.89,0.92,0.844,0.844,0.891,0.889,0.889,0.921,0.92,0.844,0.871,saga_spline
precision,0.756,0.759,0.803,0.506,0.509,0.756,0.755,0.756,0.806,0.804,0.509,0.561,saga_spline
recall,0.455,0.452,0.66,0.888,0.852,0.455,0.443,0.443,0.66,0.659,0.852,0.87,weight
f1_score,0.562,0.56,0.72,0.643,0.634,0.562,0.55,0.551,0.722,0.721,0.634,0.681,saga_spline


#### 6.2 Test Set Logistic Regression Results

In [127]:
final_wrapper = {'saga_spline': {}}
cur_dict = final_wrapper['saga_spline']
cur_dict['data'] = [X_train, y_train, X_test, y_test]
cur_dict['class'] = LogisticRegression
cur_dict['drop'] = []
cur_dict['param'] = {'solver':'saga', 'max_iter':5000}
cur_dict['scaler'] = spline_scaler

In [132]:
def train(data, model_class, model_params, drops, scaler):
    result_dict = {} 
    
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    for m in metrics:
        result_dict[m] = []

    [X_train, y_train, X_test, y_test] = data
    
    start_time = time.time()
    
    X_train = scaler.fit_transform(X_train)  
    X_test = scaler.transform(X_test)

    model = model_class(**model_params)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    final_result = matrix_to_metrics(conf_mat)
    for i, val in enumerate(final_result):
        result_dict[metrics[i]].append(round(final_result[i], 4))
        
    end_time = time.time() 
    elapsed_time = end_time - start_time
    readable_time = f"{elapsed_time:.3f} seconds"
    print(readable_time)

    return result_dict

In [135]:
def run(model_dict):
    result_dict = {}
    for name in model_dict:
        print(name, '='* (20-len(name)))
        data = model_dict[name]['data']
        model_class = model_dict[name]['class']
        model_params= model_dict[name]['param']
        drops = model_dict[name]['drop']
        scaler = model_dict[name]['scaler']
        single_result = train(data, model_class, model_params, drops, scaler)
        result_dict[name] = single_result 
    return result_dict

In [136]:
final_result = run(final_wrapper)['saga_spline']

1.173 seconds


In [138]:
pd.DataFrame(final_result)

Unnamed: 0,accuracy,precision,recall,f1_score
0,0.9381,0.9167,0.6667,0.7719
