### 1. Importing Packages and Loading Data

In [None]:
import pandas as pd
from zipfile import ZipFile
import os
 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import math

In [None]:
# Path to the zipped dataset, relative to the current working directory.
file_path = './iranian+churn+dataset.zip'

# Print the archive's member names to see which file(s) it contains.
with ZipFile(file_path) as z:
    print(z.namelist())

In [None]:
# Load the first archive member into a DataFrame.
# The archive is opened in its own `with` block so that both the archive and
# the member stream are closed on exit (the earlier
# `ZipFile(file_path).open(...)` form never closed the archive handle).
with ZipFile(file_path) as z:
    filename = z.namelist()[0]
    with z.open(filename) as f:
        df = pd.read_csv(f)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# Remove 'Age' in place; the author treats it as a duplicate feature
# (presumably of 'Age Group' - TODO confirm). Re-running this cell raises
# KeyError because the column is already gone.
df.drop(columns=['Age'], inplace=True) # Drop duplicate feature

### 2. Splitting Data into Train and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the 'Churn' target.
X = df.drop(columns = ['Churn']) # dataframe
y = df['Churn'] # series

# Stratified 80/20 split; every resulting piece gets a fresh 0..n-1 index so
# that later positional (iloc) and label-based operations line up.
X_train, X_test, y_train, y_test = (
    piece.reset_index(drop=True)
    for piece in train_test_split(X, y, test_size=0.2, random_state=312, stratify=y)
)

print(f'# of train samples: {len(X_train)}')
print(f'# of test samples: {len(X_test)}')

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first values of the training target.
y_train.head()

In [None]:
# A column counts as categorical when it has fewer than 12 distinct values
# in the training set.
categorical_features = [
    name for name, n_distinct in X_train.nunique().items() if n_distinct < 12
]
categorical_features

In [None]:
# Every training column that was not classified as categorical above.
continuous_features = [c for c in X_train.columns if c not in categorical_features]
continuous_features

### 3. Baseline Logistic Regression and Feature Importance

#### 3.1 Setting Custom Cross-Validation Methods

In [None]:
# Metric names, in the same order as the values returned by matrix_to_metrics.
metric_list = ['accuracy', 'precision', 'recall', 'f1_score']


def matrix_to_metrics(mat):
    """Derive accuracy, precision, recall and F1 from a 2x2 confusion matrix.

    Parameters
    ----------
    mat : 2-D array supporting ``mat[i, j]`` indexing
        Confusion matrix indexed as ``mat[true_label, predicted_label]``
        with the positive class at index 1.

    Returns
    -------
    list of float
        ``[accuracy, precision, recall, f1_score]``. A ratio whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of NaN.
    """
    tp, tn = mat[1, 1], mat[0, 0]
    fp, fn = mat[0, 1], mat[1, 0]

    def _ratio(numerator, denominator):
        # Guard against 0/0, which would otherwise yield NaN plus a warning.
        return float(numerator) / float(denominator) if denominator else 0.0

    acc = _ratio(tp + tn, tp + tn + fp + fn)
    prec = _ratio(tp, tp + fp)
    rec = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (prec * rec), prec + rec)

    return [acc, prec, rec, f1_score]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import time
import datetime
from sklearn.metrics import confusion_matrix

In [None]:
def cross_validate(data, model_class, model_params, drops, scaler):
    """Run 5-fold cross-validation and collect per-fold classification metrics.

    Parameters
    ----------
    data : sequence of (features DataFrame, target Series)
    model_class : callable
        Estimator class; instantiated per fold as ``model_class(**model_params)``.
    model_params : dict
        Keyword arguments for ``model_class``.
    drops : list
        Column names removed from the features before splitting.
    scaler : transformer or None
        Re-fit on each fold's training part and applied to both parts;
        skipped when None.

    Returns
    -------
    dict
        Maps each metric name to the list of per-fold scores (rounded to
        4 decimals) followed by a final ``{'average': ...}`` entry (mean of
        the rounded scores, rounded to 3 decimals).

    Prints the elapsed wall-clock time of the fold loop.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    result_dict = {name: [] for name in metrics}

    features, target = data
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    # drop the requested (e.g. correlated) columns before splitting
    features = features.drop(columns=drops, axis=1)

    start_time = time.time()

    for trn_idx, val_idx in splitter.split(features):
        X_trn = features.iloc[trn_idx]  # analysis set
        X_val = features.iloc[val_idx]  # assessment set
        y_trn = target.iloc[trn_idx]
        y_val = target.iloc[val_idx]

        if scaler is not None:
            # fit_transform re-fits, so nothing carries over from earlier folds
            X_trn = scaler.fit_transform(X_trn)
            X_val = scaler.transform(X_val)

        model = model_class(**model_params)
        model.fit(X_trn, y_trn)

        fold_result = matrix_to_metrics(confusion_matrix(y_val, model.predict(X_val)))
        for name, score in zip(metrics, fold_result):
            result_dict[name].append(round(score, 4))

    elapsed_time = time.time() - start_time
    print(f"{elapsed_time:.3f} seconds")

    for scores in result_dict.values():
        mean_score = sum(scores) / len(scores)
        scores.append({'average': round(mean_score, 3)})

    return result_dict

In [None]:
def get_summary(result_dict):
    """Condense a cross-validation result dict into a two-column summary.

    Each value of ``result_dict`` is expected to end with a
    ``{'average': value}`` entry; that value is taken as the metric's average.

    Returns a DataFrame with columns 'Metric' and 'Average', one row per key.
    """
    names = list(result_dict)
    averages = [result_dict[name][-1]['average'] for name in names]
    return pd.DataFrame({'Metric': names, 'Average': averages})

In [None]:
def compare_models(df_dict):
    """Merge per-model metric summaries side by side and flag the best model.

    Parameters
    ----------
    df_dict : dict
        Maps a model name to a DataFrame with columns 'Metric' and 'Average'
        (the shape produced by ``get_summary``).

    Returns
    -------
    DataFrame
        Indexed by 'Metric', one column per model holding its average, plus
        a 'best_model' column naming the column with the highest value in
        each row.

    Raises
    ------
    ValueError
        If ``df_dict`` is empty.
    """
    if not df_dict:
        raise ValueError('compare_models() needs at least one model summary')

    merged_df = None
    for model_name, result in df_dict.items():
        renamed = result.rename(columns={'Average': model_name})
        if merged_df is None:
            merged_df = renamed
        else:
            merged_df = pd.merge(merged_df, renamed, on='Metric')

    merged_df = merged_df.set_index('Metric')
    merged_df['best_model'] = merged_df.idxmax(axis=1)
    return merged_df

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def run_lr(model_dict):
    """Cross-validate every configuration in ``model_dict``.

    Each value must be a dict with the keys 'data', 'class', 'param', 'drop'
    and 'scaler', which are forwarded to ``cross_validate``. A header line
    with the configuration name is printed before each run.

    Returns a dict mapping each configuration name to its ``get_summary``
    DataFrame.
    """
    summaries = {}
    for name, spec in model_dict.items():
        print(name, '='* (20-len(name)))
        fold_scores = cross_validate(
            spec['data'], spec['class'], spec['param'], spec['drop'], spec['scaler']
        )
        summaries[name] = get_summary(fold_scores)
    return summaries

In [None]:
from sklearn.compose import ColumnTransformer

def customize_scaler(to_scale, to_skip):
    """Build a ColumnTransformer that standardizes some columns only.

    ``to_scale`` columns go through a StandardScaler (step 'process');
    ``to_skip`` columns are passed through untouched (step 'skip').
    """
    steps = [
        ('process', StandardScaler(), to_scale),
        ('skip', 'passthrough', to_skip),
    ]
    return ColumnTransformer(transformers=steps)

#### 3.2 Preliminary Results

In [None]:
# Columns left unscaled (listed as `binaries` elsewhere in this notebook).
skips = ['Complains', 'Tariff Plan', 'Status']
# All remaining training columns, which will be standardized.
uses = [c for c in X_train.columns if c not in skips]
uses

In [None]:
# Baseline preprocessing: standardize `uses`, pass `skips` through unchanged.
base_scaler = customize_scaler(uses, skips)
base_scaler

In [None]:
# Baseline configuration: LogisticRegression with its default parameters,
# no dropped columns, and the baseline scaler.
vanilla_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'param': {},
    'drop': [],
    'scaler': base_scaler,
}

In [None]:
# run_lr expects {name: configuration}; wrap the single baseline configuration.
wrapper = {'vanilla': vanilla_dict}
vanilla_lr = run_lr(wrapper)

In [None]:
# Show the baseline's average cross-validation metrics.
vanilla_lr['vanilla']

#### 3.3 Feature Importance

In [None]:
import statsmodels.api as sm

In [None]:
def _output_feature_names(fitted_scaler, fallback):
    """Return feature names in the column order produced by ``fitted_scaler``."""
    if not hasattr(fitted_scaler, 'get_feature_names_out'):
        return list(fallback)
    names = [str(name) for name in fitted_scaler.get_feature_names_out()]
    if getattr(fitted_scaler, 'verbose_feature_names_out', False) is True:
        # ColumnTransformer prefixes each name with '<transformer name>__'.
        names = [name.split('__', 1)[-1] for name in names]
    return names


def show_coef(data=[X_train, y_train], scaler=base_scaler):
    """Average absolute logistic-regression coefficients over 5 CV folds.

    For each fold's training part the scaler is re-fit, then an unpenalized
    sklearn LogisticRegression and a statsmodels Logit (with intercept) are
    fit on the scaled data. Absolute coefficients are averaged across folds.

    Parameters
    ----------
    data : sequence of (features DataFrame, target Series)
        Defaults to the notebook's training set (bound when the function is
        defined; the list is never mutated here).
    scaler : transformer
        Must provide ``fit_transform``.

    Returns
    -------
    DataFrame
        Columns 'Feature', 'Sklearn' and 'Stat', one row per scaled column.
        Feature names follow the scaler's OUTPUT column order: a
        ColumnTransformer emits its transformers' columns in the order the
        transformers are listed, which generally differs from the input
        column order, so labelling with the input columns would attach
        names to the wrong coefficients.
    """
    features, target = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    lr_coef, stat_coef = [], []
    feature_names = None

    for trn_idx, _ in kf.split(features):
        # only the analysis (training) part of each fold is needed here
        X_trn = scaler.fit_transform(features.iloc[trn_idx])
        y_trn = target.iloc[trn_idx]

        if feature_names is None:
            feature_names = _output_feature_names(scaler, features.columns)

        # sklearn
        # NOTE(review): newer scikit-learn releases expect penalty=None
        # instead of the string 'none' - confirm against the installed version.
        lr = LogisticRegression(penalty='none')
        lr.fit(X_trn, y_trn)
        lr_coef.append(abs(lr.coef_[0]))

        # statsmodels
        X_trn_with_const = sm.add_constant(X_trn)  # add the intercept column
        stat_lr = sm.Logit(y_trn, X_trn_with_const).fit(disp=0)
        stat_coef.append(np.abs(stat_lr.params[1:]))  # skip the intercept

    mean_lr_coef = np.mean(np.array(lr_coef), axis=0)
    mean_stat_coef = np.mean(np.array(stat_coef), axis=0)

    # assemble the result table
    coef_df = pd.DataFrame({
        'Feature': feature_names,
        'Sklearn': mean_lr_coef,
        'Stat': mean_stat_coef
    })

    return coef_df

In [None]:
# Compute the coefficient table with the default training data and scaler.
coef_df = show_coef()

In [None]:
# Rank features by mean absolute sklearn coefficient, largest first.
coef_df.sort_values(by='Sklearn', ascending=False)

In [None]:
# The five features with the smallest mean absolute sklearn coefficient,
# kept in descending order (so the last entry is the smallest of all).
drop_candidates = coef_df.sort_values(by='Sklearn', ascending=False)['Feature'][-5:].values.tolist()
drop_candidates

#### 3.4 Feature Selection Based on CV Performance

In [None]:
# One unpenalized logistic-regression configuration per candidate drop set:
# 'drop_{i}cols' removes the last i entries of `drop_candidates`.
drop_dicts = {}

for i in range(1, len(drop_candidates)+1):
    cur_drop = f'drop_{i}cols'
    drop_dicts[cur_drop] = {}
    cur_dict = drop_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    # NOTE(review): newer scikit-learn releases expect penalty=None instead
    # of the string 'none' - confirm against the installed version.
    cur_dict['param'] = {'penalty':'none'}
    cur_dict['drop'] = drop_candidates[-i:]
    binaries = ['Complains', 'Tariff Plan', 'Status']
    # The scaler must only reference columns that survive the drop.
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [None]:
# Cross-validate every drop configuration.
drop_result = run_lr(drop_dicts)

In [None]:
# Side-by-side averages of the drop configurations, with the best per metric.
compare_models(drop_result)

In [None]:
# Same experiment with LogisticRegression's default parameters (empty
# 'param'), dropping 0, 1 or 2 of the weakest candidates.
drop_l2_dicts = {}

for i in range(0, 3):
    cur_drop = f'drop_{i}cols'
    drop_l2_dicts[cur_drop] = {}
    cur_dict = drop_l2_dicts[cur_drop]
    cur_dict['data'] = [X_train, y_train]
    cur_dict['class'] = LogisticRegression
    cur_dict['param'] = {}
    # i == 0 needs its own branch: drop_candidates[-0:] would be the whole list.
    if i != 0:
        cur_dict['drop'] = drop_candidates[-i:]
    else:
        cur_dict['drop'] = []
    binaries = ['Complains', 'Tariff Plan', 'Status']
    # The scaler must only reference columns that survive the drop.
    skips = [c for c in binaries if c not in cur_dict['drop']]
    uses = [c for c in X_train.columns if c not in skips and c not in cur_dict['drop']]
    cur_dict['scaler'] = customize_scaler(uses, skips)

In [None]:
# Cross-validate the default-parameter drop configurations.
drop_l2_result = run_lr(drop_l2_dicts)

In [None]:
# Side-by-side averages of the default-parameter drop configurations.
compare_models(drop_l2_result)

### 4. PCA Feature Generation
- EDA identified a group of variables that are linearly related to one another.
- PCA was applied to these variables, and the resulting principal components replaced them alongside the remaining original features.

In [None]:
from sklearn.decomposition import PCA

In [None]:
def apply_PCA(data, num_comp):
    """Standardize ``data`` and project it onto its first principal components.

    Parameters
    ----------
    data : DataFrame or array-like
        Columns to compress.
    num_comp : int
        Number of principal components to keep.

    Returns
    -------
    DataFrame
        Columns 'PC1' .. 'PC{num_comp}'. When ``data`` is a pandas object its
        index is carried over, so the result can be concatenated column-wise
        with other columns of the same frame without misaligning rows.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    pca = PCA(n_components=num_comp)
    pcs = pca.fit_transform(scaled_data)
    cols = [f'PC{i}' for i in range(1, num_comp+1)]
    # Keep the caller's row labels; fall back to a default index otherwise.
    row_index = data.index if isinstance(data, (pd.DataFrame, pd.Series)) else None
    pca_df = pd.DataFrame(data=pcs, columns=cols, index=row_index)
    return pca_df

In [None]:
# Replace the four correlated columns with two principal components.
# NOTE(review): the scaler and PCA inside apply_PCA are fit on the whole
# training set before cross-validation, so the validation folds used later
# are not independent of the fitted components.
corr_cols = ['Seconds of Use', 'Frequency of use', 'Frequency of SMS', 'Customer Value']
pca_cols = apply_PCA(X_train[corr_cols], 2)
other_cols = X_train[[col for col in X_train.columns if col not in corr_cols]]

# Column-wise concatenation aligns rows on the index.
PCA_X_train = pd.concat([pca_cols, other_cols], axis=1)
PCA_X_train.head()

In [None]:
# The principal components come from standardized inputs inside apply_PCA,
# so they are passed through unscaled together with the binary columns.
binaries = ['Complains', 'Tariff Plan', 'Status']
skips = binaries + pca_cols.columns.tolist()
uses = [c for c in PCA_X_train.columns if c not in skips]

cur_dict = {
    'data': [PCA_X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {},
    'scaler': customize_scaler(uses, skips),
}
pca_wrapper = {'pca': cur_dict}

In [None]:
# Inspect the scaler built for the PCA configuration.
cur_dict['scaler']

In [None]:
# Cross-validate the PCA configuration.
pca_result = run_lr(pca_wrapper)

In [None]:
# Show the PCA configuration's average cross-validation metrics.
pca_result['pca']

### 4. Spline Transformation

In [None]:
from sklearn.preprocessing import SplineTransformer

In [None]:
def spline_cross_validate(data, model_class, model_params, drops, scaler):
    """Cross-validate a model whose preprocessing includes a spline expansion.

    The previous body was a line-for-line copy of ``cross_validate`` (the
    only difference being that it required a scaler), so this now delegates
    to it. ``scaler`` is expected to be a transformer that performs both the
    spline expansion and the scaling. The author's original note here said
    the pre-transformation columns must be dropped; whether the supplied
    transformer already does that depends on how it is built.

    Parameters and return value are identical to ``cross_validate``.
    """
    return cross_validate(data, model_class, model_params, drops, scaler)

In [None]:
def run_spline(model_dict):
    """Cross-validate every configuration via ``spline_cross_validate``.

    Each value of ``model_dict`` must provide the keys 'data', 'class',
    'param', 'drop' and 'scaler'. A header line with the configuration name
    is printed before each run.

    Returns a dict mapping each configuration name to its ``get_summary``
    DataFrame.
    """
    summaries = {}
    for name, spec in model_dict.items():
        print(name, '='* (20-len(name)))
        fold_scores = spline_cross_validate(
            spec['data'], spec['class'], spec['param'], spec['drop'], spec['scaler']
        )
        summaries[name] = get_summary(fold_scores)
    return summaries

In [None]:
# Columns passed through without scaling.
binaries = ['Complains', 'Tariff Plan', 'Status']
# Columns that are neither in `binaries` nor classified as continuous.
discretes = [c for c in X_train.columns if c not in binaries and c not in continuous_features]

In [None]:
from sklearn.pipeline import Pipeline
# spline + scaler
# Cubic spline expansion followed by standardization.
spline_scaler_pipeline = Pipeline([
    ('spline', SplineTransformer(degree=3, n_knots=4)),  # spline transformation
    ('scaler', StandardScaler()) 
])

# Continuous columns get spline + scaling, discrete columns scaling only,
# and the binary columns are passed through unchanged.
spline_scaler = ColumnTransformer(
                transformers=[('both', spline_scaler_pipeline, continuous_features), 
                              ('scaler_only', StandardScaler(), discretes),
                              ('skip', 'passthrough', binaries)])

In [None]:
# Inspect the combined spline/scaling transformer.
spline_scaler

In [None]:
# Take a reproducible 10-row sample to sanity-check the transformer output.
sample_X = X_train.sample(10, random_state=42)
sample_X.head()

In [None]:
# Fit the transformer on the sample and transform it (this is only a shape check).
spline_sample_X = spline_scaler.fit_transform(sample_X)

In [None]:
# Shape before the transformation.
sample_X.shape

In [None]:
# Number of columns that receive the spline expansion.
len(continuous_features)

In [None]:
# Shape after the transformation, for comparison with the input shape.
spline_sample_X.shape

In [None]:
# View the transformed sample as a table.
pd.DataFrame(spline_sample_X)

In [None]:
# Spline configuration: LogisticRegression with max_iter raised to 300.
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {'max_iter': 300},
    'scaler': spline_scaler,
}
spline_wrapper = {'spline': cur_dict}

In [None]:
# Cross-validate the spline configuration.
spline_result = run_spline(spline_wrapper)

In [None]:
# Show the spline configuration's average cross-validation metrics.
spline_result['spline']

### 5. Penalty and Solver Comparison

In [None]:
# Container for the solver/penalty configurations added in the next cells.
comp_wrapper = {}

In [None]:
# saga solver with the default penalty, on spline features.
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {'solver':'saga', 'max_iter':5000},
    'scaler': spline_scaler,
}
comp_wrapper['saga_spline'] = cur_dict

In [None]:
# saga solver with an L1 penalty, on spline features.
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {'penalty':'l1', 'solver':'saga', 'max_iter':5000},
    'scaler': spline_scaler,
}
comp_wrapper['l1_saga_spline'] = cur_dict

In [None]:
# Cross-validate both solver/penalty configurations.
# Author's note: the l1 + saga run emitted a convergence warning on fold 3.
comp_results = run_spline(comp_wrapper) # l1_saga_fold3_convergence warning

In [None]:
# Side-by-side averages of the solver/penalty configurations.
compare_models(comp_results)

### 5. Handling Class Imbalance

#### 5.1 SMOTE

In [None]:
# Pipeline-Wrapped Code for Reference
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import cross_validate as sk_cv

# Reference implementation: oversampling, scaling and the classifier chained
# in a single imblearn Pipeline.
# NOTE(review): this `Pipeline` is imblearn's and rebinds the sklearn
# `Pipeline` name imported earlier in the notebook.
pipe = Pipeline(steps = [('smote', SMOTENC(categorical_features, random_state=42)), 
                      ('base_scaler', base_scaler),
                      ('logisticregression', LogisticRegression())])
pipe.fit(X_train, y_train)

# cross validation using intra-fold sampling
sk_cv(pipe, X_train, y_train)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [None]:
# Mean of five hard-coded scores (presumably the per-fold test scores
# printed by the sk_cv call above - TODO confirm).
np.mean(np.array([0.83168317, 0.82630273, 0.88337469, 0.85111663, 0.85111663]))

In [None]:
def smote_cross_validate(data, model_class, model_params, drops, scaler, cat_features=None):
    """5-fold cross-validation with oversampling applied inside each fold.

    Only the training part of each fold is resampled; the validation part is
    left untouched so the scores reflect the original class distribution.

    Parameters
    ----------
    data : sequence of (features DataFrame, target Series)
    model_class : callable
        Estimator class; instantiated per fold as ``model_class(**model_params)``.
    model_params : dict
        Keyword arguments for ``model_class``.
    drops : list
        Column names removed from the features before splitting.
    scaler : transformer or None
        Re-fit on each resampled training part; skipped when None (same
        convention as ``cross_validate``).
    cat_features : list, optional
        Categorical columns for SMOTENC. Defaults to the notebook-level
        ``categorical_features``. Names that were removed by ``drops`` are
        filtered out so SMOTENC is never asked for a missing column; if no
        categorical column remains, plain SMOTE is used instead.

    Returns
    -------
    dict
        Same layout as ``cross_validate``: per-fold scores (rounded to
        4 decimals) followed by an ``{'average': ...}`` entry per metric.

    Prints the elapsed wall-clock time of the fold loop.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    result_dict = {m: [] for m in metrics}

    X_train, y_train = data
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # removing correlated columns
    X_train = X_train.drop(columns=drops, axis=1)

    if cat_features is None:
        cat_features = categorical_features
    cat_features = list(cat_features)
    if all(isinstance(c, str) for c in cat_features):
        # Column names: keep only those still present after the drop.
        remaining = set(X_train.columns)
        cat_features = [c for c in cat_features if c in remaining]

    start_time = time.time()

    for trn_idx, val_idx in kf.split(X_train):
        X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]  # analysis / assessment features
        y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]  # analysis / assessment targets

        # oversample the training part only
        if cat_features:
            sampler = SMOTENC(cat_features, random_state=42)
        else:
            sampler = SMOTE(random_state=42)
        X_trn, y_trn = sampler.fit_resample(X_trn, y_trn)

        # scaling (fit_transform re-fits, so earlier folds leave no trace)
        if scaler is not None:
            X_trn = scaler.fit_transform(X_trn)
            X_val = scaler.transform(X_val)

        # model fitting
        model = model_class(**model_params)
        model.fit(X_trn, y_trn)

        # model evaluation
        y_pred = model.predict(X_val)
        conf_mat = confusion_matrix(y_val, y_pred)
        fold_result = matrix_to_metrics(conf_mat)
        for name, score in zip(metrics, fold_result):
            result_dict[name].append(round(score, 4))

    elapsed_time = time.time() - start_time
    print(f"{elapsed_time:.3f} seconds")

    for scores in result_dict.values():
        scores.append({'average': round(sum(scores) / len(scores), 3)})

    return result_dict

In [None]:
def run_smote(model_dict):
    """Cross-validate every configuration via ``smote_cross_validate``.

    Each value of ``model_dict`` must provide the keys 'data', 'class',
    'param', 'drop' and 'scaler'. A header line with the configuration name
    is printed before each run.

    Returns a dict mapping each configuration name to its ``get_summary``
    DataFrame.
    """
    summaries = {}
    for name, spec in model_dict.items():
        print(name, '='* (20-len(name)))
        fold_scores = smote_cross_validate(
            spec['data'], spec['class'], spec['param'], spec['drop'], spec['scaler']
        )
        summaries[name] = get_summary(fold_scores)
    return summaries

In [None]:
# SMOTE with the baseline scaler and the saga solver.
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'param': {'solver':'saga', 'max_iter':5000},
    'drop': [],
    'scaler': base_scaler,
}
smote_wrapper = {'smote': cur_dict}

In [None]:
# SMOTE combined with the spline transformer and the saga solver.
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'param': {'solver':'saga', 'max_iter':5000},
    'drop': [],
    'scaler': spline_scaler,
}
smote_wrapper['smote_spline'] = cur_dict

In [None]:
# Cross-validate both SMOTE configurations and compare their averages.
smote_result = run_smote(smote_wrapper) 
compare_models(smote_result)

#### 5.2 Model Weight Adjustment

In [None]:
# Class-weighted logistic regression (no resampling) with the baseline scaler.
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'},
    'scaler': base_scaler,
}
weight_wrapper = {'weight': cur_dict}

In [None]:
# Class-weighted logistic regression, to be run through the SMOTE
# cross-validation so both imbalance remedies are combined.
binaries = ['Complains', 'Tariff Plan', 'Status']
cur_dict = {
    'data': [X_train, y_train],
    'class': LogisticRegression,
    'drop': [],
    'param': {'solver':'saga', 'max_iter':5000, 'class_weight':'balanced'},
    'scaler': base_scaler,
}
weight_smote_wrapper = {'smote': cur_dict}

In [None]:
# Class weighting alone, evaluated with the plain cross-validation.
weight_only = run_lr(weight_wrapper)
weight_only['weight']

In [None]:
# Class weighting plus SMOTE, evaluated with the SMOTE cross-validation.
weight_smote = run_smote(weight_smote_wrapper)
weight_smote['smote']

### 6. Final Results 

#### 6.1 Summary of CV Results

In [None]:
# Averages of the default-parameter drop configurations, without the
# 'best_model' column so the table can be merged with the others.
drops = compare_models(drop_l2_result).drop(columns=['best_model'])
drops

In [None]:
# Averages of the solver/penalty configurations, without 'best_model'.
model_adj = compare_models(comp_results).drop(columns=['best_model'])
model_adj

In [None]:
# Averages of the SMOTE configurations, without 'best_model'.
smotes = compare_models(smote_result).drop(columns=['best_model'])
smotes

In [None]:
# Single-configuration summaries, keyed by the column name each should get
# in the final table.
df_dict = {'vanilla': vanilla_lr['vanilla'], 
          'pca': pca_result['pca'],
          'spline': spline_result['spline'], 
         'weight': weight_only['weight'], 'smote_weight': weight_smote['smote']}

In [None]:
# Merge the per-model summaries into one wide table keyed by 'Metric'.
# The loop variable is named `summary_df` rather than `df` so that the raw
# dataset bound to `df` earlier in the notebook is not overwritten.
merged_df = pd.DataFrame()
for key, summary_df in df_dict.items():
    df_renamed = summary_df.rename(columns={'Average': key})
    if merged_df.empty:
        merged_df = df_renamed
    else:
        merged_df = pd.merge(merged_df, df_renamed, on='Metric')

In [None]:
# Show the merged single-configuration summaries.
merged_df

In [None]:
# Same display as the previous cell (duplicate).
merged_df

In [None]:
from functools import reduce

dfs = [merged_df, drops, model_adj, smotes]  # list of DataFrames to merge
# Fold the list left to right, merging each table in on 'Metric'.
merged_df_final = reduce(lambda left, right: pd.merge(left, right, on='Metric'), dfs)

In [None]:
# Index by metric, then record for each metric the column with the highest
# value. Re-running this cell fails because 'Metric' is already the index.
merged_df_final.set_index('Metric', inplace=True)
merged_df_final['best_model'] = merged_df_final.idxmax(axis=1)

In [None]:
# Final comparison table across all evaluated configurations.
merged_df_final

#### 6.2 Test Set Logistic Regression Results

In [None]:
# Final model: saga solver on spline features, trained on the full training
# set and scored on the held-out test set.
cur_dict = {
    'data': [X_train, y_train, X_test, y_test],
    'class': LogisticRegression,
    'drop': [],
    'param': {'solver':'saga', 'max_iter':5000},
    'scaler': spline_scaler,
}
final_wrapper = {'saga_spline': cur_dict}

In [None]:
def train(data, model_class, model_params, drops, scaler):
    """Fit on the full training set and score once on the held-out test set.

    Parameters
    ----------
    data : sequence of (X_train, y_train, X_test, y_test)
    model_class : callable
        Estimator class; instantiated as ``model_class(**model_params)``.
    model_params : dict
        Keyword arguments for ``model_class``.
    drops : list
        Column names removed from both feature sets before scaling. This
        argument used to be accepted but ignored; it is now applied, as in
        the cross-validation helpers. An empty list changes nothing.
    scaler : transformer
        Fit on the training features, then applied to both feature sets.

    Returns
    -------
    dict
        Maps each metric name to a one-element list holding the test score
        rounded to 4 decimals.

    Prints the elapsed wall-clock time of scaling, fitting and scoring.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    result_dict = {m: [] for m in metrics}

    X_train, y_train, X_test, y_test = data

    # Only touch the frames when there is something to drop, so callers that
    # pass an empty list see exactly the previous behaviour.
    if len(drops) > 0:
        X_train = X_train.drop(columns=drops)
        X_test = X_test.drop(columns=drops)

    start_time = time.time()

    # The scaler is fit on the training data only, then reused for the test data.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = model_class(**model_params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    conf_mat = confusion_matrix(y_test, y_pred)
    final_result = matrix_to_metrics(conf_mat)
    for name, score in zip(metrics, final_result):
        result_dict[name].append(round(score, 4))

    elapsed_time = time.time() - start_time
    print(f"{elapsed_time:.3f} seconds")

    return result_dict

In [None]:
def run(model_dict):
    """Train and test every configuration in ``model_dict`` via ``train``.

    Each value must provide the keys 'data', 'class', 'param', 'drop' and
    'scaler'. A header line with the configuration name is printed before
    each run.

    Returns a dict mapping each configuration name to the raw result dict
    returned by ``train``.
    """
    outcomes = {}
    for name, spec in model_dict.items():
        print(name, '='* (20-len(name)))
        outcomes[name] = train(
            spec['data'], spec['class'], spec['param'], spec['drop'], spec['scaler']
        )
    return outcomes

In [None]:
# Train the final model and keep its test-set metrics.
final_result = run(final_wrapper)['saga_spline']

In [None]:
# Show the test-set metrics as a one-row table.
pd.DataFrame(final_result)