In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
import json

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.drop(columns=['customerID'], inplace=True)

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [3]:
replace_maps = {
    'YesNo': {'Yes': 1, 'No': 0},
    'MaleFemale': {'Male': 1, 'Female': 0},
    'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
    'MultipleLines': {'No phone service': -1, 'No': 0, 'Yes': 1},
    'PaymentMethod': {
        'Electronic check': 0,
        'Mailed check': 1,
        'Bank transfer (automatic)': 2,
        'Credit card (automatic)': 3
    },
    'InternetService': {'DSL': 1, 'Fiber optic': 2, 'No': 0},
    'InternetServiceRelated': {'No internet service': -1, 'No': 0, 'Yes': 1}
}

internet_cols = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies'
]


In [4]:
def replace_categories(df):
    yes_no_cols = [col for col in df.select_dtypes(include='object').columns if df[col].isin(['Yes', 'No']).any()]
    df[yes_no_cols] = df[yes_no_cols].replace(replace_maps['YesNo'])

    mf_cols = [col for col in df.select_dtypes(include='object').columns if df[col].isin(['Male', 'Female']).any()]
    df[mf_cols] = df[mf_cols].replace(replace_maps['MaleFemale'])

    for col in ['Contract', 'MultipleLines', 'PaymentMethod', 'InternetService']:
        if col in df.columns:
            df[col] = df[col].replace(replace_maps[col])

    df[internet_cols] = df[internet_cols].replace(replace_maps['InternetServiceRelated'])
    
    return df

df = replace_categories(df)


  df[yes_no_cols] = df[yes_no_cols].replace(replace_maps['YesNo'])
  df[mf_cols] = df[mf_cols].replace(replace_maps['MaleFemale'])
  df[col] = df[col].replace(replace_maps[col])
  df[internet_cols] = df[internet_cols].replace(replace_maps['InternetServiceRelated'])


In [5]:
X = df.drop('Churn', axis=1).values
y = df['Churn'].values


In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 64,
    'max_depth': 10,
    'min_data_in_leaf': 30,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'verbose': -1,
    'is_unbalance': True
}


In [8]:
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid)

model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
)

y_pred = model.predict(X_valid)
y_pred_labels = (y_pred > 0.5).astype(int)
print(f"LightGBM accuracy: {accuracy_score(y_valid, y_pred_labels):.4f}")


Training until validation scores don't improve for 50 rounds
[50]	valid_0's binary_logloss: 0.485162
[100]	valid_0's binary_logloss: 0.461384
[150]	valid_0's binary_logloss: 0.457068
[200]	valid_0's binary_logloss: 0.458616
Early stopping, best iteration is:
[150]	valid_0's binary_logloss: 0.457068
LightGBM accuracy: 0.7764


In [9]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, valid_idx in folds.split(X, y):
    train_data = lgb.Dataset(X[train_idx], label=y[train_idx])
    valid_data = lgb.Dataset(X[valid_idx], label=y[valid_idx])
    
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]
    )
    
    y_pred = model.predict(X[valid_idx])
    y_pred_labels = (y_pred > 0.5).astype(int)
    scores.append(accuracy_score(y[valid_idx], y_pred_labels))

print(f"CV accuracy scores: {scores}")
print(f"Mean CV accuracy: {np.mean(scores):.4f}")


Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.452319
[200]	valid_0's binary_logloss: 0.4446
Early stopping, best iteration is:
[185]	valid_0's binary_logloss: 0.444119
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.467965
Early stopping, best iteration is:
[134]	valid_0's binary_logloss: 0.465295
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.455624
[200]	valid_0's binary_logloss: 0.449154
Early stopping, best iteration is:
[184]	valid_0's binary_logloss: 0.448487
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.471171
Early stopping, best iteration is:
[128]	valid_0's binary_logloss: 0.468596
Training until validation scores don't improve for 50 rounds
[100]	valid_0's binary_logloss: 0.460081
[200]	valid_0's binary_logloss: 0.455478
Early stopping, best iteration is:
[155]	valid_0's binary_logloss

In [10]:
def save_preprocessing_config(config, filepath):
    with open(filepath, 'w') as f:
        json.dump(config, f, indent=4)

save_preprocessing_config({'mappings': replace_maps}, 'preprocessing_config.json')


In [11]:
model.save_model('model.txt')

<lightgbm.basic.Booster at 0x210c3a86fb0>