In [None]:
import os
import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.size'] = 14
import seaborn as sns
from collections import Counter
from tqdm import tqdm, trange

from skopt import BayesSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

import catboost as cb

import warnings
warnings.filterwarnings("ignore")

# Load Dataset

In [None]:
# Location of the Telco customer-churn CSV, relative to the working directory.
path = "/".join(["data", "WA_Fn-UseC_-Telco-Customer-Churn.csv"])

In [None]:
# Parse the CSV at `path`; its first column becomes the row index.
df = pd.read_csv(
    path,
    index_col=0,
    low_memory=False,
)
print(f"There are {df.shape[0]} samples and {df.shape[1]} features in the dataset.")
# Preview the first rows.
df.head(n=5)

# Exploratory Data Analysis

In [None]:
# Print a per-column summary of the loaded frame (used below to inspect dtypes).
df.info()

## Missing Values

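It is worth noting that the data type of the `TotalCharges` feature is `object` instead of `float64`. This is because some of its entries are blank strings rather than numbers, i.e. they are missing values. We convert these blanks to `NaN` in the next cell so that the column becomes numeric.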

In [None]:
# Convert TotalCharges from text to float. Any entry that cannot be parsed as a
# number (such as the blank ' ' placeholders) becomes NaN instead of raising,
# so the missing-value handling below sees every unparseable entry.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Count missing entries per column.
df.isna().sum()

There are 11 samples with missing values in the `TotalCharges` feature, which is only about $0.16\%$ of the dataset, so we can safely drop these samples.

In [None]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

## Data Types and Transformations

In [None]:
# Numerical features.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Categorical features: every remaining column except the label.
cat_cols = df.columns.drop(num_cols + ['Churn'])
# Fixed the "Numberical" typo in the printed message.
print(f"Numerical features: {num_cols}")
print(f"Categorical features: {cat_cols.tolist()}")

In [None]:
# Binary categorical features: columns with exactly two distinct values.
binary_cats = df[cat_cols].nunique().loc[lambda counts: counts == 2].index.tolist()
# Multi-valued categorical features: the remaining categorical columns.
multi_cats = cat_cols[~cat_cols.isin(binary_cats)]
print(f"Binary categorical features: {binary_cats}")
print(f"Multi-value categorical features: {multi_cats.tolist()}")

In [None]:
# Ordinal-encode the binary categorical columns and keep the original row index.
ore = OrdinalEncoder()
ore.fit(df[binary_cats])
binary_cat_data = pd.DataFrame(
    ore.transform(df[binary_cats]),
    index=df.index,
    columns=binary_cats,
)
binary_cat_data.head(n=5)

In [None]:
# One-hot encode the multi-valued categorical columns. The encoder output is
# densified with .toarray() and labelled with the encoder's feature names.
ohe = OneHotEncoder()
multi_cat_data = pd.DataFrame(
    ohe.fit_transform(df[multi_cats]).toarray(),
    index=df.index,
    columns=ohe.get_feature_names_out(),
)
multi_cat_data.head(n=5)

In [None]:
# Standardize the numerical columns and keep the original row index.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in the notebook, so test-row statistics influence the
# scaling — consider fitting on the training rows only.
st = StandardScaler()
num_data = pd.DataFrame(
    st.fit_transform(df[num_cols]),
    index=df.index,
    columns=num_cols,
)
num_data.head(n=5)

In [None]:
# Assemble the design matrix: binary, one-hot and scaled numerical blocks side by side.
X_new = pd.concat(
    (binary_cat_data, multi_cat_data, num_data),
    axis='columns',
)
X_new.head(n=5)

In [None]:
# Encode the label: 1 when Churn == 'Yes', 0 otherwise.
y = (df['Churn'] == 'Yes').astype('int64')

# Pie chart of the class balance. The slice labels are derived from the counted
# class values instead of being hard-coded, so a label can never be attached to
# the wrong slice if the class frequencies change.
churn_counts = y.value_counts().sort_index()
label_names = {0: 'No', 1: 'Yes'}
churn_labels = [label_names[value] for value in churn_counts.index]

plt.figure(figsize=(6, 6))
plt.pie(churn_counts, labels=churn_labels, autopct='%1.2f%%', shadow=True, startangle=90)
plt.title('Distribution of Churn')
plt.show()

## Feature Selection

### Spearman Correlation

In [None]:
# Spearman rank correlation between every feature and the label.
corr = pd.concat([X_new, y], axis=1).corr(method='spearman')
# Remove the label's self-correlation by name rather than by assuming it is the
# first entry after sorting (which breaks if another column also correlates at 1.0).
spearman_corr = corr['Churn'].drop('Churn').sort_values(ascending=False)
# Keep only features whose absolute correlation exceeds 0.1.
spearman_corr = spearman_corr[spearman_corr.abs() > 0.1]

plt.figure(figsize=(10, 10))
sns.barplot(x=spearman_corr.values, y=spearman_corr.index)
plt.title("Spearman correlation coefficient of features", fontsize=15)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.show()

### Family-wise Error Rate

In [None]:
from sklearn.feature_selection import SelectFwe

In [None]:
# Select features with SelectFwe using its default settings.
# NOTE(review): the selector is fit on the full dataset, before the train/test
# split performed later in the notebook — consider fitting on training rows only.
selector = SelectFwe().fit(X_new, y)
selected_features = list(X_new.columns[selector.get_support()])
print(f"Optimal number of features: {len(selected_features)}")
print(f"Selected features: {selected_features}")

# Classification

In [None]:
# Hold out 20% of the samples for testing, restricted to the selected features.
X_train, X_test, y_train, y_test = train_test_split(
    X_new.loc[:, selected_features],
    y,
    random_state=42,
    test_size=0.2,
)
print(f"Training set: {X_train.shape[0]} samples, {X_train.shape[1]} features")
print(f"Testing set: {X_test.shape[0]} samples, {X_test.shape[1]} features")

In [None]:
# Accuracy scores collected per model name.
res = dict()

## Logistic Regression

In [None]:
# Result slot for the logistic-regression scores.
res['LogisticRegression'] = dict()

In [None]:
# Baseline: logistic regression with default hyperparameters (before optimization).
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
res['LogisticRegression'].update(base=accuracy_score(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

In [None]:
# Hyperparameter grid for logistic regression. It is split into two sub-grids
# because not every solver supports every penalty: 'newton-cholesky' does not
# support the 'l1' penalty, so crossing it with 'l1' only produces failed fits
# (scored as NaN) and wasted compute.
c_values = [0.01, 0.1, 1, 10, 100]
lr_params = [
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
    },
    {
        'solver': ['newton-cholesky'],
        'penalty': ['l2'],
        'C': c_values,
    },
]

# 5-fold cross-validated grid search on the training set, scored by accuracy.
lr_grid = GridSearchCV(lr, lr_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
lr_grid.fit(X_train, y_train)
print(f"Best parameters: {lr_grid.best_params_}")
print(f"Best cross-validation score: {lr_grid.best_score_}")

# Evaluate the refitted best estimator on the held-out test set.
y_pred = lr_grid.predict(X_test)
lr_test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {lr_test_accuracy}")
res['LogisticRegression']['optimization'] = lr_test_accuracy

## Decision Tree

In [None]:
# Baseline decision tree with default hyperparameters. A fixed random_state makes
# the fitted tree, and therefore the reported accuracy, reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
dt_base_accuracy = accuracy_score(y_test, y_pred)
res['DecisionTreeClassifier'] = {}
res['DecisionTreeClassifier']['base'] = dt_base_accuracy
print(f"Accuracy: {dt_base_accuracy}")

In [None]:
# Hyperparameter grid for the decision tree.
# 'auto' was dropped from max_features: for classifiers it was an alias of 'sqrt'
# (already in the grid) and recent scikit-learn releases reject it. None (use all
# features, the estimator default) is searched in its place.
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(3, 17, 2),
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': [None, 'sqrt', 'log2']
}

# 5-fold cross-validated grid search on the training set, scored by accuracy.
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
dt_grid.fit(X_train, y_train)
print(f"Best parameters: {dt_grid.best_params_}")
print(f"Best cross-validation score: {dt_grid.best_score_}")

# Evaluate the refitted best estimator on the held-out test set.
y_pred = dt_grid.predict(X_test)
dt_test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {dt_test_accuracy}")
res['DecisionTreeClassifier']['optimization'] = dt_test_accuracy

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Baseline: XGBoost classifier with default hyperparameters.
clf = xgb.XGBClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
res['XGBClassifier'] = {'base': accuracy_score(y_test, y_pred)}
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Hyperopt search space for the XGBoost classifier. Entries built with hp.* are
# sampled per trial; 'n_estimators' and 'seed' are fixed constants.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 0, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=200,
    seed=0,
)

In [None]:
def objective(space):
    """Hyperopt objective: train and score one XGBoost configuration.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space. The keys read are
        'n_estimators', 'max_depth', 'gamma', 'reg_alpha', 'reg_lambda',
        'min_child_weight', 'colsample_bytree' and 'seed'.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}`` where ``accuracy`` is
        measured on a validation split carved out of the training data
        (hyperopt minimizes the loss, hence the negation).
    """
    # Early stopping and model selection use a validation split of the training
    # data; the test set stays untouched so the final test accuracy is unbiased.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=space['seed'], stratify=y_train)

    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform yields whole-number floats; these parameters are cast to int.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # reg_lambda and seed are part of the search space and must be applied.
        reg_lambda=space['reg_lambda'],
        min_child_weight=int(space['min_child_weight']),
        # colsample_bytree is a fraction in [0.5, 1); casting it to int would
        # truncate it to 0, so it is passed through as a float.
        colsample_bytree=space['colsample_bytree'],
        random_state=space['seed'],
        # Passed to the constructor: the fit-time eval_metric /
        # early_stopping_rounds keywords are deprecated since XGBoost 1.6 and
        # rejected by recent releases.
        eval_metric="auc",
        early_stopping_rounds=10,
    )

    # The last eval_set entry (the validation split) drives early stopping.
    evaluation = [(X_fit, y_fit), (X_val, y_val)]
    clf.fit(X_fit, y_fit, eval_set=evaluation, verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(X_val)
    accuracy = accuracy_score(y_val, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

In [None]:
# Run 200 TPE-guided evaluations of `objective` over `space`; every trial is
# recorded in `trials`.
trials = Trials()
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=200,
    trials=trials,
)

In [None]:
# Display the result returned by fmin.
best_hyperparams

In [None]:
# Refit XGBoost on the full training set with the best hyperparameters found.
model = xgb.XGBClassifier(
    n_estimators=space['n_estimators'],
    # quniform yields whole-number floats; these parameters are cast to int.
    max_depth=int(best_hyperparams['max_depth']),
    gamma=best_hyperparams['gamma'],
    reg_alpha=int(best_hyperparams['reg_alpha']),
    # reg_lambda is part of the search space, so the searched value is applied.
    reg_lambda=float(best_hyperparams['reg_lambda']),
    min_child_weight=int(best_hyperparams['min_child_weight']),
    # colsample_bytree is a fraction in [0.5, 1); int() would truncate it to 0,
    # so it is kept as a float.
    colsample_bytree=float(best_hyperparams['colsample_bytree']),
    # Fixed seed taken from the search space, for a reproducible final model.
    random_state=space['seed'],
)
model.fit(X_train, y_train)

# Evaluate once on the held-out test set and record the score.
y_pred = model.predict(X_test)
xgb_test_accuracy = accuracy_score(y_test, y_pred)
res['XGBClassifier']['optimization'] = xgb_test_accuracy
print(f"Accuracy: {xgb_test_accuracy}")