In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
%matplotlib inline
plt.style.use('seaborn-v0_8-dark-palette')

  from .autonotebook import tqdm as notebook_tqdm


In [62]:
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV

In [3]:
from catboost import CatBoostClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [4]:
df = pd.read_csv('data.csv')
df.shape

(7043, 21)

In [5]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [8]:
df.duplicated().sum()

0

In [9]:
profile = ProfileReport(df, title="Profiling Report", explorative=True, minimal=True)
#profile.to_file('report.html')
#profile.to_widgets()

In [10]:
df[df['PhoneService'] == 'No']['MultipleLines'].value_counts()

MultipleLines
No phone service    682
Name: count, dtype: int64

In [11]:
df[df['InternetService'] == 'No'][['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']].value_counts()

OnlineSecurity       OnlineBackup         DeviceProtection     TechSupport          StreamingTV          StreamingMovies    
No internet service  No internet service  No internet service  No internet service  No internet service  No internet service    1526
Name: count, dtype: int64

По результатам анализа данных видим, что 
1) Пропусков нет, заполнять ничего не нужно
2) Дубликатов нет, удалять записи не нужно
3) Данные в целом сбалансированы, нет значительногого превышение одного класса над другим. Единственный несблансированный PhoneService, в нем данные - 90/10.  
План действий:
1) Выкинем признак PhoneService, так как его дублирует MultipleLines, только в первом 2 категории а во втором 3, что более показательно, применим к нему OneHotEncoding
2) Признак InternetService дублирует данные из OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies. Предлагается оставить InternetService и применить OneHot, для остальных признаков вместо трех категорий оставить 1(чтобы не дублироваь много раз 'No internet service' и 'No')
3) В таких полей как Partner, Dependents,PaperlessBilling, Churn,  заменим значения Yes и No на 1 и 0
4) TotalCharges типа object, необходимо привести к числовому
5) Проверить корреляцию и распределние числовых данных


In [12]:
df.drop(columns=['PhoneService'], inplace=True)

In [13]:
obj_to_num_features = ['TotalCharges']
yes_no_features = ['Partner', 'Dependents', 'PaperlessBilling']
internet_services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
num_features = ['tenure', 'MonthlyCharges']

In [14]:
def change_yes_to_1(X: pd.DataFrame, y=None)->pd.DataFrame:
    for c in X.columns:
        X[c] = X.loc[:,c].map({'Yes': 1, 'No': 0})
    return X

In [15]:
def fill_not_numeric(X: pd.DataFrame, y=None)->pd.DataFrame:
    for c in X.columns:
        X[c] = X[c].apply(pd.to_numeric, errors="coerce")
        X[c] = X[c].fillna(X[X[c].notna()][c].mean())

    return X

In [16]:
def change_yes_to_1_for_InternetServices(X: pd.DataFrame, y=None)->pd.DataFrame:
    for c in X.columns:
        X[c] = X.loc[:,c].map({'Yes': 1, 'No': 0, 'No internet service': 0})
    return X

In [17]:
one_hot_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown="ignore"))])

obj_to_num_transformer = Pipeline(steps=[
    ('fill_not_numeric', FunctionTransformer(fill_not_numeric)),
    ('scaler', StandardScaler())
    ])

change_yes_to_1_transformer = Pipeline(steps=[
    ('change_yes_to_1', FunctionTransformer(change_yes_to_1))])

change_yes_to_1_for_InternetServices_transformer = Pipeline(steps=[
    ('change_yes_to_1_for_InternetServices', FunctionTransformer(change_yes_to_1_for_InternetServices))])

num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

data_transformer = ColumnTransformer(transformers=[
    ('obj_to_num_features', obj_to_num_transformer, obj_to_num_features),
    ('yes_no_features', change_yes_to_1_transformer, yes_no_features),
    ('MultipleLines_feature', one_hot_transformer, ['MultipleLines']),
    ('InternetService_feature', one_hot_transformer, ['MultipleLines']),
    ('InternetServices_feature', change_yes_to_1_for_InternetServices_transformer, internet_services),
    ('num_features', num_transformer, num_features)
])

preprocessor = Pipeline(steps=[("data_transformer", data_transformer)])

In [18]:
preprocessor

In [19]:
preprocessor.fit_transform(df).shape

(7043, 18)

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop(columns='Churn'), df['Churn'], test_size=0.8, random_state=42)

In [21]:
Y_train = Y_train.map({'Yes': 1, 'No': 0})
Y_test = Y_test.map({'Yes': 1, 'No': 0})

In [22]:
Y_test.value_counts()

Churn
0    4122
1    1513
Name: count, dtype: int64

In [46]:
sklearn_pipline = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", GradientBoostingClassifier())])
sklearn_pipline.fit(X_train, Y_train)

print(f'roc-auc_train = {roc_auc_score(Y_train, sklearn_pipline.predict_proba(X_train)[:,1])}')
print(f'roc-auc_test = {roc_auc_score(Y_test, sklearn_pipline.predict_proba(X_test)[:,1])}')

roc-auc_train = 0.9265564254282908
roc-auc_test = 0.8161253608945663


In [25]:
xgboost_pipline = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", XGBClassifier())])
xgboost_pipline.fit(X_train, Y_train)

print(f'roc-auc_train = {roc_auc_score(Y_train, xgboost_pipline.predict_proba(X_train)[:,1])}')
print(f'roc-auc_test = {roc_auc_score(Y_test, xgboost_pipline.predict_proba(X_test)[:,1])}')

roc-auc_train = 0.9985728094159867
roc-auc_test = 0.793149328815477


In [26]:
catboost_pipline = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", CatBoostClassifier(logging_level='Silent'))])

catboost_pipline.fit(X_train, Y_train)
print(f'roc-auc_train = {roc_auc_score(Y_train, catboost_pipline.predict_proba(X_train)[:,1])}')
print(f'roc-auc_test = {roc_auc_score(Y_test, catboost_pipline.predict_proba(X_test)[:,1])}')

roc-auc_train = 0.9626327060281114
roc-auc_test = 0.8187537219882801


In [32]:
lightgbm_pipline = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", LGBMClassifier())])

lightgbm_pipline.fit(X_train, Y_train)
print(f'roc-auc_train = {roc_auc_score(Y_train, lightgbm_pipline.predict_proba(X_train)[:,1])}')
print(f'roc-auc_test = {roc_auc_score(Y_test, lightgbm_pipline.predict_proba(X_test)[:,1])}')

[LightGBM] [Info] Number of positive: 356, number of negative: 1052
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 614
[LightGBM] [Info] Number of data points in the train set: 1408, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.252841 -> initscore=-1.083518
[LightGBM] [Info] Start training from score -1.083518
roc-auc_train = 0.9938920515230486
roc-auc_test = 0.7945819235075087


На тестовой выборке в лидерах пока CatBoost, попробуем подобрать гиперпараметры

In [74]:
sklearn_pipline_grid = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", GradientBoostingClassifier())])

param_grid = {
    'classifier__max_depth': [2,3,4,], 
    'classifier__n_estimators': [20,100,200], 
    #'classifier__learning_rate': [0.05, 0.1, 0.15], 
    }

search = GridSearchCV(sklearn_pipline_grid, param_grid, n_jobs=1, cv=5, scoring=make_scorer(f1_score, average='macro'))

search.fit(X_train, Y_train)

print(f'{search.best_score_=}')
print(f'{search.best_params_=}')
print(f'roc-auc_train = {roc_auc_score(Y_train, search.predict_proba(X_train)[:,1])}')
print(f'roc-auc_test = {roc_auc_score(Y_test, search.predict_proba(X_test)[:,1])}')

print(f1_score(Y_test, search.predict(X_test)))

search.best_score_=0.6806500808757294
search.best_params_={'classifier__max_depth': 3, 'classifier__n_estimators': 100}
roc-auc_train = 0.9265564254282908
roc-auc_test = 0.8162915575925674
0.5333839150227617
