In [86]:
import numpy as np
import pandas as pd
import scipy.linalg as sla
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import VarianceThreshold # Feature selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from pandas.core.dtypes.cast import maybe_unbox_datetimelike
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [87]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [88]:
# Read the Telco customer-churn table and preview its first rows.
csv_path = "/kaggle/input/telcocustomer/telco-customer-churn.csv"
data = pd.read_csv(csv_path)
data.head()

In [89]:
# Show the summary statistics that DataFrame.describe() computes for the table.
data.describe()

In [90]:
# Disabled helper: uncomment to list every column name of the table.
# data.columns.tolist()

In [91]:
# Numeric features.
# NOTE(review): 'customerid' and 'callingnum' look like identifiers rather than
# measurements — confirm they really belong among the model inputs.
num_cols = [
    'age', 'annualincome', 'calldroprate', 'callfailurerate',
    'callingnum', 'customerid', 'monthlybilledamount', 'numberofcomplaints',
    'numberofmonthunpaid', 'numdayscontractequipmentplanexpiring',
    'penaltytoswitch', 'totalminsusedinlastmonth', 'unpaidbalance',
    'percentagecalloutsidenetwork', 'totalcallduration', 'avgcallduration',
]

# Categorical features.
cat_cols = [
    'customersuspended', 'education', 'gender', 'homeowner', 'maritalstatus',
    'occupation', 'state', 'usesinternetservice', 'usesvoiceservice',
    'month',
]

# Name of the label column, and all model inputs: numeric columns first, then categorical.
target_col = 'churn'
feature_cols = [*num_cols, *cat_cols]

Проверка данных на пропуски: значения NaN и ячейки, содержащие только пробел (' ').

In [92]:
# Disabled data-quality check. When uncommented it prints the NaN count of each
# column and then reports every feature cell whose value is a single space ' '.
# NOTE(review): 'Find ' ' in' is implicit string concatenation, so the message
# reads "Find  in" instead of quoting the space character.
# print(data.isna().sum())
# for item in feature_cols:
#     for i in range(data[item].shape[0]):
#         if data[item][i] == ' ':
#             print('Find ' ' in', item ,'number =', i, '!')

In [93]:
# One histogram per numeric feature, laid out on a 6x3 grid.
fig = plt.figure(figsize=(15, 15), dpi=80)
for position, column in enumerate(num_cols, start=1):
    panel = fig.add_subplot(6, 3, position)
    panel.set_title(column)
    # Bin with numpy's defaults, then draw the pre-computed counts as bars.
    heights, edges = np.histogram(data[column])
    panel.hist(edges[:-1], edges, weights=heights)
plt.show()

In [94]:
# One pie chart per categorical feature on a 3x4 grid.
# The axes created by plt.subplots are reused directly; calling fig.add_subplot
# on top of them stacked a second set of axes over the first and left the empty
# frames visible behind the pies. sharex is dropped because a pie chart has no
# meaningful x axis to share.
fig, axes = plt.subplots(3, 4, figsize=(24, 32))
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, cat_cols):
    data[col].value_counts().plot.pie(ax=ax)

# Hide the grid cells that have no feature to show.
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)

plt.show()

In [95]:
# Bar chart of how many rows fall into each target class.
class_counts = data[target_col].value_counts()
class_counts.plot.bar()

In [96]:
# Pairwise correlation of the numeric and boolean columns.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns silently: data.corr() raises if the frame holds text.
data.select_dtypes(include=['number', 'bool']).corr()

In [97]:
# Columns used for the PCA projection. This binds a second name to the same
# list object as num_cols (no copy is made).
important_features = num_cols

In [98]:
# Project the selected numeric features onto their first two principal components.
X = data.copy()

# Split off the label column; y is not used further in this cell.
y = X.pop(target_col)
X = X.loc[:, important_features]

from sklearn.decomposition import PCA

# PCA is driven by variance, so columns with large raw magnitudes would dominate
# the components. Standardise every feature to zero mean / unit variance first.
X_scaled = StandardScaler().fit_transform(X)

# Create principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Convert to dataframe; reuse the original index so the concat below aligns
# row by row even if `data` does not carry a default 0..n-1 index.
component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)

# Original table with the PC1 / PC2 columns appended.
data_pca = pd.concat([data, X_pca], axis=1)


In [99]:
# Preview the first rows of the table with the principal-component columns appended.
data_pca.head()

In [100]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so every score below is reproducible on re-run;
# stratify keeps the class proportions of the target equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],
    data[target_col],
    train_size=0.8,
    stratify=data[target_col],
    random_state=42,
)

In [141]:
# Candidate models. Every entry holds a short name, an unfitted estimator, a
# printable description and the hyper-parameter grid to search. Grid keys use
# the '<pipeline step>__<parameter>' form. Commented-out keys are additional
# search dimensions that are currently switched off.
classifiers = [
    {
        'name': 'XGB',
        'model': XGBClassifier(),
        'description': 'XGBClassifier',
        'parameters': {
            'selector__threshold': [0, 0.001],
            # 'classifier__min_child_weight': [1, 5, 10],
            # 'classifier__gamma': [0.5, 1, 1.5, 2, 5],
            # 'classifier__subsample': [0.6, 0.8, 1.0],
            # 'classifier__colsample_bytree': [0.6, 0.8, 1.0],
            # 'classifier__max_depth': [3, 4, 5]
        },
    },
    {
        'name': 'GB',
        'model': GradientBoostingClassifier(),
        'description': 'GradientBoostingClassifier',
        'parameters': {
            'selector__threshold': [0, 0.001, 0.01],
            'classifier__n_estimators': range(20, 81, 10),
            'classifier__max_depth': (6, 8, 10),
            'classifier__min_samples_split': (10, 20, 30),
        },
    },
    {
        'name': 'KN',
        'model': KNeighborsClassifier(),
        'description': 'KNeighborsClassifier',
        'parameters': {
            'selector__threshold': [0, 0.001, 0.01],
            'classifier__n_neighbors': [1, 3, 5, 7, 10],
            'classifier__p': [1, 2],
            'classifier__leaf_size': [1, 5, 10, 15],
        },
    },
    {
        'name': 'RandomForest',
        'model': RandomForestClassifier(),
        'description': 'RandomForestClassifier',
        'parameters': {
            'selector__threshold': [0, 0.001, 0.01],
            # 'classifier__n_estimators': (5,10,15,20,30,50),
            # 'classifier__max_depth': (None,10,20,30,40),
            # 'classifier__min_samples_split': (3,2,5,10)
        },
    },
    {
        'name': 'DecisionTree',
        'model': DecisionTreeClassifier(),
        'description': 'DecisionTreeClassifier',
        'parameters': {
            'selector__threshold': [0, 0.001, 0.01],
            # 'classifier__splitter' : ('best', 'random'),
            # 'classifier__max_depth': (10,15,20,30,40,50),
            # 'classifier__min_samples_split' : (5,10,15,20)
        },
    },
]

In [142]:
def main(classifiers, cv=2):
    """Fit, tune and report one candidate classifier.

    Builds a pipeline (scale numeric columns, one-hot encode categorical
    columns, drop low-variance features, classify), fits it once with the
    model's current settings, then runs a grid search over the candidate's
    parameter grid. Accuracy on the train and test splits is printed for both
    the plain pipeline and the tuned search, followed by the best parameters.

    Reads the module-level ``num_cols``, ``cat_cols``, ``X_train``, ``X_test``,
    ``y_train`` and ``y_test``.

    Parameters
    ----------
    classifiers : dict
        A single candidate (despite the plural name) with the keys
        ``'name'``, ``'model'``, ``'description'`` and ``'parameters'``.
    cv : int, default 2
        Number of cross-validation folds handed to ``GridSearchCV``.

    Returns
    -------
    tuple or None
        ``(name, fitted GridSearchCV)`` on success, ``None`` if fitting or
        scoring raised an exception (the error is printed, not re-raised).
    """
    print(classifiers['description'])
    prepare_pipeline = ColumnTransformer([
        ('scaler', StandardScaler(), num_cols),
        # handle_unknown='ignore': a category that is absent from the rows the
        # encoder was fitted on (a CV fold or the train split) is encoded as
        # all zeros instead of raising at transform time.
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ])

    pipe = Pipeline([
        ('preparation', prepare_pipeline),
        ('selector', VarianceThreshold()),
        ('classifier', classifiers['model']),
    ])

    try:
        # Scores of the pipeline with the model's current (untuned) settings.
        pipe.fit(X_train, y_train)
        print('Training set score: ' + str(pipe.score(X_train, y_train)))
        print('Test set score: ' + str(pipe.score(X_test, y_test)))

        # Scores after the grid search.
        grid = GridSearchCV(pipe, classifiers['parameters'], cv=cv).fit(X_train, y_train)
        print('Training set score: ' + str(grid.score(X_train, y_train)))
        print('Test set score: ' + str(grid.score(X_test, y_test)))
        print(grid.best_params_)
        return classifiers['name'], grid
    except Exception as exc:
        # Exception rather than BaseException, so that Ctrl-C / kernel interrupt
        # still stops a long search. The cause is printed so that a failing
        # candidate can be diagnosed.
        print('error fit', classifiers['description'], '-', repr(exc))
        return None
    finally:
        # Blank separator line between candidates.
        print('')




In [143]:
# Tune every candidate and keep the (name, fitted search) pairs.
# main() returns None when a candidate fails to fit; those are skipped so the
# list only holds valid pairs for the ensemble.
estimators = []
for classifier in classifiers:
    fitted = main(classifier)
    if fitted is not None:
        estimators.append(fitted)


In [144]:
# Majority-vote ensemble of the tuned candidates.
# VotingClassifier clones and refits whatever it is given. Passing the
# GridSearchCV objects themselves would repeat every grid search, so the
# already-selected best pipelines are passed instead. Entries that are None
# (a candidate that failed to fit) are skipped.
tuned_estimators = [
    (fitted[0], fitted[1].best_estimator_)
    for fitted in estimators
    if fitted is not None
]
ensemble = VotingClassifier(tuned_estimators, voting='hard')
ensemble.fit(X_train, y_train)
print('Training set score: ' + str(ensemble.score(X_train,y_train)))
print('Test set score: ' + str(ensemble.score(X_test,y_test)))