In [17]:
import pandas as pd
import numpy as np

# Scaling and Encoding
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Feature selection
from sklearn.feature_selection import SelectFromModel

# Pipiline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Neural Networks
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Accuracy metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# EDA
## Import data

In [18]:
train = pd.read_csv('churn-bigml-80.csv', encoding='utf-8')
test = pd.read_csv('churn-bigml-20.csv', encoding='utf-8')

## Descriptive analysis

In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   667 non-null    object 
 1   Account length          667 non-null    int64  
 2   Area code               667 non-null    int64  
 3   International plan      667 non-null    object 
 4   Voice mail plan         667 non-null    object 
 5   Number vmail messages   667 non-null    int64  
 6   Total day minutes       667 non-null    float64
 7   Total day calls         667 non-null    int64  
 8   Total day charge        667 non-null    float64
 9   Total eve minutes       667 non-null    float64
 10  Total eve calls         667 non-null    int64  
 11  Total eve charge        667 non-null    float64
 12  Total night minutes     667 non-null    float64
 13  Total night calls       667 non-null    int64  
 14  Total night charge      667 non-null    fl

* There is no null values
* For further analysis, categorical columns should be converted into numerical.
    * For column 3,4: Yes=1, No=0
    * For `State`, `Area code` columns: One Hot encoding *(this step will be combined with Scaling later)*
## Cleaning
### Remove outliers
Z-score

In [22]:
def remove_outliers(df, threshold=2):
    numeric_cols = list(df.columns[1:2]) + list(df.columns[5:19])

    for col in numeric_cols:
        col_zscore = (df[col] - df[col].mean()) / df[col].std()
        df = df[(col_zscore < threshold) & (col_zscore > -threshold)]
        
    return df

In [23]:
train = remove_outliers(train)
test = remove_outliers(test)

print(len(train))
print(len(test))

1384
342


In [24]:
train.describe()

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
count,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0,1384.0
mean,99.62789,438.362717,6.17052,180.413367,99.975434,30.670838,199.31185,100.104769,16.941756,202.110332,100.25289,9.095051,10.348699,4.156792,2.794675,1.467486
std,34.404755,43.064061,11.549064,44.582349,17.48366,7.57894,42.083341,17.617898,3.577088,41.757612,17.088801,1.879194,2.164196,1.928396,0.584285,1.139109
min,22.0,408.0,0.0,86.1,61.0,14.64,109.9,61.0,9.34,112.2,62.0,5.05,5.7,1.0,1.54,0.0
25%,74.0,408.0,0.0,146.725,87.75,24.9475,167.275,87.0,14.2175,170.3,88.0,7.66,8.8,3.0,2.38,1.0
50%,99.0,415.0,0.0,179.4,100.0,30.5,200.25,100.0,17.025,202.05,101.0,9.09,10.4,4.0,2.81,1.0
75%,124.0,510.0,0.0,213.425,113.0,36.285,230.4,113.0,19.58,233.7,113.0,10.52,12.0,5.0,3.24,2.0
max,179.0,510.0,35.0,275.2,139.0,46.78,289.3,140.0,24.59,290.0,139.0,13.05,14.9,9.0,4.02,4.0


In [27]:
test.describe()

Unnamed: 0,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls
count,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0,342.0
mean,102.789474,434.865497,0.087719,0.24269,6.657895,181.786842,101.885965,30.904415,204.093275,101.602339,17.347953,198.525731,99.961988,8.933626,10.52924,4.324561,2.843421,1.459064
std,35.651903,41.002675,0.283301,0.429337,12.122417,45.590944,18.036665,7.750406,39.087594,16.452894,3.322582,40.208674,18.032108,1.809633,2.182419,2.062778,0.589443,1.08971
min,25.0,408.0,0.0,0.0,0.0,81.9,62.0,13.92,120.7,63.0,10.26,115.9,60.0,5.22,5.7,1.0,1.54,0.0
25%,79.0,408.0,0.0,0.0,0.0,149.55,88.25,25.4225,174.2,89.0,14.8075,167.825,87.0,7.555,8.9,3.0,2.4,1.0
50%,102.0,415.0,0.0,0.0,0.0,180.3,101.0,30.65,202.9,102.0,17.25,200.2,100.0,9.01,10.6,4.0,2.86,1.0
75%,127.0,415.0,0.0,0.0,0.0,216.075,115.0,36.73,233.675,113.0,19.86,228.55,112.75,10.2875,12.1,6.0,3.27,2.0
max,183.0,510.0,1.0,1.0,36.0,281.4,141.0,47.84,287.7,138.0,24.45,283.2,139.0,12.74,15.2,9.0,4.1,4.0


### Converting data types
For column 3,4: Yes=1, No=0

In [25]:
def replace_y_n(df, column_name):
    if len(df[column_name].unique()) == 2: # to ensure there is no mispelling word
        df[column_name].replace({'Yes':1, 'No':0}, inplace=True)
        print(df[column_name].unique())

In [26]:
replace_y_n(train, 'International plan')
replace_y_n(test, 'International plan')
replace_y_n(train,'Voice mail plan')
replace_y_n(test, 'Voice mail plan')

[0 1]
[0 1]
[1 0]
[0 1]


## Prepare variables

In [28]:
X_train = train.drop('Churn', axis=1)
X_test = test.drop('Churn', axis=1)

y_train = train['Churn']
y_test = test['Churn']

# Preprocessing, Feature Selection & Main model
Here, I built a pipeline that contains:
* Preprocessing for numerical data: Standard scaling
* Preprocessing for categorical data: One-hot encoding
* Feature Selection
* Classifition model

## Preprocessing
Scaling and Encoding

In [29]:
# Define numerical and categorical columns
list_of_num_cols = list(train.columns[1:2]) + list(train.columns[5:19])
list_of_cat_cols = list(train.columns[0:1]) + list(train.columns[5:6])

In [30]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers= [('num', numerical_transformer, list_of_num_cols)
                                                ,('cat', categorical_transformer, list_of_cat_cols)
                                                ]
                                )

## Feature Selection & Main model
As there are several estimators in `SelectFromModel`, I'd consider all of those
* Logistic Regression
* SVM
* Decision Tree
* Random Forest
* Gradient Boosting

In [31]:
def models(feature_selection, classifier):
    # define Feature Selection and Classifier
    feature_selector = SelectFromModel(feature_selection)
    model = classifier
    
    
    # Combine into Pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selection', feature_selector),
                           ('model', model)])
    
    
    # Train model
    pipeline.fit(X_train,y_train)
    
        
    # Accuracy checks
    y_pred = pipeline.predict(X_test)

    print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
    print(f'Precision: {precision_score(y_test, y_pred):.2f}')
    print(f'Recall: {recall_score(y_test, y_pred):.2f}')
    print(f'F1 Score: {f1_score(y_test, y_pred):.2f}')
    print('---')

In [32]:
list_of_models = [LogisticRegression()
                  , LinearSVC()
                  , DecisionTreeClassifier()
                  , RandomForestClassifier()
                  , GradientBoostingClassifier()
                  ]

In [33]:
for classifier in list_of_models:
    for feature_selector in list_of_models:
        models(feature_selection=feature_selector, classifier=classifier)

Accuracy: 0.90
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---
Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.91
Precision: 0.67
Recall: 0.06
F1 Score: 0.11
---
Accuracy: 0.91
Precision: 0.50
Recall: 0.03
F1 Score: 0.06
---
Accuracy: 0.91
Precision: 0.50
Recall: 0.03
F1 Score: 0.06
---
Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---
Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---
Accuracy: 0.88
Precision: 0.25
Recall: 0.16
F1 Score: 0.19
---
Accuracy: 0.91
Precision: 0.00
Recall: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


F1 Score: 0.00
---
Accuracy: 0.86
Precision: 0.29
Recall: 0.31
F1 Score: 0.30
---
Accuracy: 0.87
Precision: 0.29
Recall: 0.28
F1 Score: 0.29
---
Accuracy: 0.86
Precision: 0.28
Recall: 0.31
F1 Score: 0.29
---
Accuracy: 0.87
Precision: 0.21
Recall: 0.12
F1 Score: 0.16
---




Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.93
Precision: 0.89
Recall: 0.25
F1 Score: 0.39
---
Accuracy: 0.92
Precision: 0.88
Recall: 0.22
F1 Score: 0.35
---
Accuracy: 0.92
Precision: 0.88
Recall: 0.22
F1 Score: 0.35
---
Accuracy: 0.90
Precision: 0.33
Recall: 0.03
F1 Score: 0.06
---




Accuracy: 0.91
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
---


  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.92
Precision: 0.60
Recall: 0.28
F1 Score: 0.38
---
Accuracy: 0.91
Precision: 0.57
Recall: 0.25
F1 Score: 0.35
---
Accuracy: 0.91
Precision: 0.57
Recall: 0.25
F1 Score: 0.35
---
