## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from kaggle, train a machine learning model to predict customer churn.

In [51]:
import pandas as pd

In [52]:
# get training data
train = pd.read_csv("./data/training_data.csv")
# drop customer ID: not a feature for training 
train.drop("customerID", axis=1, inplace=True)

# getting validation data
val = pd.read_csv("./data/validation_data.csv")

In [53]:
train.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5282.0,5282.0,5282.0
mean,0.16471,32.375426,64.893449
std,0.370954,24.595876,30.142527
min,0.0,0.0,18.25
25%,0.0,8.0,35.5625
50%,0.0,29.0,70.35
75%,0.0,56.0,89.9875
max,1.0,72.0,118.75


In [54]:
train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,No,No,5,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,No,Bank transfer (automatic),75.15,392.65,No
1,Male,0,Yes,No,66,Yes,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,No,Electronic check,63.85,4264.6,No
2,Male,0,Yes,Yes,42,Yes,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,No,Electronic check,73.15,3088.25,No
3,Male,0,No,No,19,Yes,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,69.60,1394.55,No
4,Male,0,No,No,59,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),20.20,1192.3,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,Male,0,No,No,1,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),20.20,20.2,No
5278,Male,0,Yes,No,2,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.40,151.8,Yes
5279,Female,0,Yes,No,58,Yes,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,Yes,Electronic check,68.40,3972.25,No
5280,Female,0,No,No,1,Yes,No,Fiber optic,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,75.70,75.7,Yes


In [140]:
train['TotalCharges'] = train['TotalCharges'].replace(['', ' '], 0, inplace=True)
val['TotalCharges'] = val['TotalCharges'].replace(['', ' '], 0, inplace=True)


train['TotalCharges'] = pd.to_numeric(train['TotalCharges'])
val['TotalCharges'] = pd.to_numeric(val['TotalCharges'])

binary_cols = [col for col in train.columns if train[col].dtype not in [int, float]
               and train[col].nunique() == 2]
binary_cols

from sklearn.preprocessing import LabelEncoder

def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe


for col in binary_cols:
    train_new = label_encoder(train, col)
    
def one_hot_encoder(dataframe, categorical_cols, drop_first=True):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

ohe_cols = [col for col in train_new.columns if 30 >= train_new[col].nunique() > 2]
train_new = one_hot_encoder(train_new, ohe_cols)

In [141]:
dataTypeSeries = train_new.dtypes
dataTypeSeries

gender                                     int64
SeniorCitizen                              int64
Partner                                    int64
Dependents                                 int64
tenure                                     int64
PhoneService                               int64
PaperlessBilling                           int64
MonthlyCharges                           float64
TotalCharges                              object
Churn                                      int64
MultipleLines_No phone service             uint8
MultipleLines_Yes                          uint8
InternetService_Fiber optic                uint8
InternetService_No                         uint8
OnlineSecurity_No internet service         uint8
OnlineSecurity_Yes                         uint8
OnlineBackup_No internet service           uint8
OnlineBackup_Yes                           uint8
DeviceProtection_No internet service       uint8
DeviceProtection_Yes                       uint8
TechSupport_No inter

In [142]:
train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,0,0,5,1,No,DSL,No,No,Yes,Yes,Yes,Yes,Month-to-month,0,Bank transfer (automatic),75.15,,0
1,1,0,1,0,66,1,Yes,DSL,No,No,No,Yes,No,Yes,Month-to-month,0,Electronic check,63.85,,0
2,1,0,1,1,42,1,No,DSL,No,No,Yes,Yes,Yes,Yes,One year,0,Electronic check,73.15,,0
3,1,0,0,0,19,1,Yes,DSL,No,No,No,No,Yes,Yes,Month-to-month,0,Mailed check,69.60,,0
4,1,0,0,0,59,1,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,1,Bank transfer (automatic),20.20,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,1,0,0,0,1,1,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,0,Bank transfer (automatic),20.20,,0
5278,1,0,1,0,2,1,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,0,Electronic check,76.40,,1
5279,0,0,1,0,58,1,Yes,DSL,Yes,No,No,Yes,No,Yes,One year,1,Electronic check,68.40,,0
5280,0,0,0,0,1,1,No,Fiber optic,No,Yes,No,No,No,No,Month-to-month,1,Electronic check,75.70,,1


In [143]:
binary_cols = [col for col in val.columns if val[col].dtype not in [int, float]
               and val[col].nunique() == 2]
binary_cols
for col in binary_cols:
    new_val = label_encoder(val, col)
    
ohe_cols = [col for col in new_val.columns if 30 >= new_val[col].nunique() > 2]
new_val = one_hot_encoder(new_val, ohe_cols)

In [144]:
import numpy as np

y_train = train_new['Churn']
X_train = train_new.drop(["Churn"], axis=1)
y_val = new_val['Churn']
X_val = new_val.drop(["Churn"], axis=1)


In [145]:
train_new.dtypes

gender                                     int64
SeniorCitizen                              int64
Partner                                    int64
Dependents                                 int64
tenure                                     int64
PhoneService                               int64
PaperlessBilling                           int64
MonthlyCharges                           float64
TotalCharges                              object
Churn                                      int64
MultipleLines_No phone service             uint8
MultipleLines_Yes                          uint8
InternetService_Fiber optic                uint8
InternetService_No                         uint8
OnlineSecurity_No internet service         uint8
OnlineSecurity_Yes                         uint8
OnlineBackup_No internet service           uint8
OnlineBackup_Yes                           uint8
DeviceProtection_No internet service       uint8
DeviceProtection_Yes                       uint8
TechSupport_No inter

In [146]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

model = GradientBoostingClassifier().fit(X_train,y_train)

y_pred = model.predict(X_val)

Success = accuracy_score(y_val, y_pred)
print(primitiveSuccess)


ValueError: could not convert string to float: ''