In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print them one per line.
kaggle_inputs = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in kaggle_inputs:
    print(input_path)

/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv


In [2]:
# Read the Telco customer-churn CSV into a DataFrame and preview the first rows.
csv_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


**First of all, drop the `customerID` column, as it is of no use for modelling.**

In [3]:
# Rebind instead of mutating in place so this cell can be re-run safely:
# errors='ignore' turns the drop into a no-op once customerID is already gone,
# whereas the in-place version raised KeyError on a second execution.
df = df.drop(columns='customerID', errors='ignore')

In [4]:
# Show the dtype pandas inferred for each remaining column.
df.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

**OBSERVATION** - A quick glance at the dtypes above shows that `TotalCharges` is stored as `object` although it should be a float. Let's check what is going on with this column.



Now I will use the `to_numeric` method to try to convert the object column into numerical data.

In [5]:
# Attempt a strict conversion on purpose: it fails because some TotalCharges
# entries are not numeric. The error is caught and printed so the demonstration
# stays visible without aborting a "Restart & Run All".
try:
    pd.to_numeric(df.TotalCharges)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Unable to parse string " " at position 488

**OBSERVATION** - The error is raised because some values in the **TotalCharges** column are blank strings rather than numbers, which `pandas` cannot convert to numerical values.

In [None]:
# With errors='coerce', values that cannot be parsed as numbers (here the blank
# strings) become NaN instead of raising; every other value is converted.
# The result is only displayed, not assigned, so df itself is unchanged.
pd.to_numeric(df.TotalCharges,errors='coerce')

#### LET'S SAY I WANT TO SEE THE INDEXES OF ROWS WITH NULL VALUES

In [None]:
# Boolean mask: True where TotalCharges could not be parsed as a number.
pd.to_numeric(df.TotalCharges,errors='coerce').isnull()

In [None]:
# Select only the rows whose TotalCharges fails numeric parsing.
df[pd.to_numeric(df.TotalCharges,errors='coerce').isnull()]

In [None]:
# iloc - integer location
# Position 488 is the row reported in the to_numeric parsing error.
df.iloc[488]

In [None]:
# Raw TotalCharges value of that row; the parsing error reported it as the string " ".
df.iloc[488].TotalCharges

In [None]:
# Count all rows and the rows whose TotalCharges is the blank string ' '.
n_rows = len(df)
n_blank = int((df.TotalCharges == ' ').sum())
print(f"Total rows: {n_rows}")
print(f"Rows with null values: {n_blank}")

In [None]:
# Drop the rows whose TotalCharges is a blank string (11 rows per the original note).
# .copy() makes df1 an independent frame: later cells modify df1, and without the
# copy pandas treats those writes as assignments on a slice of df
# (SettingWithCopyWarning, currently hidden by the global warnings filter).
df1 = df[df.TotalCharges!=' '].copy()
print(df1.shape)
df1.head()

In [None]:
# Convert TotalCharges from object (string) to a numeric dtype now that the
# blank entries have been removed.
df1['TotalCharges'] = pd.to_numeric(df1['TotalCharges'])

In [None]:
# Re-check the dtypes after converting TotalCharges.
df1.dtypes

In [None]:
def print_unique_col_values(df):
    """Print the distinct values of every object-dtype column in ``df``.

    One line is printed per matching column, formatted as
    ``<column>: <array of unique values>``. Columns of any other dtype are
    skipped. Nothing is returned.
    """
    object_columns = [name for name in df if df[name].dtypes == 'object']
    for name in object_columns:
        print(f'{name}: {df[name].unique()}')
# List the distinct values of each object-dtype column of the cleaned frame.
print_unique_col_values(df1)

**Some columns contain the values `No internet service` or `No phone service`; these can be replaced with a simple `No`.**

In [None]:
# Collapse both "no service" labels into a plain 'No' across the whole frame.
df1 = df1.replace(['No internet service', 'No phone service'], 'No')

In [None]:
# Print the distinct values again to confirm the "no service" labels are gone.
print_unique_col_values(df1)

#### BINARY ENCODING (YES/NO → 1/0)

In [None]:
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup',
                  'DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
# Assign the encoded column back explicitly. `df1[col].replace(..., inplace=True)`
# operates on a temporary Series; under pandas copy-on-write it never reaches df1.
# astype(int) pins a numeric dtype and fails loudly if any label other than
# Yes/No is still present in the column.
for col in yes_no_columns:
    df1[col] = df1[col].replace({'Yes': 1, 'No': 0}).astype(int)

In [None]:
# Show the distinct values of every column of df1, whatever its dtype.
for name, values in df1.items():
    print(f'{name}: {values.unique()}')

#### One hot encoding for categorical columns

In [None]:
# One-hot encode the remaining multi-class categorical columns. dtype=int keeps
# the indicator columns as 0/1 integers: recent pandas versions default to bool,
# and a frame mixing bool with numeric columns converts to an object array that
# Keras cannot turn into a tensor.
df2 = pd.get_dummies(data=df1, columns=['InternetService','Contract','PaymentMethod','gender'], dtype=int)
df2.columns

In [None]:
# Preview the first five rows of the one-hot encoded frame.
df2.head(5)

In [None]:
# Check the dtype of every column after encoding.
df2.dtypes

In [None]:
# Summary statistics of the numeric columns of the raw frame `df`.
# NOTE(review): TotalCharges is still an object column in `df` (only df1 was
# converted), so it is not part of this summary.
df.describe()

In [None]:
# Names of the columns covered by the numeric summary of the raw frame `df`.
df.describe().columns

In [None]:
# Only the continuous features need rescaling. They are listed explicitly because
# `df.describe()` is computed on the raw frame, where TotalCharges is still an
# object column, so deriving the list from it silently left TotalCharges unscaled.
# SeniorCitizen is omitted: it is already a 0/1 flag, so min-max scaling would
# leave it unchanged.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']
cols_to_scale

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected columns of df2 to the [0, 1] range, overwriting them.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so test-set minima/maxima influence the scaling - confirm this is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df2[cols_to_scale])
df2[cols_to_scale] = scaled_values

In [None]:
# Summary statistics of the encoded frame after scaling.
df2.describe()

In [None]:
# Show the distinct values of every column of the encoded, scaled frame.
for name, values in df2.items():
    print(f'{name}: {values.unique()}')

## Train test split



In [None]:
from sklearn.model_selection import train_test_split

# Target is the Churn column; every other column of df2 is a feature.
y = df2['Churn']
X = df2.drop(columns='Churn')

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=5
)

In [None]:
# Shapes of the training and test feature matrices produced by the split.
X_train.shape, X_test.shape

## BUILDING A MODEL

In [None]:
import tensorflow as tf
from tensorflow import keras

# Fix TensorFlow's global seed so that runs are more repeatable
# (same value as the random_state used for the train/test split).
tf.random.set_seed(5)

# Take the input width from the data instead of hard-coding 27, so the network
# still builds if the set of encoded feature columns changes.
n_features = X_train.shape[1]

# Two hidden ReLU layers followed by a single sigmoid unit.
model = keras.Sequential([
    keras.layers.Dense(26, input_shape=(n_features,), activation='relu'),
    keras.layers.Dense(15, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=100)

In [None]:
# Evaluate the trained model on the held-out test split.
model.evaluate(X_test, y_test)

In [None]:
# Model outputs for the test rows; the final layer is a sigmoid, and a later
# cell thresholds these values at 0.5.
yp = model.predict(X_test)

In [None]:
# Threshold each predicted value at 0.5 to obtain hard 0/1 class labels.
y_pred = [1 if probability > 0.5 else 0 for probability in yp]

In [None]:
from sklearn.metrics import confusion_matrix , classification_report

# Build the text report comparing true and predicted labels, then print it.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import seaborn as sn

# Confusion matrix of true vs. predicted labels.
cm = tf.math.confusion_matrix(labels=y_test, predictions=y_pred)

# Draw it as an annotated heatmap with integer cell labels.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')