# Import libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif,mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Import dataset

In [2]:
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


There is no missing values in our dataset

In [5]:
list1 = df.columns.to_list()
list1

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [6]:
list1.remove('customerID')

In [7]:
for i in list1:
    value = df[i].unique()
    print(i ,':', value)

gender : ['Female' 'Male']
SeniorCitizen : [0 1]
Partner : ['Yes' 'No']
Dependents : ['No' 'Yes']
tenure : [ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
PhoneService : ['No' 'Yes']
MultipleLines : ['No phone service' 'No' 'Yes']
InternetService : ['DSL' 'Fiber optic' 'No']
OnlineSecurity : ['No' 'Yes' 'No internet service']
OnlineBackup : ['Yes' 'No' 'No internet service']
DeviceProtection : ['No' 'Yes' 'No internet service']
TechSupport : ['No' 'Yes' 'No internet service']
StreamingTV : ['No' 'Yes' 'No internet service']
StreamingMovies : ['No' 'Yes' 'No internet service']
Contract : ['Month-to-month' 'One year' 'Two year']
PaperlessBilling : ['Yes' 'No']
PaymentMethod : ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
MonthlyCharges : [29.85 56.95 53.85 ... 63.1  44.2

In [8]:
df['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [9]:
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [10]:
#Data distribution :
No = ((5174)/(5174+1869))*100 
Yes = ((1869)/(5174+1869))*100 
print('Yes class distribution',Yes,'%')
print('No class distribution',No,'%')

Yes class distribution 26.536987079369588 %
No class distribution 73.4630129206304 %


Our dataset is balanced

In [11]:
df.loc[df['TotalCharges'] == ' ', 'TotalCharges'] = 0

# Converting Categorical columns to numerical:

In [12]:
df['TotalCharges'] = df['TotalCharges'].astype(float, errors='raise')

In [13]:
object_cols = df.select_dtypes(include='object').columns.tolist()
#object_cols.remove('customerID')

In [14]:
encoder = LabelEncoder()

# Dictionary to store encoders for each column
encoders = {}
# Encode each categorical column
for col in object_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Decode each encoded column
# for col, encoder in encoders.items():
#     df[f'{col}_Decoded'] = encoder.inverse_transform(df[col])   

In [15]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5375,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,2,29.85,29.85,0
1,3962,1,0,0,0,34,1,0,0,2,...,2,0,0,0,1,0,3,56.95,1889.5,0
2,2564,1,0,0,0,2,1,0,0,2,...,0,0,0,0,0,1,3,53.85,108.15,1
3,5535,1,0,0,0,45,0,1,0,2,...,2,2,0,0,1,0,0,42.3,1840.75,0
4,6511,0,0,0,0,2,1,0,1,0,...,0,0,0,0,0,1,2,70.7,151.65,1


# Minmax scaling:

In [16]:
minmax_col = df.columns.to_list()

In [17]:
minmax_col.remove('customerID')
minmax_col.remove('Churn')

In [18]:
scaler = MinMaxScaler()

In [19]:
df[minmax_col] = scaler.fit_transform(df[minmax_col])

In [20]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,5375,0.0,0.0,1.0,0.0,0.013889,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.666667,0.115423,0.003437,0
1,3962,1.0,0.0,0.0,0.0,0.472222,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.5,0.0,1.0,0.385075,0.217564,0
2,2564,1.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.354229,0.012453,1
3,5535,1.0,0.0,0.0,0.0,0.625,0.0,0.5,0.0,1.0,...,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.239303,0.211951,0
4,6511,0.0,0.0,0.0,0.0,0.027778,1.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.666667,0.521891,0.017462,1


# Train test split

In [21]:
X = df.drop(columns=['Churn', 'customerID'])
y = df['Churn'].to_frame()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=5)

# Model:

# ANN:

In [23]:
model_ann = Sequential()
model_ann.add(Dense(64, activation="relu", input_shape=(X_train.shape[1],)))
model_ann.add(Dense(32, activation="relu"))
model_ann.add(Dense(1, activation="sigmoid"))

# Compile the model
model_ann.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model_ann.fit(X_train, y_train, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1f3a4e38cd0>

In [24]:
model_ann.evaluate(X_test,y_test)



[0.46982887387275696, 0.7849538922309875]

# Deep FNN:

# Hyperparameter tuning  Deep Neural network:

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from kerastuner import HyperModel

class DNNHyperModel(HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        model = Sequential()
        model.add(Dense(units=hp.Int('units', min_value=32, max_value=128, step=16),
                        activation='relu', input_shape=self.input_shape))
        
        for i in range(hp.Int('num_layers', 1, 3)):
            model.add(Dense(units=hp.Int(f'layer_{i}_units', min_value=32, max_value=128, step=16),
                            activation='relu'))
            model.add(Dropout(rate=hp.Float(f'layer_{i}_dropout', min_value=0.1, max_value=0.5, step=0.1)))
        
        model.add(Dense(1, activation='sigmoid'))

        optimizer = tf.keras.optimizers.Adam(
            lr=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='log')
        )

        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        return model


In [26]:
from kerastuner.tuners import RandomSearch

input_shape = (19,)  # Replace with the number of features in your dataset
hypermodel = DNNHyperModel(input_shape)

tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=30,
    seed=42,
    directory='output',
    project_name='ann_tuner'
)

tuner.search(X_train, y_train, epochs=50, validation_data=(X_test, y_test))


INFO:tensorflow:Reloading Tuner from output\ann_tuner\tuner0.json
INFO:tensorflow:Oracle triggered exit


In [27]:
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model
best_model = hypermodel.build(best_hyperparameters)

# Train the best model with the full dataset
best_model.fit(X_train, y_train, epochs=50)




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1f3a78d9370>

In [28]:
best_model.evaluate(X_test,y_test)



[0.4549309313297272, 0.7920510768890381]

In [29]:
best_model.save('model.h5')

In [32]:
input_data = [ 'Female', 0, 'No', 'No', 1, 'Yes', 'No phone service', 'DSL', 'No','Yes',
    'No', 'No', 'No', 'No', 'Month-to-month', 'Yes', 'Electronic check', 50, 100
]

In [56]:
prediction = best_model.predict([values_list])
prediction



array([[0.58214253]], dtype=float32)

In [55]:
# Accessing the first row of the DataFrame
row = X_test.iloc[1]

# Converting the row to a NumPy array
values_np = row.values

# Converting the row to a list
values_list = row.tolist()
values_list
y_test.iloc[1]

Churn    0
Name: 5035, dtype: int32