In [307]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
#!pip install seaborn
import seaborn as sns
import pickle

In [308]:
# Load the dataset
data = pd.read_csv('Churn_Modelling.csv')

In [309]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [310]:
#preprocessing the data
data = data.drop(["RowNumber","CustomerId",	"Surname"],axis=1)

In [311]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [312]:
# Encoding categorical variables
gender_label_encoder = LabelEncoder()
data["Gender"] = gender_label_encoder.fit_transform(data["Gender"])
data.Gender.value_counts()


Gender
1    5457
0    4543
Name: count, dtype: int64

In [313]:
# Encoding categorical variables for Geography
from sklearn.preprocessing import OneHotEncoder
geo_one_hot_encoder = OneHotEncoder()
geo_encoded = geo_one_hot_encoder.fit_transform(data[["Geography"]])


In [314]:
geo_one_hot_encoder.get_feature_names_out(["Geography"])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [315]:
Encode_df = pd.DataFrame(geo_encoded.toarray(),columns=geo_one_hot_encoder.get_feature_names_out(["Geography"]))

In [316]:
Encode_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [317]:
data = pd.concat([data.drop("Geography",axis=1),Encode_df],axis=1)

In [318]:
data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [319]:
#save the encoders 
with open('label_encoder_gender.pkl', 'wb') as f:
    pickle.dump(gender_label_encoder, f)
with open('one_hot_encoder_geo.pkl', 'wb') as f:
    pickle.dump(geo_one_hot_encoder, f)

In [320]:
#indepemdent and dependent variables
X = data.drop("Exited",axis=1)
y = data["Exited"]
print(X.shape)
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X

(10000, 12)


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,0.0,1.0,0.0


In [321]:
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

ANN IMPLEMENTATIOIN

In [322]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
import datetime


In [323]:
model = Sequential([Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
                    Dense(32, activation='relu'),
                    Dense(1,activation='sigmoid')])

In [324]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_21 (Dense)            (None, 64)                832       
                                                                 
 dense_22 (Dense)            (None, 32)                2080      
                                                                 
 dense_23 (Dense)            (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [325]:
#optimization
import tensorflow
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.001)
lossfn = tensorflow.keras.losses.BinaryCrossentropy()
model.compile(optimizer=opt,loss=lossfn,metrics=["accuracy"])


In [326]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)



In [327]:
EarlyStopping_callback = EarlyStopping(monitor="val_loss",patience=5,restore_best_weights=True)

In [328]:
#training the model
history  = model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,callbacks=[tensorboard_callback,EarlyStopping_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [329]:
model.save('model.h5')

  saving_api.save_model(


In [330]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [331]:
%tensorboard --logdir logs/fit/20250814-112533

Reusing TensorBoard on port 6006 (pid 21008), started 3:05:41 ago. (Use '!kill 21008' to kill it.)

In [332]:
# load  the model and scalar and pickle files
from tensorflow.keras.models import load_model
model = load_model("model.h5")
with open("label_encoder_gender.pkl","rb") as f :
    label_encoder_gen = pickle.load(f)
with open("one_hot_encoder_geo.pkl","rb") as f :
    one_hot_encoder_geo = pickle.load(f)
with open("scaler.pkl","rb") as f :
    scaler = pickle.load(f)

In [333]:
whole_dataset_input_data = pd.read_csv("Churn_Modelling.csv")
input_data= whole_dataset_input_data.iloc[1:2,]

In [334]:
input_data.shape

(1, 14)

In [335]:
input_data["Gender"] = label_encoder_gen.transform(input_data["Gender"])
input_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_data["Gender"] = label_encoder_gen.transform(input_data["Gender"])


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,2,15647311,Hill,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0


In [336]:
one_hot_df = one_hot_encoder_geo.transform([input_data["Geography"]])



In [337]:
one_hot_df.toarray()


array([[0., 0., 1.]])

In [338]:
one_hot_encoder_geo .get_feature_names_out(["Geography"])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [339]:
onehot_df = pd.DataFrame(one_hot_df.toarray(),columns=one_hot_encoder_geo .get_feature_names_out(["Geography"]))
onehot_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,0.0,0.0,1.0


In [340]:
import pandas as pd
edited_input_data = pd.concat([input_data.drop(columns=["Geography"]
).reset_index(),onehot_df],axis=1)

In [341]:
edited_input_data = edited_input_data.iloc[:,4:]
edited_input_data.drop(columns=["Exited"],inplace=True)
edited_input_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,608,0,41,1,83807.86,1,0,1,112542.58,0.0,0.0,1.0


In [None]:
scaled_data = scaler.transform(edited_input_data)

In [343]:
pred = model.predict(scaled_data)



In [344]:
if pred[0][0] > 0.5 :
    print("churn")
    print(pred)
else :
    print("not a churn")
    print(pred)


not a churn
[[0.21484663]]
