## Salary Regression Model

#### Data Preprocessing 

In [2]:
## Importing necessary libraries 
import pandas as pd 
import numpy as np
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder , OneHotEncoder




In [3]:
## Loading our Dataset 
df = pd.read_csv("Churn_Modelling.csv")
df.head(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [4]:
## dropping unnecessary columns
df.drop(["RowNumber","CustomerId","Surname"],axis=1,inplace=True)
df.head(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [5]:
## Encoding Columns : Converting Categorical Columns into numerical columns 

## converting "Gender" column
lab_encoder = LabelEncoder()
df["Gender"] = lab_encoder.fit_transform(df["Gender"])
df["Gender"]

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    0
9998    1
9999    0
Name: Gender, Length: 10000, dtype: int32

In [6]:
## encoding "Geography" column 
ohe_encoder = OneHotEncoder()
geo_encoder = ohe_encoder.fit_transform(df[["Geography"]])
geo_encoder

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [7]:
## converting it to dataframe
geo_df = pd.DataFrame(geo_encoder.toarray(),columns=ohe_encoder.get_feature_names_out())
geo_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [8]:
## merging above geo_df with our original data 
df = pd.concat([geo_df,df.drop("Geography",axis=1)],axis=1)
df.head(10)

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58,0
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1,0
5,0.0,0.0,1.0,645,1,44,8,113755.78,2,1,0,149756.71,1
6,1.0,0.0,0.0,822,1,50,7,0.0,2,1,1,10062.8,0
7,0.0,1.0,0.0,376,0,29,4,115046.74,4,1,0,119346.88,1
8,1.0,0.0,0.0,501,1,44,4,142051.07,2,0,1,74940.5,0
9,1.0,0.0,0.0,684,1,27,2,134603.88,1,1,1,71725.73,0


In [9]:
## Data is now completely cleaned so now we can save it
import pickle 
with open("reg_lab_encoder.pkl","wb") as file :
    pickle.dump(lab_encoder,file)

with open("reg_ohe_encoder.pkl","wb") as file :
    pickle.dump(ohe_encoder,file)

In [10]:
## Splitting the data into Input and Output Features 
X = df.drop("EstimatedSalary",axis=1) ## Input 
y = df["EstimatedSalary"] ## Output 

In [11]:
## Applying train test split 
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
## Applying standardization 
reg_scaler = StandardScaler()
X_train_scaled = reg_scaler.fit_transform(X_train)
X_test_scaled =  reg_scaler.transform(X_test)

In [13]:
## saving the scalar file 
with open("reg_scaler.pkl","wb") as file :
    pickle.dump(reg_scaler,file)

#### Model Training and Model Building 

In [38]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from tensorflow.keras.callbacks import EarlyStopping , TensorBoard
import datetime 

In [28]:
(X_train_scaled.shape[1],)

(12,)

In [None]:
## ANN Model
model = Sequential(
    [
        Dense(64,activation="relu",input_shape=(X_train_scaled.shape[1],)), ## HL 1
        Dense(32,activation="relu"), ## HL 2 
        Dense(1,activation="linear") ## Output Layer (linear is not mandatory ( default ))
    ]
)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_29 (Dense)            (None, 64)                832       
                                                                 
 dense_30 (Dense)            (None, 32)                2080      
                                                                 
 dense_31 (Dense)            (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:
## Compiling our model 
model.compile(optimizer="adam",loss="mean_absolute_error",metrics=["mae"])




In [41]:
## Setting up TensorBoard and Log Directory
log_directory = "reg_logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_directory,histogram_freq=1)

earlystopping_callback = EarlyStopping(monitor="val_loss",patience=10,restore_best_weights=True)


In [42]:
## fitting the model 
history = model.fit(
    X_train,y_train,validation_data = (X_test,y_test) , epochs = 100 , callbacks = [earlystopping_callback,tensorflow_callback]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


In [44]:
## model is trained , now we are saving it 
model.save("reg_model.h5")

  saving_api.save_model(


In [47]:
## Load TensorBoard Extension using magic functions
%load_ext tensorboard

In [50]:
%tensorboard --logdir reg_logs/fit/

Reusing TensorBoard on port 6006 (pid 15884), started 0:00:12 ago. (Use '!kill 15884' to kill it.)

In [None]:
## Evaluating the model on test data 
test_loss,test_mae=model.evaluate(X_test,y_test)
print("Test data MAE score = " , test_mae)

Test data MAE score =  50307.78125


In [52]:
## if MAE more near to zero means our model is trained nicely 

#### Prediction 

In [4]:
## Loading the saved files 
import pickle
import pandas as pd
from tensorflow.keras.models import load_model 
model = load_model("reg_model.h5")

with open("reg_lab_encoder.pkl","rb") as file :
    reg_lab_encoder = pickle.load(file)
with open("reg_ohe_encoder.pkl","rb") as file :
    reg_ohe_encoder = pickle.load(file)
with open("reg_scaler.pkl","rb") as file :
    reg_scaler = pickle.load(file)

In [24]:
## Sample input data
input_data = {
    'Geography_France':1.0,
    'Geography_Germany':0.0,
    'Geography_Spain':0.0,
    'CreditScore': 600,
    'Gender': 0 ,
    'Age': 40,
    'Tenure': 3,
    'Balance': 100000,
    'NumOfProducts': 2,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'Exited': 1
}

## converting to dataframe
input_df = pd.DataFrame([input_data])
input_df

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,1.0,0.0,0.0,600,0,40,3,100000,2,1,1,1


In [25]:
input_df_scaled = reg_scaler.transform(input_df)
input_df_scaled

array([[ 1.00150113, -0.57946723, -0.57638802, -0.53598516, -1.09499335,
         0.10479359, -0.69539349,  0.38262839,  0.80843615,  0.64920267,
         0.97481699,  1.96626042]])

In [26]:
prediction = model.predict(input_df_scaled)
prediction



array([[568.04364]], dtype=float32)

In [27]:
predicted_salary = prediction[0][0]
print(f'Predicted Estimated Salary = $ {predicted_salary:.2f}')

Predicted Estimated Salary = $ 568.04


In [None]:
## we can also hyperparameter tune our model using grid search CV 
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import LabelEncoder , OneHotEncoder , StandardScaler
from sklearn.pipeline import Pipeline 
from scikeras.wrappers import KerasClassifier 
import tensorflow 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense 
from tensorflow.keras.callbacks import EarlyStopping , TensorBoard
import pickle


## Defining a function to create model that tries different parameters (Keras Classifier)
def model_create(neurons=32, layers=1):
    model = Sequential()
    model.add(Dense(neurons, activation="relu", input_shape=(X_train.shape[1],))) ## HL1 (default)

    for _ in range(layers - 1):
        model.add(Dense(neurons, activation="relu")) ## for creating specified no. of hidden layers

    model.add(Dense(1, activation="linear"))

    model.compile(optimizer="adam", loss="mean_absolute_error", metrics=["mae"])
    return model

## Creating our model : Keras Classifier 
model = KerasClassifier(model=model_create, neurons=32, layers=1, verbose=0)

## Performing GridSearchCV
## Defining the parameters for performing Grid Search CV
grid_params = {
    "neurons": [16, 32, 64, 128],
    "layers": [1, 2],
    "epochs": [50, 100],
    "batch_size": [10]
}

grid = GridSearchCV(estimator=model, param_grid=grid_params, n_jobs=-1, cv=3, verbose=1)
grid_result = grid.fit(X_train, y_train)

print(f"Best Score: {grid_result.best_score_}")
print(f"Best Params: {grid_result.best_params_}")

## It will return us the best parameters for training our ANN Model
