In [62]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import seaborn as sns

In [63]:
import matplotlib.pyplot as plt

In [64]:
# Load the Social Network Ads dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your local copy of the CSV before running the notebook on another machine.
csv_path = r"D:\Sem-6\DSBDA\lab 05\Social_Network_Ads.csv"
df = pd.read_csv(csv_path)

In [65]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(400, 5)

In [66]:
# Column labels of the loaded frame.
df.columns

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [67]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [68]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [69]:
# Frequency of each Gender label before the column is encoded.
df["Gender"].value_counts()

Gender
Female    204
Male      196
Name: count, dtype: int64

In [70]:
def gender_encoder(value):
    """Map a gender label to an integer code.

    Returns 1 for "Male", 0 for "Female", and -1 for any other value.
    """
    if value == "Male":
        return 1
    if value == "Female":
        return 0
    # Sentinel for any value that is neither of the two known labels.
    return -1

In [71]:
# Encode the Gender column as integers using gender_encoder.
# Guarded so the cell is idempotent: once the column is numeric, re-running
# would send the already-encoded 0/1 values through the encoder's fallback
# branch and silently turn every row into -1.
if not pd.api.types.is_numeric_dtype(df["Gender"]):
    df["Gender"] = df["Gender"].apply(gender_encoder)

In [72]:
# Class balance of the target column.
df["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

# Data preprocessing

In [74]:
# Number of missing values in each column.
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [75]:
# Features: only Age and EstimatedSalary are used as model inputs.
feature_columns = ["Age", "EstimatedSalary"]
x = df[feature_columns]

# Target: the Purchased column (0 / 1).
y = df["Purchased"]

In [76]:
# Display the feature frame (Age and EstimatedSalary).
x

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [77]:
# Display the target series (Purchased).
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

# Splitting data for training and testing

In [78]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

# Standardizing Data

In [79]:
# Scaler instance; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [80]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics leak into training.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling

In [81]:
# Fit a logistic-regression classifier on the standardised training features.
model = LogisticRegression(random_state=0)
model.fit(X=X_train_scaled, y=y_train)

In [82]:
# Predict class labels for the standardised test features.
y_pred = model.predict(X=X_test_scaled)

# Evaluation

In [83]:
# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [84]:
# Display the confusion matrix.
cm

array([[65,  3],
       [ 8, 24]], dtype=int64)

In [86]:
# Unpack the 2x2 confusion matrix. For binary labels scikit-learn lays it out
# as [[TN, FP], [FN, TP]], so ravel() yields the counts in exactly this order.
TN, FP, FN, TP = cm.ravel()

# Metrics as computed by scikit-learn on the test split.
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("confusion Matrix:")
print(cm)
print("True Positives (TP):", TP)
print("False Positives (FP):", FP)
print("True Negative (TN):", TN)
# Fixed label: this line reports the false-negative count, so it reads "(FN)"
# rather than the earlier, incorrect "(TN)".
print("False Negative (FN):", FN)
print("Accuracy:", accuracy)
print("Error Rate:", error_rate)
print("Precision:", precision)
print("Recall(Sensitivity):", recall)
print("F1 Score:", f1)

confusion Matrix:
[[65  3]
 [ 8 24]]
True Positives (TP): 24
False Positives (FP): 3
True Negative (TN): 65
False Negative (TN): 8
Accuracy: 0.89
Error Rate: 0.10999999999999999
Precision: 0.8888888888888888
Recall(Sensitivity): 0.75
F1 Score: 0.8135593220338982


In [87]:
# M2: recompute the metrics by hand from the confusion-matrix counts.
total = TP + TN + FP + FN

# Accuracy: correct predictions over all predictions.
accuracy = (TP + TN) / total

# Error rate: incorrect predictions over all predictions.
error_rate = (FP + FN) / total

# Precision: true positives over everything predicted positive.
precision = TP / (TP + FP)

# Recall (sensitivity): true positives over everything actually positive.
recall = TP / (TP + FN)

In [88]:
# Accuracy from the manual (M2) calculation.
accuracy

0.89

In [89]:
# Error rate from the manual (M2) calculation.
error_rate

0.11

In [90]:
# Precision from the manual (M2) calculation.
precision

0.8888888888888888

In [91]:
# Recall from the manual (M2) calculation.
recall

0.75