# Ad Viewers Classification

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from scipy.stats import skew, kurtosis
import seaborn as sns
import matplotlib.pyplot as plt

### Create DataFrame

In [None]:
# Build a reproducible synthetic ad-viewer dataset.
np.random.seed(42)
n = 10000

# Raw per-visitor attributes. The order of the random draws is significant:
# changing it changes every value generated after it.
age = np.random.randint(18, 60, size=n)  # upper bound is exclusive
gender = np.random.choice([0, 1], size=n)
estimated_salary = np.random.normal(50_000_000, 30_000_000, size=n).clip(5_000_000, 200_000_000)
time_on_page = np.random.normal(60, 30, size=n).clip(5, 300)
ads_viewed = np.random.poisson(4, size=n).clip(0, 20)
# Click probability grows with the number of ads seen (ads_viewed is capped at 20).
clicked_ad = np.random.binomial(1, p=ads_viewed / 20)
# Registration probability is 0.9 after a click and 0.2 otherwise.
registered = np.random.binomial(1, p=(clicked_ad * 0.7 + 0.2))

# Purchase probability is an additive score of the behavioural signals.
purchase_prob = (
    0.2 * (clicked_ad) +
    0.3 * (registered) +
    0.1 * (time_on_page > 60).astype(int) +
    0.1 * (ads_viewed > 5).astype(int) +
    0.05 * (estimated_salary > 50_000_000).astype(int)
)
purchase_prob = np.clip(purchase_prob, 0, 1)
purchased = np.random.binomial(1, p=purchase_prob)

# Age, Gender and EstimatedSalary receive NaN below, so they are stored as
# float from the start. Writing NaN into an int64 column relies on implicit
# upcasting, which pandas has deprecated. The salary is still truncated to a
# whole number first so the values match the integer representation.
df = pd.DataFrame({
    'Age': age.astype(float),
    'Gender': gender.astype(float),
    'EstimatedSalary': estimated_salary.astype(int).astype(float),
    'TimeOnPage': time_on_page.round(1),
    'AdsViewed': ads_viewed,
    'ClickedAd': clicked_ad,
    'Registered': registered,
    'Purchased': purchased
})

# Inject missing values into random subsets of rows (5%, 3% and 2%).
df.loc[df.sample(frac=0.05).index, 'Age'] = np.nan
df.loc[df.sample(frac=0.03).index, 'EstimatedSalary'] = np.nan
df.loc[df.sample(frac=0.02).index, 'Gender'] = np.nan


df

### Processing

In [None]:
# Summarise the raw frame (column dtypes and non-null counts) before cleaning.
df.info()

In [None]:
# Share of missing values in each column, expressed as a percentage.
missing_pct = df.isnull().mean() * 100
print(missing_pct)

In [None]:
# Drop every row that has at least one missing value; `df` is modified in place.
df.dropna(inplace=True)

In [None]:
# Descriptive statistics of the columns after the rows with missing values were dropped.
df.describe()

In [None]:
# --- Derived features -------------------------------------------------------
# These are computed first, while ClickedAd / Registered are still numeric.
df['Clicked&Registed'] = df['ClickedAd'] & df['Registered']
df['AdPerMin'] = df['AdsViewed'] / df['TimeOnPage']

age_edges = [17, 25, 35, 50, 100]
age_names = ['Young', 'Adult', 'Middle-aged', 'Elderly']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_edges, labels=age_names)

salary_edges = [0, 20000000, 50000000, 100000000, np.inf]
salary_names = ['Low', 'Middle', 'High', 'Very High']
df['SalaryRange'] = pd.cut(df['EstimatedSalary'], bins=salary_edges, labels=salary_names)

# Time on page, doubled for a click and doubled again for a registration.
click_factor = df['ClickedAd'] + 1
register_factor = df['Registered'] + 1
df['EngagementScore'] = df['TimeOnPage'] * click_factor * register_factor

# --- Categorical typing and readable labels ----------------------------------
categorical = ['Gender', 'ClickedAd', 'Registered', 'Clicked&Registed', 'AgeGroup', 'SalaryRange']

for cat_col in categorical + ['Purchased']:
    df[cat_col] = df[cat_col].astype('category')

yes_no = {0: 'No', 1: 'Yes'}
label_maps = {
    'Gender': {0: 'Male', 1: 'Female'},
    'ClickedAd': yes_no,
    'Registered': yes_no,
    'Clicked&Registed': yes_no,
}
for cat_col, mapping in label_maps.items():
    df[cat_col] = df[cat_col].map(mapping)


In [None]:
# Re-check dtypes and non-null counts now that the engineered columns exist.
df.info()

In [None]:
# Continuous / count features that are plotted, transformed and scaled.
numerical = [
    'Age',
    'EstimatedSalary',
    'TimeOnPage',
    'AdsViewed',
    'AdPerMin',
    'EngagementScore',
]

In [None]:
# Histogram (with KDE) of every numerical feature, split by purchase outcome.
# Skewness and kurtosis of each feature are recorded in `num_stats` on the way.
num_stats = {}
fig, axes = plt.subplots(2, 3, figsize=(24, 14))
for ax, col_name in zip(axes.flat, numerical):
    values = df[col_name]
    num_stats[col_name] = (skew(values), kurtosis(values))
    # AdsViewed gets a coarser histogram than the other features.
    n_bins = 13 if col_name == 'AdsViewed' else 20
    sns.histplot(df, x=values, kde=True, bins=n_bins, hue='Purchased', palette='muted', ax=ax)
    ax.set_title(f'Purchased based on {col_name}', fontsize=16)
    ax.set_xlabel(f'{col_name}', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Count plot of every categorical feature, split by purchase outcome.
fig, axes = plt.subplots(2, 3, figsize=(24, 14))
for ax, col_name in zip(axes.flat, categorical):
    sns.countplot(data=df, x=df[col_name], hue='Purchased', palette='deep', ax=ax)
    ax.set_title(f'Purchased based on {col_name}', fontsize=16)
    ax.set_xlabel(f'{col_name}', fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
plt.tight_layout()
plt.show()
    

In [None]:
def remove_outliers(df, fea, factor=1.5):
    """Return the rows of ``df`` whose ``fea`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[fea]``.

    Args:
        df: Source DataFrame; it is not modified.
        fea: Name of the numeric column to filter on.
        factor: Multiplier applied to the IQR when placing the fences.
            The default of 1.5 reproduces the classic Tukey rule.

    Returns:
        A new, independent DataFrame holding only the rows inside the fences.
        Rows where ``fea`` is NaN are dropped as well, because NaN fails
        both comparisons.
    """
    q1 = df[fea].quantile(0.25)
    q3 = df[fea].quantile(0.75)
    iqr = q3 - q1

    lower = q1 - factor * iqr
    upper = q3 + factor * iqr

    # Return an explicit copy so that callers can assign columns on the
    # result without pandas treating it as a slice of the original frame.
    return df[df[fea].between(lower, upper)].copy()

# Tame skewed / heavy-tailed numerical features.
# The skewness and kurtosis come from `num_stats`, i.e. they were measured on
# the untransformed data and are not recomputed after a transform.
for fea in numerical:
    sk, kur = num_stats[fea]
    abs_sk = abs(sk)

    # The stronger the skew, the stronger the compressing transform.
    if abs_sk > 5:
        df[fea] = np.log1p(df[fea])
    elif abs_sk > 3:
        df[fea] = df[fea] ** 0.3
    elif abs_sk > 1:
        # Plain column assignment (as in the other branches) lets an integer
        # column become float instead of forcing an in-place dtype-preserving set.
        df[fea] = np.sqrt(df[fea])

    if kur > 5:
        # Copy so later column assignments act on an independent frame
        # rather than on a filtered slice.
        df = remove_outliers(df, fea).copy()
    elif kur < -5:
        # StandardScaler expects 2-D input, so pass a one-column frame and
        # flatten the result back to a column.
        # NOTE(review): scipy's default (excess) kurtosis cannot drop below -2,
        # so this branch looks unreachable - confirm the intended threshold.
        df[fea] = StandardScaler().fit_transform(df[[fea]]).ravel()

In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Attributes are set by ``fit``: ``w`` (weight vector), ``b`` (bias) and
    ``loss_list`` (cross-entropy recorded at every epoch of the latest fit).
    """

    def __init__(self, learningrate=0.01, epochs=1000):
        """Store the hyper-parameters; the model is untrained until ``fit``.

        Args:
            learningrate: Step size of each gradient-descent update.
            epochs: Number of full passes over the training data.
        """
        self.lr = learningrate
        self.epochs = epochs
        self.w = None
        self.b = None
        self.loss_list = []

    def initial(self, n_features: int):
        """Reset the weights to zeros of length ``n_features`` and the bias to 0."""
        self.w = np.zeros(n_features)
        self.b = 0.0

    def sigmoid(self, x):
        """Return the logistic function of ``x``, clipped to avoid overflow in ``exp``."""
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))

    def get_loss_list(self):
        """Return the per-epoch cross-entropy losses of the most recent ``fit``."""
        return self.loss_list

    def fit(self, X, Y):
        """Train the model from scratch on ``X`` and binary targets ``Y``.

        Args:
            X: Array-like of shape (n_samples, n_features). A 1-D input is
                treated as a single feature column.
            Y: Array-like of 0/1 targets with n_samples entries.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector target cannot broadcast to (n, n) below.
        Y = np.asarray(Y, dtype=float).ravel()

        if X.ndim == 1:
            # reshape (not resize): ndarray.resize rejects a -1 dimension.
            X = X.reshape(-1, 1)

        len_val, len_fea = X.shape
        self.initial(len_fea)
        # Weights restart from zero, so the loss history restarts as well.
        self.loss_list = []

        for _ in range(self.epochs):
            # Keep probabilities strictly inside (0, 1) so log() stays finite.
            Y_hat = np.clip(
                self.sigmoid(np.dot(X, self.w) + self.b), 1e-15, 1 - 1e-15
            )
            cross_entropy = -np.mean(
                Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)
            )
            self.loss_list.append(cross_entropy)

            residual = Y_hat - Y
            dw = np.dot(X.T, residual) / len_val
            db = np.sum(residual) / len_val

            self.w -= self.lr * dw
            self.b -= self.lr * db

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 1 where the probability is >= ``threshold``."""
        return (self.predict_prob(X) >= threshold).astype(int).tolist()

    def predict_prob(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        X = np.array(X)
        Z = np.dot(X, self.w) + self.b
        return self.sigmoid(Z)

    def get_params(self):
        """Return the learned ``(weights, bias)`` pair."""
        return self.w, self.b

In [None]:
# Separate the predictors from the target and show the class balance.
target_col = 'Purchased'
y = df[target_col]
X = df.drop(columns=[target_col])
class_counts = pd.Series(y).value_counts()
print(class_counts)

In [None]:
# One-hot encode the categorical columns (first level of each dropped) and
# append them to the right of the numerical columns as one dense matrix.
# The encoder is fitted on all rows, before the train/validation split.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
X_cat = encoder.fit_transform(X[categorical])
X_num = X[numerical]
X = np.concatenate([X_num.to_numpy(), X_cat], axis=1)


In [None]:

from imblearn.over_sampling import SMOTE  # oversampling to balance the classes

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Oversample the minority class in the training split only. The seed makes
# the synthetic samples reproducible, like the seeded split above.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_train_scaled, y_train_scaled = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Train the hand-written LogisticRegression for 3000 epochs (default learning
# rate) on the scaled, SMOTE-resampled training data.
logR_model = LogisticRegression(epochs=3000)
logR_model.fit(X_train_scaled,y_train_scaled)

In [None]:
# Hard 0/1 predictions on the validation split (default 0.5 threshold),
# followed by accuracy and the per-class precision/recall/F1 report.
y_pred = logR_model.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {val_accuracy}')
val_report = classification_report(y_val, y_pred, zero_division=0)
print(val_report)


In [None]:
# Confusion matrix of the validation predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_val, y_pred)
class_names = ['No', 'Yes']
heat_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='viridis',
    xticklabels=class_names,
    yticklabels=class_names,
)
heat_ax.set_title('Confusion Matrix')
heat_ax.set_xlabel('Predicted')
heat_ax.set_ylabel('Actual')
plt.show()

In [None]:
# Precision, recall and F1 as functions of the decision threshold, with the
# F1-maximising threshold marked by a vertical line.
y_prob = logR_model.predict_prob(X_val_scaled)
precisions, recalls, thresholds = precision_recall_curve(y_val, y_prob)
# The tiny constant keeps the division defined when precision + recall is 0.
f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-20)

best_idx = f1s.argmax()
best_threshold = thresholds[best_idx]

plt.figure(figsize=(10, 8))
# precision/recall (and so F1) carry one more entry than `thresholds`;
# the trailing entry is dropped so the arrays line up.
curves = [
    (precisions, {'c': 'red', 'label': 'Precision'}),
    (recalls, {'label': 'Recall'}),
    (f1s, {'label': 'F1 Score', 'c': 'orange'}),
]
for curve_values, line_style in curves:
    plt.plot(thresholds, curve_values[:-1], **line_style)
plt.axvline(
    x=best_threshold,
    c='green',
    linestyle='--',
    label=f'Best Threshold: {best_threshold:.2f}',
)
plt.legend()
plt.grid(True)
plt.title('Precision - Recall Tradeoff over Threshold ')
plt.show()


In [None]:
# ROC curve of the validation probabilities with the AUC shown in the legend.
fpr, tpr, thresholds = roc_curve(y_val, y_prob)

auc_score = roc_auc_score(y_val, y_prob)


plt.figure(figsize=(10,8))
# `.2f` gives two decimals; the previous `2f` meant "minimum width 2" and
# printed six decimals.
plt.plot(fpr,tpr,color='blue',label=f'ROC Curve: AUC= {auc_score:.2f}')
# Diagonal = performance of a random classifier.
plt.plot([0,1],[0,1],linestyle='--',label='Random classifier line',c='k')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend()
plt.grid(True)
plt.show()