### LIBRARY AND DATA IMPORTS

In [1]:
import pandas as pd
import numpy as np                            
import matplotlib.pyplot as plt          
import seaborn as sb  

In [2]:
raw_df = pd.read_csv("dataset/cleaned_data.csv")
raw_df.sample(3)

Unnamed: 0,prodID,main_category,ratings,no_of_ratings,discount_price,actual_price,name,link,image,isDarkPattern,diff_percent
65569,82047,car & motorbike,3.5,3,1480.0,2925.0,Able Zed Black Half Car Magnetic Sun Shade Cur...,https://www.amazon.in/Able-Black-Magnetic-Curt...,https://m.media-amazon.com/images/I/81XrcKVhGS...,0,49.401709
146246,220898,accessories,3.3,11,399.0,798.0,Kuber Industries Silk Clutch Ladies Handbag (G...,https://www.amazon.in/Kuber-Industries-Clutch-...,https://m.media-amazon.com/images/I/81x9wX0VI2...,0,50.0
171233,257362,"tv, audio & cameras",4.0,195,419.0,799.0,LipiWorld® 129 AC Remote Compatible for Whirlp...,https://www.amazon.in/LipiWorld%C2%AE-129-AC-C...,https://m.media-amazon.com/images/I/71YwfP4IQB...,0,47.559449


### DATA CLEANING

In [3]:
raw_df.sample(3)

Unnamed: 0,prodID,main_category,ratings,no_of_ratings,discount_price,actual_price,name,link,image,isDarkPattern,diff_percent
278515,441667,"tv, audio & cameras",3.9,21,2449.0,4999.0,Pebble Basswoods Heavy Bass 16W Bluetooth Speaker,https://www.amazon.in/Pebble-Basswoods-Heavy-B...,https://m.media-amazon.com/images/I/71NXU-Ghgb...,0,51.010202
114356,148371,women's clothing,3.4,164,269.0,1281.0,SIRIL Women's Silk Saree With blouse piece,https://www.amazon.in/SIRIL-Blouse-Piece-1397S...,https://m.media-amazon.com/images/I/71hBCTjVT0...,0,79.000781
154452,234144,"tv, audio & cameras",3.5,264,999.0,2999.0,SYSKA Sonic Buds IEB800 True Wireless Earbuds ...,https://www.amazon.in/SYSKA-Wireless-Technolog...,https://m.media-amazon.com/images/I/61gZW0gOO-...,1,66.688896


In [4]:
df2 = raw_df

In [5]:
# handle missing values and NaN values or datatype errors
# and changing str -> int ('123' -> 123)

# ratings column
df2['ratings'] = pd.to_numeric(df2['ratings'], errors='coerce')
df2['ratings'] = df2['ratings'].fillna(df2['ratings'].mean())
df2 = df2.astype({'ratings':'float'})

# no_of_ratings column
mask = pd.to_numeric(df2['no_of_ratings'], errors='coerce').notnull()
df2 = df2[mask]
df2 = df2.astype({'no_of_ratings':'int'})

# actual_price column
mask = pd.to_numeric(df2['actual_price'],errors='coerce').notnull()
df2 = df2[mask]
df2 = df2.astype({'actual_price':'float'})

# discount_price column
mask = pd.to_numeric(df2['discount_price'],errors='coerce').notnull()
df2 = df2[mask]
df2 = df2.astype({'discount_price':'float'})

In [6]:
df2 = df2.drop(columns=['main_category','name','link','image'])
df2.sample(3)

Unnamed: 0,prodID,ratings,no_of_ratings,discount_price,actual_price,isDarkPattern,diff_percent
259071,406168,3.5,220,579.0,999.0,0,42.042042
127198,164200,3.4,187,399.0,999.0,0,60.06006
88322,115737,3.6,33,549.0,1499.0,0,63.375584


In [7]:
# generating 'credibility_score' column

# normalizing 'ratings' between 0 and 1
normalized_ratings = (df2['ratings'] - df2['ratings'].min()) / (df2['ratings'].max() - df2['ratings'].min())
# normalizing 'num_of_ratings' between 0 and 1
normalized_num_ratings = (df2['no_of_ratings'] - df2['no_of_ratings'].min()) / (df2['no_of_ratings'].max() - df2['no_of_ratings'].min())
# custom weights for ratings and num_of_ratings
rating_weight = 0.6
num_rating_weight = 0.4
# Calculating 'credibility_score'
df2['credibility_score'] = (rating_weight * normalized_ratings + num_rating_weight * normalized_num_ratings) * 100
# adjusting the credibility_score to be within the range of 1 to 100
min_score = df2['credibility_score'].min()
max_score = df2['credibility_score'].max()
df2['credibility_score'] = 1 + (df2['credibility_score'] - min_score) * 99 / (max_score - min_score)

df2.sample(3)

Unnamed: 0,prodID,ratings,no_of_ratings,discount_price,actual_price,isDarkPattern,diff_percent,credibility_score
189387,282497,4.1,96,685.0,690.0,0,0.724638,52.442884
146215,220850,3.8,220,1149.0,2999.0,0,61.687229,47.474537
203525,320269,3.0,1,499.0,1499.0,0,66.711141,34.184358


In [8]:
df2['diff_amount'] = df2['actual_price'] - df2['discount_price']
df2.sample(3)

Unnamed: 0,prodID,ratings,no_of_ratings,discount_price,actual_price,isDarkPattern,diff_percent,credibility_score,diff_amount
69851,87878,3.9,14,2146.0,3399.0,0,36.863783,49.118294,1253.0
317496,529309,3.7,184,599.0,2699.0,0,77.806595,45.812617,2100.0
114859,149040,1.6,4,440.0,2199.0,0,79.990905,10.955532,1759.0


In [9]:
df2['isDarkPattern'].value_counts()

isDarkPattern
0    276560
1     58090
Name: count, dtype: int64

### MODEL TRAINING

In [11]:
# XGBoost model
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

features = ['actual_price', 'diff_percent', 'credibility_score']
target = 'isDarkPattern'  

X = df2[features]
y = df2[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.90
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     55311
           1       0.83      0.50      0.63     11619

    accuracy                           0.90     66930
   macro avg       0.87      0.74      0.78     66930
weighted avg       0.89      0.90      0.89     66930



In [24]:
"""
Interpretation:
    The model achieved a high accuracy (90%), which suggests it performs well overall in making correct predictions.
    However, looking at precision and recall, especially for the '1' class (misleading prices), there seems to be a trade-off. 
    The precision is relatively high (83%) but the recall is lower (50%). This means that while the model is good at identifying 
    misleading prices when it predicts them, it misses quite a few actual misleading prices.
"""

"\nInterpretation:\n    The model achieved a high accuracy (90%), which suggests it performs well overall in making correct predictions.\n    However, looking at precision and recall, especially for the '1' class (misleading prices), there seems to be a trade-off. \n    The precision is relatively high (83%) but the recall is lower (50%). This means that while the model is good at identifying \n    misleading prices when it predicts them, it misses quite a few actual misleading prices.\n"

In [25]:
# Logistic Regression model
# Decision Tree classifier model
# Random Forest classifier model
# K Nearest Neighbors classifier model

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"------ {name} ------")
    print(f"Accuracy: {accuracy:.4f}")
    report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(report)
    print("\n")

------ Logistic Regression ------
Accuracy: 0.8667
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     55311
           1       0.67      0.47      0.55     11619

    accuracy                           0.87     66930
   macro avg       0.78      0.71      0.74     66930
weighted avg       0.85      0.87      0.86     66930



------ Decision Tree ------
Accuracy: 0.8978
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94     55311
           1       0.72      0.68      0.70     11619

    accuracy                           0.90     66930
   macro avg       0.82      0.81      0.82     66930
weighted avg       0.90      0.90      0.90     66930



------ Random Forest ------
Accuracy: 0.9136
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     55311
           1       0.79

In [26]:
"""
Random forest is the highest accuracy we could achieve = 0.9134 
"""

'\nRandom forest is the highest accuracy we could achieve = 0.9134 \n'

In [27]:
# Neural Network - Multi Layer Perceptron model

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# experimenting with increasing neurons in the hidden layer
neurons_list = [10, 50, 100]  
for neurons in neurons_list:
    mlp_classifier = MLPClassifier(hidden_layer_sizes=(neurons,), activation='relu', solver='adam', random_state=42)
    mlp_classifier.fit(X_train, y_train)
    y_pred = mlp_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy ({neurons} neurons): {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")


Accuracy (10 neurons): 0.8551
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91     55311
           1       0.59      0.53      0.56     11619

    accuracy                           0.86     66930
   macro avg       0.75      0.73      0.74     66930
weighted avg       0.85      0.86      0.85     66930



Accuracy (50 neurons): 0.8564
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.92      0.91     55311
           1       0.59      0.55      0.57     11619

    accuracy                           0.86     66930
   macro avg       0.75      0.74      0.74     66930
weighted avg       0.85      0.86      0.85     66930



Accuracy (100 neurons): 0.8778
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93     55311
           1       0.73      0.47      0.57     11619

    accuracy      

In [28]:
# Support Vector Machine model

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

svm_classifier = SVC(kernel='rbf', random_state=42)  
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

SVM Model Accuracy: 0.8264
Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90     55311
           1       0.00      0.00      0.00     11619

    accuracy                           0.83     66930
   macro avg       0.41      0.50      0.45     66930
weighted avg       0.68      0.83      0.75     66930



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Train Test data split
features = ['discount_price', 'diff_percent']
target = 'isDarkPattern'  

X = df2[features]
y = df2[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Tuned Random Forest 
# Tuned Gradient Boosting 
# Tuned Logistic Regression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, classification_report

print("X_train columns:", X_train.columns)
print("X_test columns:", X_test.columns)

# feature engineering
X_train['interaction_feature'] = X_train['discount_price'] * X_train['diff_percent']
X_test['interaction_feature'] = X_test['discount_price'] * X_test['diff_percent']

# feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# hyperparameter tuning for random forest
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_grid_search.fit(X_train_scaled, y_train)
best_rf_classifier = rf_grid_search.best_estimator_

# hyperparameter tuning for gradient boosting
gb_params = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3, n_jobs=-1)
gb_grid_search.fit(X_train_scaled, y_train)
best_gb_classifier = gb_grid_search.best_estimator_

# hyperparameter tuning for logistic regression
lr_params = {'C': [0.01, 0.1]}

lr_grid_search = GridSearchCV(LogisticRegression(random_state=42), lr_params, cv=3, n_jobs=-1)
lr_grid_search.fit(X_train_scaled, y_train)
best_lr_classifier = lr_grid_search.best_estimator_


tuned_models = {
    'Tuned Random Forest': best_rf_classifier,
    'Tuned Gradient Boosting': best_gb_classifier,
    'Tuned Logistic Regression': best_lr_classifier
}

for model_name, model in tuned_models.items():
    predictions = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    
    print(f'{model_name} Metrics:')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(classification_report(y_test, predictions))
    print('-' * 40)

X_train columns: Index(['discount_price', 'diff_percent'], dtype='object')
X_test columns: Index(['discount_price', 'diff_percent'], dtype='object')
Tuned Random Forest Metrics:
Accuracy: 0.8977
Precision: 0.7834
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     55311
           1       0.78      0.57      0.66     11619

    accuracy                           0.90     66930
   macro avg       0.85      0.77      0.80     66930
weighted avg       0.89      0.90      0.89     66930

----------------------------------------
Tuned Gradient Boosting Metrics:
Accuracy: 0.8912
Precision: 0.8688
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     55311
           1       0.87      0.44      0.58     11619

    accuracy                           0.89     66930
   macro avg       0.88      0.71      0.76     66930
weighted avg       0.89      0.89      0.88     66930

------------------------

In [31]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report

# feature engineering
X_train['interaction_feature'] = X_train['discount_price'] * X_train['diff_percent']
X_test['interaction_feature'] = X_test['discount_price'] * X_test['diff_percent']

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
predictions = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)

print(f'Random Forest Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(classification_report(y_test, predictions))


Random Forest Metrics:
Accuracy: 0.8978
Precision: 0.7813
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     55311
           1       0.78      0.57      0.66     11619

    accuracy                           0.90     66930
   macro avg       0.85      0.77      0.80     66930
weighted avg       0.89      0.90      0.89     66930



In [32]:
# Tuned Gradient Boosting Model

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, classification_report

# feature engineering
X_train['interaction_feature'] = X_train['discount_price'] * X_train['diff_percent']
X_test['interaction_feature'] = X_test['discount_price'] * X_test['diff_percent']
# feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# hyperparameter tuning
gb_params = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_params, cv=3, n_jobs=-1)
gb_grid_search.fit(X_train_scaled, y_train)
best_gb_classifier = gb_grid_search.best_estimator_

predictions = best_gb_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)

print(f'Tuned Gradient Boosting Metrics:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(classification_report(y_test, predictions))


Tuned Gradient Boosting Metrics:
Accuracy: 0.8959
Precision: 0.8418
              precision    recall  f1-score   support

           0       0.90      0.98      0.94     55311
           1       0.84      0.49      0.62     11619

    accuracy                           0.90     66930
   macro avg       0.87      0.74      0.78     66930
weighted avg       0.89      0.90      0.88     66930



In [33]:
"""
So far the best results have come from the 'TUNED GRADIENT BOOSTING CLASSIFIER'
Accuracy: 0.8959
Precision: 0.8418

"""

"\nSo far the best results have come from the 'TUNED GRADIENT BOOSTING CLASSIFIER'\nAccuracy: 0.8912\nPrecision: 0.8688\n\n"

In [19]:
# GENERIC RANDOM FOREST Model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score

X = df2[['actual_price', 'discount_price', 'diff_percent']]
y = df2['isDarkPattern']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Random Forest Model:")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_rep)

Random Forest Model:
Accuracy: 0.8979979082623637
Confusion Matrix:
 [[53465  1846]
 [ 4981  6638]]

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.97      0.94     55311
           1       0.78      0.57      0.66     11619

    accuracy                           0.90     66930
   macro avg       0.85      0.77      0.80     66930
weighted avg       0.89      0.90      0.89     66930

F2 Score: 0.6038937409024745


In [10]:
# OPTIMIZED RANDOM FOREST Model

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer

X = df2[['diff_percent','actual_price']]
y = df2['isDarkPattern']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# hyperparameter tuning
best_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True,
    'max_samples': 0.9,
    'max_features': None,  
    'max_leaf_nodes': 30
}

optimized_random_forest_model = RandomForestClassifier(random_state=42, **best_params)
optimized_random_forest_model.fit(X_train, y_train)

y_pred = optimized_random_forest_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Optimized Random Forest Model:")
print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_rep)



Optimized Random Forest Model:
Best Hyperparameters: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True, 'max_samples': 0.9, 'max_features': None, 'max_leaf_nodes': 30}
Accuracy: 0.8903331839235021
Confusion Matrix:
 [[54604   707]
 [ 6633  4986]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94     55311
           1       0.88      0.43      0.58     11619

    accuracy                           0.89     66930
   macro avg       0.88      0.71      0.76     66930
weighted avg       0.89      0.89      0.87     66930



The model chosen finally was **Optimized Random Forest**. 
Because it gives a good balance between accuracy and precision.

### SERIALIZATION

In [35]:
import pickle

pickle.dump(optimized_random_forest_model,open("ORFmodel.pkl","wb"))