In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Concatenate
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

data = pd.read_csv('/content/final_balanced_fake_review_dataset.csv')
data

Unnamed: 0,review_id,user_id,user_name,product_id,product_name,review_text,rating,timestamp,ip_address,verified_purchase,label,burst_reviews,copy_paste_review,likely_bot,fake_account
0,f71d59ff-843e-4fe1-9c63-63356704ccae,3036,Anna Atkins,14fbf688-b685-4eb0-99e8-382d3a0db792,Impact Pro,I recently bought the Impact Pro and I'm quite...,5,2025-07-01 05:03:12,1.100.41.212,True,genuine,False,False,False,False
1,203a7491-3684-4053-96c6-5938a7716b93,5091,Tami Wilson,24c6b085-7389-4e92-9191-45c2986b724d,Call Pro,I recently bought the Call Pro and I'm quite i...,5,2025-07-14 05:20:46,1.100.41.212,True,genuine,False,False,False,False
2,cb4fd78f-3b8b-42ea-83e3-c8e587782804,1300,Adam King,92c38aa2-477a-47ce-8b86-085bccdafe48,Tax Pro,The Tax Pro has exceeded my expectations in ev...,4,2025-07-17 18:20:53,1.100.41.212,True,genuine,False,False,False,True
3,09309a25-1225-4bba-92ed-436835b4ca79,5741,Amy Ball,2a259243-b099-4f21-9211-b50d9721a042,Perform Pro,I’ve been using the Perform Pro for a few week...,5,2025-08-04 10:14:08,1.100.41.212,True,genuine,False,False,False,False
4,27a3815e-f5a4-425b-b068-6189b831ba3f,1455,Regina Bonilla,9096159b-17a7-48d4-8aaf-c2ff6013fedc,Gas Pro,I’ve been using the Gas Pro for a few weeks an...,4,2025-08-04 13:01:06,1.100.41.212,True,genuine,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,441f22c5-3c41-4420-9c2b-b0b4cbb354bc,2051,Ryan Russell,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,My experience with the Onto Pro has been fanta...,4,2025-07-01 08:14:25,99.99.72.202,True,genuine,False,False,True,False
14996,b748c024-f658-47b4-a58d-664856fd0875,7291,Kristen Duran,5c4180b3-e0ab-4028-aadd-d95beaa6c4a6,Show Pro,The Show Pro has exceeded my expectations in e...,5,2025-07-17 01:42:39,99.99.72.202,True,genuine,False,False,False,False
14997,9f1cf67b-add6-4ffe-98b0-c7a1deebe219,1879,Brooke Fowler,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,I recently bought the Onto Pro and I'm quite i...,4,2025-07-24 18:22:42,99.99.72.202,True,genuine,False,True,False,False
14998,13bb57ce-2b24-4112-bc2a-bd4d1ea28722,1022,Alex Lane,4424802f-da14-4d83-8662-516a0c679c59,Artist Pro,I recently bought the Artist Pro and I'm quite...,5,2025-07-26 21:43:45,99.99.72.202,True,genuine,False,False,False,False


In [73]:
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"])

#Time difference between consecutive reviews
data["time_diff"] = data["timestamp"].diff().dt.seconds.fillna(0)

#IP frequency count
data["ip_count"] = data.groupby("ip_address")["ip_address"].transform("count")

data

Unnamed: 0,review_id,user_id,user_name,product_id,product_name,review_text,rating,timestamp,ip_address,verified_purchase,label,burst_reviews,copy_paste_review,likely_bot,fake_account,time_diff,ip_count
0,f71d59ff-843e-4fe1-9c63-63356704ccae,3036,Anna Atkins,14fbf688-b685-4eb0-99e8-382d3a0db792,Impact Pro,I recently bought the Impact Pro and I'm quite...,5,2025-07-01 05:03:12,1.100.41.212,True,genuine,False,False,False,False,0.0,5
1,203a7491-3684-4053-96c6-5938a7716b93,5091,Tami Wilson,24c6b085-7389-4e92-9191-45c2986b724d,Call Pro,I recently bought the Call Pro and I'm quite i...,5,2025-07-14 05:20:46,1.100.41.212,True,genuine,False,False,False,False,1054.0,5
2,cb4fd78f-3b8b-42ea-83e3-c8e587782804,1300,Adam King,92c38aa2-477a-47ce-8b86-085bccdafe48,Tax Pro,The Tax Pro has exceeded my expectations in ev...,4,2025-07-17 18:20:53,1.100.41.212,True,genuine,False,False,False,True,46807.0,5
3,09309a25-1225-4bba-92ed-436835b4ca79,5741,Amy Ball,2a259243-b099-4f21-9211-b50d9721a042,Perform Pro,I’ve been using the Perform Pro for a few week...,5,2025-08-04 10:14:08,1.100.41.212,True,genuine,False,False,False,False,57195.0,5
4,27a3815e-f5a4-425b-b068-6189b831ba3f,1455,Regina Bonilla,9096159b-17a7-48d4-8aaf-c2ff6013fedc,Gas Pro,I’ve been using the Gas Pro for a few weeks an...,4,2025-08-04 13:01:06,1.100.41.212,True,genuine,True,False,False,False,10018.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,441f22c5-3c41-4420-9c2b-b0b4cbb354bc,2051,Ryan Russell,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,My experience with the Onto Pro has been fanta...,4,2025-07-01 08:14:25,99.99.72.202,True,genuine,False,False,True,False,4617.0,5
14996,b748c024-f658-47b4-a58d-664856fd0875,7291,Kristen Duran,5c4180b3-e0ab-4028-aadd-d95beaa6c4a6,Show Pro,The Show Pro has exceeded my expectations in e...,5,2025-07-17 01:42:39,99.99.72.202,True,genuine,False,False,False,False,62894.0,5
14997,9f1cf67b-add6-4ffe-98b0-c7a1deebe219,1879,Brooke Fowler,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,I recently bought the Onto Pro and I'm quite i...,4,2025-07-24 18:22:42,99.99.72.202,True,genuine,False,True,False,False,60003.0,5
14998,13bb57ce-2b24-4112-bc2a-bd4d1ea28722,1022,Alex Lane,4424802f-da14-4d83-8662-516a0c679c59,Artist Pro,I recently bought the Artist Pro and I'm quite...,5,2025-07-26 21:43:45,99.99.72.202,True,genuine,False,False,False,False,12063.0,5


In [74]:
scaler = MinMaxScaler()
data[["time_diff", "ip_count"]] = scaler.fit_transform(data[["time_diff", "ip_count"]])

data

Unnamed: 0,review_id,user_id,user_name,product_id,product_name,review_text,rating,timestamp,ip_address,verified_purchase,label,burst_reviews,copy_paste_review,likely_bot,fake_account,time_diff,ip_count
0,f71d59ff-843e-4fe1-9c63-63356704ccae,3036,Anna Atkins,14fbf688-b685-4eb0-99e8-382d3a0db792,Impact Pro,I recently bought the Impact Pro and I'm quite...,5,2025-07-01 05:03:12,1.100.41.212,True,genuine,False,False,False,False,0.000000,0.075472
1,203a7491-3684-4053-96c6-5938a7716b93,5091,Tami Wilson,24c6b085-7389-4e92-9191-45c2986b724d,Call Pro,I recently bought the Call Pro and I'm quite i...,5,2025-07-14 05:20:46,1.100.41.212,True,genuine,False,False,False,False,0.012199,0.075472
2,cb4fd78f-3b8b-42ea-83e3-c8e587782804,1300,Adam King,92c38aa2-477a-47ce-8b86-085bccdafe48,Tax Pro,The Tax Pro has exceeded my expectations in ev...,4,2025-07-17 18:20:53,1.100.41.212,True,genuine,False,False,False,True,0.541754,0.075472
3,09309a25-1225-4bba-92ed-436835b4ca79,5741,Amy Ball,2a259243-b099-4f21-9211-b50d9721a042,Perform Pro,I’ve been using the Perform Pro for a few week...,5,2025-08-04 10:14:08,1.100.41.212,True,genuine,False,False,False,False,0.661987,0.075472
4,27a3815e-f5a4-425b-b068-6189b831ba3f,1455,Regina Bonilla,9096159b-17a7-48d4-8aaf-c2ff6013fedc,Gas Pro,I’ve been using the Gas Pro for a few weeks an...,4,2025-08-04 13:01:06,1.100.41.212,True,genuine,True,False,False,False,0.115950,0.075472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,441f22c5-3c41-4420-9c2b-b0b4cbb354bc,2051,Ryan Russell,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,My experience with the Onto Pro has been fanta...,4,2025-07-01 08:14:25,99.99.72.202,True,genuine,False,False,True,False,0.053438,0.075472
14996,b748c024-f658-47b4-a58d-664856fd0875,7291,Kristen Duran,5c4180b3-e0ab-4028-aadd-d95beaa6c4a6,Show Pro,The Show Pro has exceeded my expectations in e...,5,2025-07-17 01:42:39,99.99.72.202,True,genuine,False,False,False,False,0.727948,0.075472
14997,9f1cf67b-add6-4ffe-98b0-c7a1deebe219,1879,Brooke Fowler,6d109879-8a95-438b-bde6-23de7f81dc24,Onto Pro,I recently bought the Onto Pro and I'm quite i...,4,2025-07-24 18:22:42,99.99.72.202,True,genuine,False,True,False,False,0.694487,0.075472
14998,13bb57ce-2b24-4112-bc2a-bd4d1ea28722,1022,Alex Lane,4424802f-da14-4d83-8662-516a0c679c59,Artist Pro,I recently bought the Artist Pro and I'm quite...,5,2025-07-26 21:43:45,99.99.72.202,True,genuine,False,False,False,False,0.139620,0.075472


In [75]:
length = 0
for val in data["review_text"]:
    length = max(length, len(val.split()))
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data["review_text"])
X_text = pad_sequences(tokenizer.texts_to_sequences(data["review_text"]), maxlen=length)

X_text

array([[  0,   0,   0, ..., 276, 179, 784],
       [  0,   0,   0, ...,  11, 493, 308],
       [  0,   0,   0, ...,  43, 633, 883],
       ...,
       [  0,   0,   0, ..., 425, 133, 454],
       [  0,   0,   0, ..., 271, 819, 484],
       [  0,   0,   0, ..., 463, 763, 159]], dtype=int32)

In [76]:
X_extra = np.array(data[["time_diff", "ip_count"]])
X_extra

array([[0.        , 0.0754717 ],
       [0.01219922, 0.0754717 ],
       [0.54175396, 0.0754717 ],
       ...,
       [0.6944872 , 0.0754717 ],
       [0.13961967, 0.0754717 ],
       [0.46097756, 0.0754717 ]])

In [77]:
data['label'] = data['label'].astype(str)
for val in range(len(data['label'])):
    if data["label"][val] == 'fake':
        data.loc[val, "label"] = 1
    else:
        data.loc[val, "label"] = 0
y = np.array(data["label"], dtype= 'int64')
y

array([0, 0, 0, ..., 0, 0, 0])

In [78]:
X_train_text, X_test_text, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
    X_text, X_extra, y, test_size=0.2, random_state=42)



In [79]:
text_input = tf.keras.layers.Input(shape=(length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50)(text_input)
lstm_layer = LSTM(64)(embedding_layer)

extra_input = tf.keras.layers.Input(shape=(2,))
merged = Concatenate()([lstm_layer, extra_input])

dense1 = Dense(64, activation="relu")(merged)
dropout1 = Dropout(0.3)(dense1)
output_layer = Dense(1, activation="sigmoid")(dropout1)

model = tf.keras.Model(inputs=[text_input, extra_input], outputs=output_layer)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [80]:
model.fit([X_train_text, X_train_extra], y_train, epochs=1, batch_size=4, validation_split=0.2)



[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 21ms/step - accuracy: 0.9803 - loss: 0.0495 - val_accuracy: 1.0000 - val_loss: 3.7390e-07


<keras.src.callbacks.history.History at 0x7c24a7657310>

In [81]:
loss, accuracy = model.evaluate([X_test_text, X_test_extra], y_test)
print("Test Accuracy:", accuracy)


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 1.0000 - loss: 3.8069e-07
Test Accuracy: 1.0


In [82]:
predictions = model.predict([X_test_text, X_test_extra])

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step


In [83]:
binary_predictions = (predictions > 0.5).astype(int)

In [84]:
print('predicted values', binary_predictions.flatten())
print('actual values', y_test)

predicted values [0 0 0 ... 0 0 0]
actual values [0 0 0 ... 0 0 0]


In [85]:
model.save('fake_review.h5')



In [86]:
data['burst_reviews'] = data['burst_reviews'].astype(str)
for val in range(len(data['burst_reviews'])):
    if data["burst_reviews"][val] == 'True':
        data.loc[val, "burst_reviews"] = 1
    else:
        data.loc[val, "burst_reviews"] = 0
y_br = np.array(data["burst_reviews"], dtype= 'int64')
y_br

array([0, 0, 0, ..., 0, 0, 0])

In [87]:
text_input = tf.keras.layers.Input(shape=(length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50)(text_input)
lstm_layer = LSTM(64)(embedding_layer)

extra_input = tf.keras.layers.Input(shape=(2,))
merged = Concatenate()([lstm_layer, extra_input])

dense1 = Dense(64, activation="relu")(merged)
dropout1 = Dropout(0.3)(dense1)
output_layer = Dense(1, activation="sigmoid")(dropout1)

model_br = tf.keras.Model(inputs=[text_input, extra_input], outputs=output_layer)
model_br.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [88]:
X_train_text, X_test_text, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
    X_text, X_extra, y_br, test_size=0.2, random_state=42)



In [89]:
model_br.fit([X_train_text, X_train_extra], y_train, epochs=1, batch_size=4, validation_split=0.2)



[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 20ms/step - accuracy: 0.8863 - loss: 0.3615 - val_accuracy: 0.9004 - val_loss: 0.3258


<keras.src.callbacks.history.History at 0x7c24a75d5610>

In [90]:
model_br.save('burst_review.h5')



In [91]:
data['copy_paste_review'] = data['copy_paste_review'].astype(str)
for val in range(len(data['copy_paste_review'])):
    if data["copy_paste_review"][val] == 'True':
        data.loc[val, "copy_paste_review"] = 1
    else:
        data.loc[val, "copy_paste_review"] = 0
y_cpr = np.array(data["copy_paste_review"], dtype= 'int64')
y_cpr

array([0, 0, 0, ..., 1, 0, 0])

In [92]:
text_input = tf.keras.layers.Input(shape=(length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50)(text_input)
lstm_layer = LSTM(64)(embedding_layer)

extra_input = tf.keras.layers.Input(shape=(2,))
merged = Concatenate()([lstm_layer, extra_input])

dense1 = Dense(64, activation="relu")(merged)
dropout1 = Dropout(0.3)(dense1)
output_layer = Dense(1, activation="sigmoid")(dropout1)

model_cpr = tf.keras.Model(inputs=[text_input, extra_input], outputs=output_layer)
model_cpr.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [93]:
X_train_text, X_test_text, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
    X_text, X_extra, y_cpr, test_size=0.2, random_state=42)



In [94]:
model_cpr.fit([X_train_text, X_train_extra], y_train, epochs=1, batch_size=4, validation_split=0.2)



[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 20ms/step - accuracy: 0.8985 - loss: 0.3427 - val_accuracy: 0.8929 - val_loss: 0.3413


<keras.src.callbacks.history.History at 0x7c24a9099950>

In [95]:
model_cpr.save('copy-paste_review.h5')



In [96]:
data['likely_bot'] = data['likely_bot'].astype(str)
for val in range(len(data['likely_bot'])):
    if data["likely_bot"][val] == 'True':
        data.loc[val, "likely_bot"] = 1
    else:
        data.loc[val, "likely_bot"] = 0
y_lb = np.array(data["likely_bot"], dtype= 'int64')
y_lb

array([0, 0, 0, ..., 0, 0, 0])

In [97]:
text_input = tf.keras.layers.Input(shape=(length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50)(text_input)
lstm_layer = LSTM(64)(embedding_layer)

extra_input = tf.keras.layers.Input(shape=(2,))
merged = Concatenate()([lstm_layer, extra_input])

dense1 = Dense(64, activation="relu")(merged)
dropout1 = Dropout(0.3)(dense1)
output_layer = Dense(1, activation="sigmoid")(dropout1)

model_lb = tf.keras.Model(inputs=[text_input, extra_input], outputs=output_layer)
model_lb.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])



In [98]:
X_train_text, X_test_text, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
    X_text, X_extra, y_lb, test_size=0.2, random_state=42)



In [None]:
model_lb.fit([X_train_text, X_train_extra], y_train, epochs=1, batch_size=4, validation_split=0.2)



[1m1116/2400[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m31s[0m 24ms/step - accuracy: 0.8777 - loss: 0.3726

In [None]:
model_lb.save('likely_bot.h5')

In [100]:
from datetime import datetime

class Seller:
    def _init_(self, name, join_date, revenue=0, violations=0, avg_rating=0,
                 review_count=0, verified_purchases=0, fake_reviews_detected=False):
        self.name = name
        self.join_date = datetime.strptime(join_date, "%Y-%m-%d")
        self.revenue = revenue
        self.violations = violations
        self.avg_rating = avg_rating
        self.review_count = review_count
        self.verified_purchases = verified_purchases
        self.fake_reviews_detected = fake_reviews_detected
        self.is_new = True if revenue < 100000 else False
        self.risk_score = 50.0

    def adjust_risk(self):
        # --- Core Logic ---
        self.risk_score += self.violations * 2.0

        if self.revenue >= 100000 and self.avg_rating >= 4.2:
            self.risk_score -= 5
            self.is_new = False

        # --- Additional Rules ---
        if self.review_count < 10:
            self.risk_score += 3

        if self.revenue > 150000 and self.verified_purchases < 5:
            self.risk_score += 5

        months_active = (datetime.now() - self.join_date).days // 30
        if months_active > 12 and self.violations == 0:
            self.risk_score -= 5

        if self.avg_rating < 3.0:
            self.risk_score += 5

        # --- Fake Reviews Detected ---
        if self.fake_reviews_detected:
            self.risk_score += 15  # Strong penalty

        self.risk_score = min(max(self.risk_score, 0), 100)

    def get_status(self):
        return {
            "name": self.name,
            "is_new": self.is_new,
            "risk_score": round(self.risk_score, 2),
            "risk_level": self.get_risk_level(),
            "revenue": f"${self.revenue/1000:.0f}K",
            "violations": self.violations,
            "rating": round(self.avg_rating, 2),
            "review_count": self.review_count,
            "verified_purchases": self.verified_purchases,
            "fake_reviews_detected": self.fake_reviews_detected
        }

    def get_risk_level(self):
        if self.risk_score >= 70:
            return "High"
        elif self.risk_score >= 40:
            return "Medium"
        else:
            return "Low"