In [1]:
import pandas as pd

# Load and preview the vendor ratings dataset

In [2]:
# Read the vendor ratings CSV into a DataFrame and preview its first rows.
ratings_csv = "Vendor_Ratings_Dataset.csv"
df = pd.read_csv(ratings_csv)
df.head()

Unnamed: 0,user_id,vendor_id,rating,timestamp,review
0,user_1,vendor_6,4.3,2025-04-24 19:46:31,Review text vunlrgwqgabls dcolulppf
1,user_1,vendor_8,3.9,2025-05-18 19:46:31,Review text r kqbkpklmvfdraz ujwtocb tibrxefi
2,user_1,vendor_1,4.5,2025-03-27 19:46:31,Review text grxib lfsxdmah ovqalj kjdouzfcllst...
3,user_1,vendor_8,5.0,2025-04-15 19:46:31,Review text ccfwannvycnz mhqtcnsbumeocvcnchcns...
4,user_1,vendor_4,3.8,2025-05-15 19:46:31,Review text dexzzlxktibf yiprwhumy uwq


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the ratings file and parse the timestamp column into datetimes in one chain
df = pd.read_csv("Vendor_Ratings_Dataset.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# The rating is the prediction target; every other column
# (user_id, vendor_id, review, timestamp) is kept as a feature
target_column = 'rating'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# No stratification is requested (the default), matching a plain random split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print("✅ X_train shape:", X_train.shape)
print("✅ X_test shape:", X_test.shape)
print("✅ y_train shape:", y_train.shape)
print("✅ y_test shape:", y_test.shape)

✅ X_train shape: (1624, 4)
✅ X_test shape: (406, 4)
✅ y_train shape: (1624,)
✅ y_test shape: (406,)


In [4]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# ----------- Step 1: Sample Data (Replace this with your real data) -----------
# Read the clock once. Calling datetime.now() separately for 'timestamp' and
# 'user_created' makes the two values differ by a few microseconds, which is
# enough for the whole-day account age computed below to come out one day short.
now = datetime.now()

historical_reviews = pd.DataFrame([
    {'user_id': 'user_1', 'vendor_id': 'vendor_123', 'rating': 4.8, 'timestamp': now - timedelta(days=3), 'user_created': now - timedelta(days=90)},
    {'user_id': 'user_2', 'vendor_id': 'vendor_123', 'rating': 4.7, 'timestamp': now - timedelta(days=2), 'user_created': now - timedelta(days=115)},
    {'user_id': 'user_3', 'vendor_id': 'vendor_123', 'rating': 4.9, 'timestamp': now - timedelta(days=1), 'user_created': now - timedelta(days=100)},
])

new_reviews = pd.DataFrame([
    {'user_id': 'user_4', 'vendor_id': 'vendor_123', 'rating': 1.0, 'timestamp': now, 'user_created': now - timedelta(days=1)},
    {'user_id': 'user_5', 'vendor_id': 'vendor_123', 'rating': 1.0, 'timestamp': now, 'user_created': now - timedelta(days=2)},
    {'user_id': 'user_6', 'vendor_id': 'vendor_123', 'rating': 1.0, 'timestamp': now, 'user_created': now - timedelta(days=1)},
])

# ----------- Step 2: Combine and Prepare -----------
df = pd.concat([historical_reviews, new_reviews], ignore_index=True)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['user_created'] = pd.to_datetime(df['user_created'])
# Whole days between account creation and the review
df['account_age'] = (df['timestamp'] - df['user_created']).dt.days
# Hour of day (0-23) at which the review was posted
df['hour'] = df['timestamp'].dt.hour

# Encode vendor_id as an integer code so it can be used as a numeric feature
le = LabelEncoder()
df['vendor_encoded'] = le.fit_transform(df['vendor_id'])

# ----------- Step 3: Fit Isolation Forest -----------
features = df[['rating', 'account_age', 'hour', 'vendor_encoded']]
iso_model = IsolationForest(n_estimators=100, contamination=0.15, random_state=42)
# NOTE: despite the column name, fit_predict returns labels (-1 = anomaly, 1 = normal),
# not a continuous score.
df['anomaly_score'] = iso_model.fit_predict(features)

# ----------- Step 4: Label Anomalies -----------
df['is_anomaly'] = df['anomaly_score'] == -1

# ----------- Step 5: Optional Output (Comment out to hide) -----------
anomalies = df[df['is_anomaly']]
print("🚨 Anomalies Detected:\n", anomalies[['user_id', 'vendor_id', 'rating', 'account_age', 'hour']])


🚨 Anomalies Detected:
   user_id   vendor_id  rating  account_age  hour
4  user_5  vendor_123     1.0            2    13




In [5]:
!pip install scikit-learn



In [6]:
#Detect Point Anomalies
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Step 1: Generate synthetic vendor rating data
def generate_vendor_rating_data(seed=None):
    """Build a synthetic vendor-rating dataset containing one injected point anomaly.

    The frame holds 500 "normal" reviews (ratings drawn around 4.5, spread over
    5 vendors, 50 users and the last ~30 days) followed by 10 one-star reviews
    for ``vendor_3`` that all share the current hour as their timestamp.

    Args:
        seed: Optional integer. When given, all random draws come from private
            generators seeded with it, so the users, vendors, ratings and time
            offsets are repeatable and the global ``random`` / ``np.random``
            state is not touched. When ``None`` (default) the module-level
            generators are used, as before.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``vendor_id``, ``rating``,
        ``timestamp`` and ``review`` (510 rows).
    """
    # Both the `random` module and a `random.Random` instance expose
    # choice/randint; both `np.random` and a `RandomState` expose normal.
    py_rng = random if seed is None else random.Random(seed)
    np_rng = np.random if seed is None else np.random.RandomState(seed)

    vendors = [f'vendor_{i+1}' for i in range(5)]
    users = [f'user_{i+1}' for i in range(50)]
    # Read the clock once so every timestamp is relative to the same instant
    now = datetime.now()
    data = []

    # Generate normal ratings
    for _ in range(500):
        data.append({
            'user_id': py_rng.choice(users),
            'vendor_id': py_rng.choice(vendors),
            # Clip to the valid 1-5 star range before rounding to one decimal
            'rating': round(np.clip(np_rng.normal(4.5, 0.3), 1, 5), 1),
            'timestamp': now - timedelta(days=py_rng.randint(0, 30), hours=py_rng.randint(0, 23)),
            'review': "normal review"
        })

    # Inject point anomaly: sudden cluster of 1-star ratings for vendor_3,
    # all stamped with the start of the current hour
    anomaly_time = now.replace(minute=0, second=0, microsecond=0)
    for _ in range(10):
        data.append({
            'user_id': py_rng.choice(users),
            'vendor_id': 'vendor_3',
            'rating': 1.0,
            'timestamp': anomaly_time,
            'review': "suspicious 1-star review"
        })

    return pd.DataFrame(data)

# Step 2: Load or generate the dataset
# Generate the synthetic ratings, then make sure the timestamp column holds pandas datetimes
df_ratings = generate_vendor_rating_data()
df_ratings = df_ratings.assign(timestamp=pd.to_datetime(df_ratings['timestamp']))

# Step 3: Detect point anomalies for vendor_3
def detect_point_anomalies(df, vendor_id='vendor_3', threshold_rating=2.0, min_ratings=5):
    """Find hours in which one vendor received a burst of low ratings.

    Args:
        df: DataFrame with at least ``vendor_id``, ``rating`` and a
            datetime-typed ``timestamp`` column.
        vendor_id: Vendor whose reviews are inspected.
        threshold_rating: An hour is suspicious only if its mean rating is
            strictly below this value.
        min_ratings: An hour is suspicious only if it has at least this many ratings.

    Returns:
        DataFrame with one row per anomalous hour and the columns ``hour``,
        ``avg_rating``, ``rating_count`` and ``is_point_anomaly`` (empty when
        nothing is flagged). Row labels are those of the per-hour summary.
    """
    # Work on a copy so the caller's frame does not gain an 'hour' column
    vendor_df = df[df['vendor_id'] == vendor_id].copy()
    # 'h' is the hourly frequency alias; the upper-case 'H' spelling is
    # deprecated since pandas 2.2 and emits a FutureWarning there.
    vendor_df['hour'] = vendor_df['timestamp'].dt.floor('h')

    # Group by hour
    hourly_stats = vendor_df.groupby('hour').agg(
        avg_rating=('rating', 'mean'),
        rating_count=('rating', 'count')
    ).reset_index()

    # Detect anomalies: low average + many ratings
    hourly_stats['is_point_anomaly'] = (hourly_stats['avg_rating'] < threshold_rating) & \
                                       (hourly_stats['rating_count'] >= min_ratings)

    return hourly_stats[hourly_stats['is_point_anomaly']]

# Step 4: Run detection
anomalies_detected = detect_point_anomalies(df_ratings, vendor_id='vendor_3')

# Step 5: Display results
# Handle the "nothing found" case first, otherwise show the flagged hours
if anomalies_detected.empty:
    print("No point anomalies detected for vendor_3.")
else:
    print("Detected Point Anomalies for vendor_3:\n")
    print(anomalies_detected[['hour', 'avg_rating', 'rating_count']])


Detected Point Anomalies for vendor_3:

                  hour  avg_rating  rating_count
98 2025-07-19 13:00:00         1.0            10


In [7]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Step 1: Generate data with consistent ratings except on weekends
def generate_contextual_data():
    """Create 30 days of synthetic ratings for ``vendor_1`` with weak weekends.

    Ten ratings are generated for each of the last 30 days. Ratings are drawn
    around 4.5 on weekdays and around 2.0 on Saturdays and Sundays, then
    clipped to the 1.0-5.0 range and rounded to one decimal.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``vendor_id``, ``rating``
        and ``timestamp`` (300 rows).
    """
    reviewers = [f'user_{n}' for n in range(1, 101)]
    weekend_days = (5, 6)  # Saturday, Sunday
    records = []

    for days_back in range(30):
        for _ in range(10):  # ten ratings for each day
            when = datetime.now() - timedelta(days=days_back)

            # A weekday-level score is always drawn first; on weekends it is
            # replaced by a second draw from the low-rating distribution.
            score = np.random.normal(4.5, 0.3)
            if when.weekday() in weekend_days:
                score = np.random.normal(2.0, 0.5)

            reviewer = random.choice(reviewers)
            records.append({
                'user_id': reviewer,
                'vendor_id': 'vendor_1',
                'rating': round(np.clip(score, 1.0, 5.0), 1),
                'timestamp': when
            })

    return pd.DataFrame(records)

# Step 2: Prepare data
df_ratings = generate_contextual_data()
df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'])
df_ratings['day_of_week'] = df_ratings['timestamp'].dt.dayofweek  # 0=Monday, 6=Sunday
df_ratings['is_weekend'] = df_ratings['day_of_week'].isin([5, 6])

# Step 3: Analyze weekend vs weekday trends
vendor_df = df_ratings[df_ratings['vendor_id'] == 'vendor_1']

# Mean rating inside and outside the weekend
weekend_mask = vendor_df['is_weekend']
weekday_avg = vendor_df.loc[~weekend_mask, 'rating'].mean()
weekend_avg = vendor_df.loc[weekend_mask, 'rating'].mean()

# Step 4: Contextual anomaly detection logic
# Flag the vendor when weekends average at least `threshold_drop` stars below weekdays
threshold_drop = 1.0
is_contextual_anomaly = (weekday_avg - weekend_avg) >= threshold_drop

# Step 5: Output
print("📊 Contextual Anomaly Check (Weekend Rating Drop):")
print(f"Average Weekday Rating: {weekday_avg:.2f}")
print(f"Average Weekend Rating: {weekend_avg:.2f}")
if not is_contextual_anomaly:
    print("✅ No contextual anomaly detected.")
else:
    print("⚠️  Contextual Anomaly Detected: Weekend ratings are unusually low!")


📊 Contextual Anomaly Check (Weekend Rating Drop):
Average Weekday Rating: 4.51
Average Weekend Rating: 2.00
⚠️  Contextual Anomaly Detected: Weekend ratings are unusually low!


In [8]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from difflib import SequenceMatcher

# Step 1: Generate data
def generate_collective_data():
    """Create synthetic reviews containing a coordinated group of fake 5-star reviews.

    The frame holds 300 ordinary reviews spread over two vendors, followed by
    10 reviews from ``fake_user_1`` .. ``fake_user_10``: all 5 stars, all for
    ``vendor_1``, all with the same text, posted one day ago from accounts
    created 0-3 days before the review.

    Every review is dated on or after the creation of the account that wrote
    it, so the account age at review time is never negative.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``vendor_id``, ``rating``,
        ``timestamp``, ``user_created`` and ``review`` (310 rows).
    """
    vendors = ['vendor_1', 'vendor_2']
    users = [f'user_{i+1}' for i in range(100)]
    normal_comments = ["great service", "excellent", "will buy again", "very satisfied"]
    # Read the clock once so review and account dates are directly comparable
    now = datetime.now()
    data = []

    # Normal reviews
    for _ in range(300):
        account_days_ago = random.randint(10, 100)
        # Previously the review date was drawn independently (1-30 days ago) and
        # could fall before the account existed; cap it at the account's age.
        review_days_ago = random.randint(1, min(30, account_days_ago))
        data.append({
            'user_id': random.choice(users),
            'vendor_id': random.choice(vendors),
            'rating': round(np.clip(np.random.normal(4.0, 0.5), 1, 5), 1),
            'timestamp': now - timedelta(days=review_days_ago),
            'user_created': now - timedelta(days=account_days_ago),
            'review': random.choice(normal_comments)
        })

    # Inject fake reviews: new users, same vendor, 5-stars, similar comments
    fake_review_time = now - timedelta(days=1)
    for i in range(10):
        data.append({
            'user_id': f"fake_user_{i+1}",
            'vendor_id': 'vendor_1',
            'rating': 5.0,
            'timestamp': fake_review_time,
            # New account: created 0-3 days before the review was posted
            'user_created': fake_review_time - timedelta(days=random.randint(0, 3)),
            'review': "awesome vendor, very good quality"
        })

    return pd.DataFrame(data)

# Step 2: Prepare data
df = generate_collective_data()
for date_column in ('user_created', 'timestamp'):
    df[date_column] = pd.to_datetime(df[date_column])
# Whole days between account creation and the review
df['account_age_days'] = (df['timestamp'] - df['user_created']).dt.days

# Step 3: Filter for collective pattern
# - Account age < 7 days
# - Rating = 5
# - Same vendor
# - Similar review text
is_new_account = df['account_age_days'] < 7
is_five_star = df['rating'] == 5.0
new_user_reviews = df[is_new_account & is_five_star]

# For each vendor, count the pairs of reviews whose texts are nearly identical
suspicious_groups = []
for vendor, group in new_user_reviews.groupby('vendor_id'):
    reviews = group['review'].tolist()
    similarity_count = sum(
        1
        for position, first_text in enumerate(reviews)
        for second_text in reviews[position + 1:]
        if SequenceMatcher(None, first_text, second_text).ratio() > 0.85
    )
    if similarity_count < 5:  # fewer than 5 similar pairs = not suspicious
        continue
    suspicious_groups.append({
        'vendor_id': vendor,
        'review_count': len(group),
        'similar_reviews': similarity_count
    })

# Step 4: Output
if not suspicious_groups:
    print("✅ No collective anomalies detected.")
else:
    print("🚨 Collective Anomalies Detected:\n")
    for group in suspicious_groups:
        print(f"Vendor: {group['vendor_id']}, 5-star reviews from new users: {group['review_count']}, Similar Comments: {group['similar_reviews']}")


🚨 Collective Anomalies Detected:

Vendor: vendor_1, 5-star reviews from new users: 12, Similar Comments: 45


In [9]:
!pip install mysql-connector-python



In [18]:
import pandas as pd
import pyodbc
import mysql.connector
from datetime import datetime, timedelta
import random

# ------------------ DB Connection ------------------
import os
import mysql.connector

# Connection settings are read from environment variables so that no
# credentials are stored in the notebook.
db = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME")
)
# ------------------ Fetch Users and Vendors ------------------
# The connection is bound to `db`; the queries previously received an
# undefined name `conn` and failed with NameError. No separate cursor is
# opened here because nothing in this cell used one.
# NOTE(review): pandas documents SQLAlchemy connectables and sqlite3 as the
# supported `con` types and may warn for other DB-API connections - confirm
# this is acceptable or switch to an SQLAlchemy engine.
try:
    users = pd.read_sql("SELECT user_id, created_at FROM users", db)
    vendors = pd.read_sql("SELECT vendor_id FROM vendor", db)
finally:
    # Release the connection even when one of the queries fails
    db.close()




In [19]:
import pandas as pd
import mysql.connector
from datetime import datetime

# ---------- Connect to Database ----------
import os
import mysql.connector

# Connection settings are read from environment variables so that no
# credentials are stored in the notebook.
db = mysql.connector.connect(
    host=os.getenv("DB_HOST"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME")
)
cursor = db.cursor()

# ---------- Fetch Ratings & User Info ----------
# Each review row is joined with the creation date of the account that wrote it.
query = """
    SELECT 
        r.user_id,
        r.vendor_id,
        r.rating,
        r.timestamp,
        u.created_at AS account_creation_date
    FROM reviews r
    JOIN users u ON r.user_id = u.user_id
"""
try:
    cursor.execute(query)
    records = cursor.fetchall()
    # A default cursor returns plain tuples, so take the column names from the
    # cursor metadata (first item of each description entry). Without them the
    # DataFrame columns would be 0..4 and the validation step would always
    # report every expected column as missing.
    column_names = [column[0] for column in cursor.description]
finally:
    # The connection is bound to `db` (the old `conn.close()` raised NameError);
    # close both handles even if the query fails.
    cursor.close()
    db.close()

df = pd.DataFrame(records, columns=column_names)

# ---------- Validate Column Names ----------
# Stop early with a readable message if the query result lacks a needed column.
expected_cols = ['user_id', 'vendor_id', 'rating', 'timestamp', 'account_creation_date']
missing = [col for col in expected_cols if col not in df.columns]
if missing:
    print(f"❌ Missing columns in SQL result: {missing}")
else:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['account_creation_date'] = pd.to_datetime(df['account_creation_date'])

    # Compute account age in whole days at the time of each review
    df['account_age_days'] = (df['timestamp'] - df['account_creation_date']).dt.days
    # Review time truncated to the hour. 'h' is the hourly alias; the
    # upper-case 'H' spelling is deprecated since pandas 2.2.
    df['hour'] = df['timestamp'].dt.floor('h')

    # ========== POINT ANOMALY ==========
    def detect_point_anomaly(data, threshold_drop=1.5, min_ratings=4):
        """Flag hours whose mean rating falls well below the historical baseline.

        The baseline is the mean rating of every review posted before the start
        of the hour that contains the latest review.

        Args:
            data: DataFrame with ``user_id``, ``rating``, a datetime
                ``timestamp`` column and an ``hour`` column holding the
                timestamp truncated to the hour.
            threshold_drop: Minimum gap (baseline minus hourly mean) for an
                hour to be flagged.
            min_ratings: Minimum number of ratings an hour needs to be flagged.

        Returns:
            ``(user_ids, anomalies)`` where ``user_ids`` is the set of users who
            reviewed during a flagged hour and ``anomalies`` is the per-hour
            summary (``hour``, ``avg``, ``count``) of the flagged hours; or
            ``(set(), None)`` when there is no history or nothing is flagged.
        """
        # 'h' is the hourly alias; the upper-case 'H' spelling is deprecated since pandas 2.2
        hist = data[data['timestamp'] < data['timestamp'].max().floor('h')]
        if hist.empty:
            return set(), None
        baseline_avg = hist['rating'].mean()
        grouped = data.groupby('hour').agg(avg=('rating', 'mean'), count=('rating', 'count')).reset_index()
        anomalies = grouped[(baseline_avg - grouped['avg'] >= threshold_drop) & (grouped['count'] >= min_ratings)]
        if anomalies.empty:
            return set(), None
        hours = anomalies['hour'].tolist()
        flagged = data[data['hour'].isin(hours)]
        return set(flagged['user_id']), anomalies

    # ========== COLLECTIVE ANOMALY ==========
    def detect_collective_anomaly(data, age_thresh=7, min_same_rating=4):
        """Look for a cluster of young accounts that all gave the same rating.

        Args:
            data: DataFrame with ``user_id``, ``rating`` and ``account_age_days``.
            age_thresh: Accounts younger than this many days count as new.
            min_same_rating: Minimum number of new-account reviews sharing the
                most common rating for the cluster to be flagged.

        Returns:
            ``(user_ids, reason)``: the set of flagged users (empty when
            nothing is flagged) and a short human-readable explanation.
        """
        recent_accounts = data.loc[data['account_age_days'] < age_thresh]
        if recent_accounts.empty:
            return set(), "No new users"

        # Most frequent rating among new accounts (first one if there is a tie)
        top_rating = recent_accounts['rating'].mode()[0]
        cluster = recent_accounts.loc[recent_accounts['rating'] == top_rating]
        cluster_size = len(cluster)
        if cluster_size < min_same_rating:
            return set(), "No suspicious cluster"
        return set(cluster['user_id']), f"{cluster_size} new users gave same rating {top_rating}"

    # ========== CONTEXTUAL ANOMALY ==========
    def detect_contextual_anomaly(data, drop_thresh=1.0):
        """Compare the older half of the reviews with the newer half.

        Reviews are ordered by ``timestamp`` and split in the middle. If the
        mean rating of the newer half is at least ``drop_thresh`` below that
        of the older half, every user in the newer half is flagged.

        Args:
            data: DataFrame with ``user_id``, ``rating`` and ``timestamp``.
            drop_thresh: Minimum fall in mean rating that counts as an anomaly.

        Returns:
            ``(user_ids, reason)``: the set of flagged users (empty when
            nothing is flagged) and a short human-readable explanation.
        """
        chronological = data.sort_values('timestamp')
        split_at = len(chronological) // 2
        earlier, later = chronological.iloc[:split_at], chronological.iloc[split_at:]

        # Each half needs at least three reviews for the comparison to be made
        if min(len(earlier), len(later)) < 3:
            return set(), "Not enough data"

        earlier_mean = earlier['rating'].mean()
        later_mean = later['rating'].mean()
        if earlier_mean - later_mean >= drop_thresh:
            return set(later['user_id']), f"Drop from {earlier_mean:.2f} → {later_mean:.2f}"
        return set(), "No drop"

    # ========== Run All Detections ==========
    # Run the three detectors in turn and pool every flagged user id in `all_flagged`.
    print("\n🔍 Running anomaly detection...")

    all_flagged = set()

    # Point anomaly: an hour with unusually low ratings
    point_users, point_detail = detect_point_anomaly(df)
    if not point_users:
        print("✅ No Point Anomaly")
    else:
        print(f"\n📌 Point Anomaly Detected:\n{point_detail}")
        print(f"Flagged users: {point_users}")
        all_flagged |= point_users

    # Collective anomaly: new accounts agreeing on one rating
    collective_users, coll_reason = detect_collective_anomaly(df)
    if not collective_users:
        print(f"✅ No Collective Anomaly — {coll_reason}")
    else:
        print(f"\n🚨 Collective Anomaly Detected: {coll_reason}")
        print(f"Flagged users: {collective_users}")
        all_flagged |= collective_users

    # Contextual anomaly: recent reviews rated well below older ones
    contextual_users, context_reason = detect_contextual_anomaly(df)
    if not contextual_users:
        print(f"✅ No Contextual Anomaly — {context_reason}")
    else:
        print(f"\n📅 Contextual Anomaly Detected: {context_reason}")
        print(f"Flagged users: {contextual_users}")
        all_flagged |= contextual_users

    # Summary across all detectors
    if not all_flagged:
        print("\n🎉 No anomalies detected in user reviews.")
    else:
        print(f"\n⚠️ Final flagged users (total {len(all_flagged)}): {all_flagged}")



🔍 Running anomaly detection...
✅ No Point Anomaly

🚨 Collective Anomaly Detected: 4 new users gave same rating 5.0
Flagged users: {8, 9, 10, 7}

📅 Contextual Anomaly Detected: Drop from 4.67 → 2.46
Flagged users: {4, 5, 6, 9, 10}

⚠️ Final flagged users (total 7): {4, 5, 6, 7, 8, 9, 10}


In [20]:
import os
from datetime import datetime, timedelta

import mysql.connector

# Suspend every user collected in `all_flagged` (built by the detection cell)
# for 24 hours by writing `suspended_until` on their row in `users`.
if all_flagged:
    print("\n⛔ Suspending flagged users for 24 hours...")
    suspend_until = datetime.now() + timedelta(hours=24)

    # Connection settings come from environment variables
    db = mysql.connector.connect(
        host=os.getenv("DB_HOST"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        database=os.getenv("DB_NAME")
    )
    try:
        # Open the cursor on this cell's own connection instead of relying on
        # `cursor` / `conn` left over from another cell.
        cursor = db.cursor()
        try:
            for user_id in all_flagged:
                # Parameterised query: values are passed separately from the SQL text
                cursor.execute(
                    "UPDATE users SET suspended_until = %s WHERE user_id = %s",
                    (suspend_until, user_id)
                )
            # Commit once, after every update succeeded
            db.commit()
        finally:
            cursor.close()
    finally:
        db.close()

    print(f"✅ Suspended {len(all_flagged)} users until {suspend_until.strftime('%Y-%m-%d %H:%M:%S')}")



⛔ Suspending flagged users for 24 hours...
✅ Suspended 7 users until 2025-07-20 14:32:28


In [14]:
# updated_model_trainer.py

import pandas as pd
import mysql.connector
from datetime import datetime
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

# Step 1: Connect to MySQL and fetch data
def fetch_data():
    """Load every review together with its author's account creation date.

    Connection settings are read from the ``DB_HOST``, ``DB_USER``,
    ``DB_PASSWORD`` and ``DB_NAME`` environment variables.

    Returns:
        pandas.DataFrame with the columns ``user_id``, ``vendor_id``,
        ``rating``, ``timestamp`` and ``account_creation_date`` (named after
        the query's result columns); it has no rows when the query returns none.
    """
    # Local imports keep the function usable on its own
    import os
    import mysql.connector

    db = mysql.connector.connect(
        host=os.getenv("DB_HOST"),
        user=os.getenv("DB_USER"),
        password=os.getenv("DB_PASSWORD"),
        database=os.getenv("DB_NAME")
    )
    try:
        # The cursor must come from this connection; previously `cursor` and
        # `conn` were never defined inside the function.
        cursor = db.cursor()
        try:
            cursor.execute("""
        SELECT r.user_id, r.vendor_id, r.rating, r.timestamp,
               u.created_at AS account_creation_date
        FROM reviews r
        JOIN users u ON r.user_id = u.user_id
    """)
            rows = cursor.fetchall()
            # Rows are plain tuples; take the column names from the cursor
            # metadata so later steps can address columns by name.
            columns = [column[0] for column in cursor.description]
        finally:
            cursor.close()
    finally:
        db.close()
    return pd.DataFrame(rows, columns=columns)

# Step 2: Preprocess data
def preprocess_data(df):
    """Derive the model features from the raw review frame.

    Note that ``df`` is modified in place: its two date columns are converted
    to datetimes and the columns ``account_age``, ``hour`` and
    ``vendor_encoded`` are added.

    Args:
        df: DataFrame with ``rating``, ``vendor_id``, ``timestamp`` and
            ``account_creation_date``.

    Returns:
        DataFrame with the columns ``rating``, ``account_age``, ``hour`` and
        ``vendor_encoded``.
    """
    for date_column in ('timestamp', 'account_creation_date'):
        df[date_column] = pd.to_datetime(df[date_column])

    reviewed_at = df['timestamp']
    # Whole days between account creation and the review
    df['account_age'] = (reviewed_at - df['account_creation_date']).dt.days
    # Hour of day (0-23) at which the review was posted
    df['hour'] = reviewed_at.dt.hour
    # Integer code per distinct vendor id
    df['vendor_encoded'] = LabelEncoder().fit_transform(df['vendor_id'])

    feature_columns = ['rating', 'account_age', 'hour', 'vendor_encoded']
    return df[feature_columns]

# Step 3: Build and train pipeline
def train_pipeline(X):
    """Fit a one-step pipeline wrapping an IsolationForest on ``X``.

    Args:
        X: Feature table to fit the model on.

    Returns:
        The fitted ``Pipeline`` whose single step is named ``'model'``.
    """
    detector = IsolationForest(n_estimators=100, contamination=0.15, random_state=42)
    fitted = Pipeline(steps=[('model', detector)])
    fitted.fit(X)
    return fitted

# Step 4: Full process to train only (no saving)
def run():
    """Fetch the reviews, build features and train the anomaly model.

    Progress is reported with ``print``. Nothing is written to disk.

    Returns:
        The trained pipeline, or ``None`` when the database returned no rows.
    """
    print("📥 Fetching data...")
    reviews = fetch_data()

    if not reviews.empty:
        print("🔧 Preprocessing and training model...")
        features = preprocess_data(reviews)
        trained = train_pipeline(features)
        print("✅ Model trained. You can now save it separately if needed.")
        return trained

    # Nothing to train on
    print("⚠️ No data found in database.")
    return None

# Optional direct run: call run() when this code executes as the main module.
# The returned pipeline is not kept here; call run() directly to capture it.
if __name__ == "__main__":
    run()


📥 Fetching data...
🔧 Preprocessing and training model...
✅ Model trained. You can now save it separately if needed.




In [15]:
import pickle
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline

# 1. Create dummy vendor review data (5 rows): three ordinary reviews from
#    older accounts followed by two low ratings from very new accounts
df = pd.DataFrame({
    'rating': [4.5, 4.6, 4.7, 1.0, 1.2],
    'account_age': [90, 80, 75, 1, 2],
    'hour': [10, 12, 9, 8, 8],
    'vendor_encoded': [0, 0, 0, 0, 0]
})

# 2. Build pipeline with the isolation forest as its only step
detector = IsolationForest(n_estimators=100, contamination=0.15, random_state=42)
pipeline = Pipeline(steps=[('model', detector)])

# 3. Train the model
pipeline.fit(df)

# 4. Save it as anomaly_model.pkl in the same folder
with open("anomaly_model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)

print("✅ anomaly_model.pkl has been created and saved!")




✅ anomaly_model.pkl has been created and saved!
