


Anomaly Detection Using

*   Local Outlier Factor
*   Isolation Forest
*   DBSCAN




In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN

# --- Input -----------------------------------------------------------------
data = pd.read_csv("sample30.csv")

# --- User ids --------------------------------------------------------------
# Rows without a username are all mapped to the single placeholder "unknown"
# before label-encoding.
user_encoder = LabelEncoder()
usernames = data['reviews_username'].fillna("unknown")
data['user_id'] = user_encoder.fit_transform(usernames)

# --- Missing values --------------------------------------------------------
# Median-impute the three columns that feed the per-user aggregates.
imputed_columns = ['reviews_rating', 'reviews_didPurchase', 'reviews_doRecommend']
imputer = SimpleImputer(strategy='median')
data[imputed_columns] = imputer.fit_transform(data[imputed_columns])

# --- Per-user behaviour profile --------------------------------------------
# One row per user: output column -> (source column, aggregation).
aggregation_spec = {
    'avg_rating': ('reviews_rating', 'mean'),
    'total_reviews': ('user_id', 'count'),
    'purchase_rate': ('reviews_didPurchase', 'mean'),
    'recommend_rate': ('reviews_doRecommend', 'mean'),
}
user_behavior = data.groupby('user_id').agg(**aggregation_spec).reset_index()

# --- Standardisation -------------------------------------------------------
features = ['avg_rating', 'total_reviews', 'purchase_rate', 'recommend_rate']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(user_behavior[features])
user_behavior[features] = scaled_values

# --- Detectors -------------------------------------------------------------
# Each detector's fit_predict output is stored as-is; the value -1 is what
# marks a user as anomalous in the boolean flag column.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
dbscan = DBSCAN(eps=0.5, min_samples=5)

detector_columns = (
    (iso_forest, 'iso_anomaly_score', 'iso_is_anomalous'),
    (lof, 'lof_anomaly_score', 'lof_is_anomalous'),
    (dbscan, 'dbscan_labels', 'dbscan_is_anomalous'),
)
for detector, label_column, flag_column in detector_columns:
    user_behavior[label_column] = detector.fit_predict(user_behavior[features])
    user_behavior[flag_column] = user_behavior[label_column] == -1

# --- Attach the per-user flags to every review row -------------------------
flag_columns = ['iso_is_anomalous', 'lof_is_anomalous', 'dbscan_is_anomalous']
user_flags = user_behavior[['user_id'] + flag_columns]
data = data.merge(user_flags, on='user_id', how='left')

# --- Output ----------------------------------------------------------------
data.to_csv("processed_data.csv", index=False)

# --- Summary ---------------------------------------------------------------
summary_lines = {
    "Anomalous Users Detected by Isolation Forest:": 'iso_is_anomalous',
    "Anomalous Users Detected by LOF:": 'lof_is_anomalous',
    "Anomalous Users Detected by DBSCAN:": 'dbscan_is_anomalous',
}
for message, flag_column in summary_lines.items():
    print(message, user_behavior[flag_column].sum())


Anomalous Users Detected by Isolation Forest: 1244
Anomalous Users Detected by LOF: 377
Anomalous Users Detected by DBSCAN: 14


## **Recommendation System Using TF-IDF and Nearest Neighbors**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

# Load Data
data = pd.read_csv("sample30.csv")

# Encode Users
# Missing usernames are replaced with the placeholder "unknown" before
# encoding (the same placeholder the anomaly-detection cell uses), so the
# encoder only ever sees strings instead of a mix of strings and NaN.
user_encoder = LabelEncoder()
data['user_id'] = user_encoder.fit_transform(data['reviews_username'].fillna("unknown"))

# Encode Products
product_encoder = LabelEncoder()
data['product_id'] = product_encoder.fit_transform(data['name'])

# Compute TF-IDF Vectors for Categories
# One TF-IDF row per review row of `data` (same positional order).
# TfidfVectorizer rejects NaN documents, so a missing category string is
# treated as an empty document.
tfidf_vectorizer = TfidfVectorizer()
category_tfidf_matrix = tfidf_vectorizer.fit_transform(data['categories'].fillna(""))

# Fit Nearest Neighbors Model
# Brute-force cosine search over the per-row category vectors.
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(category_tfidf_matrix)

# Recommendation Function using TF-IDF + Nearest Neighbors
def recommend_products(username, num_recommendations=3):
    """Recommend unreviewed products with categories similar to the user's.

    For each product the user has reviewed, the nearest rows in the category
    TF-IDF space are inspected and up to ``num_recommendations`` distinct
    products that the user has NOT already reviewed are collected. The
    per-product results are then merged.

    Args:
        username: Name looked up in ``user_encoder.classes_``.
        num_recommendations: Maximum number of new products gathered for each
            product the user reviewed. Values below 1 yield no
            recommendations.

    Returns:
        ``[]`` (after printing "User not found!") when ``username`` is not a
        known class of ``user_encoder``; otherwise the array returned by
        ``product_encoder.inverse_transform`` for the merged product ids,
        ordered by encoded product id so repeated calls give the same order.
    """
    if username not in user_encoder.classes_:
        print("User not found!")
        return []

    user_id = user_encoder.transform([username])[0]
    # Positional view of the product id of every review row; row i here
    # corresponds to row i of category_tfidf_matrix.
    all_product_ids = data['product_id'].to_numpy()
    user_products = data.loc[data['user_id'] == user_id, 'product_id'].unique()
    already_reviewed = set(user_products)

    wanted = max(num_recommendations, 0)
    n_rows = category_tfidf_matrix.shape[0]

    recommended_items = set()
    for product in user_products:
        # Any row of this product can serve as the query.
        product_idx = np.flatnonzero(all_product_ids == product)[0]
        query = category_tfidf_matrix[product_idx]

        # The index holds one vector per review row, so the closest hits are
        # frequently further rows of the same (or another already-reviewed)
        # product. Widen the search until enough distinct new products are
        # found or every row has been considered.
        n_neighbors = min(wanted + 1, n_rows)
        while True:
            _, indices = nn_model.kneighbors(query, n_neighbors=n_neighbors)
            fresh = []
            for candidate in all_product_ids[indices[0]]:
                if candidate not in already_reviewed and candidate not in fresh:
                    fresh.append(candidate)
            if len(fresh) >= wanted or n_neighbors >= n_rows:
                break
            n_neighbors = min(n_neighbors * 2, n_rows)

        # `fresh` is ordered nearest-first; keep only the closest ones.
        recommended_items.update(fresh[:wanted])

    return product_encoder.inverse_transform(sorted(recommended_items))

# Demo: fetch recommendations for one reviewer (default count) and show them.
example_username = "joshua"
recommended_items = recommend_products(example_username)
print("Recommended Products:", recommended_items)

Recommended Products: ['Red (special Edition) (dvdvideo)'
 'Dark Shadows (includes Digital Copy) (ultraviolet) (dvdvideo)'
 "Jason Aldean - They Don't Know"
 "Cheetos Crunchy Flamin' Hot Cheese Flavored Snacks"
 'Smead174 2 1/4 Inch Accordion Expansion Wallet, Poly, Letter, Translucent Green'
 'Smead174 Recycled Letter Size Manila File Backs W/prong Fasteners, 2 Capacity, 100/box']


In [None]:
# List the distinct product names reviewed under the username "joshua",
# for comparison with the recommendations printed above.
is_joshua_row = data['reviews_username'] == "joshua"
user_reviewed_items = data.loc[is_joshua_row, 'name'].unique()
print("Joshua's Reviewed Products:", user_reviewed_items)

Joshua's Reviewed Products: ['Pink Friday: Roman Reloaded Re-Up (w/dvd)'
 'Dark Shadows (includes Digital Copy) (ultraviolet) (dvdvideo)'
 'Red (special Edition) (dvdvideo)'
 'Smead174 Recycled Letter Size Manila File Backs W/prong Fasteners, 2 Capacity, 100/box'
 "Cheetos Crunchy Flamin' Hot Cheese Flavored Snacks"]
