In [None]:
!pip install pandas
!pip install scikit-learn
!pip install lightfm
!pip install vaderSentiment


Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=808329 sha256=a88523cbadd8f38494780e5226e492761ea2a824b5ed62256215204960d441e7
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.4 MB/s[0m eta [36m0

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the datasets: one table of product reviews and one product list.
# NOTE(review): the absolute '/content/...' paths assume both CSVs were uploaded
# to that directory (presumably a Colab session) -- confirm before running elsewhere.
reviews_df = pd.read_csv('/content/productReviews.csv')
products_df = pd.read_csv('/content/productlist.csv')


In [None]:
# Create the VADER sentiment analyzer that is used to score the review text
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Function to perform sentiment analysis
def analyze_sentiment(review):
    """Convert a review's text into a coarse 1/3/5 star rating.

    Parameters
    ----------
    review : str
        Raw review text. Missing values (``None`` / ``NaN``), other non-string
        values and blank strings are treated as neutral.

    Returns
    -------
    int
        ``5`` when the compound sentiment score is >= 0.05, ``1`` when it is
        <= -0.05, and ``3`` otherwise.
    """
    # pandas represents missing text as a float NaN (or None). Short-circuit so
    # one missing review cannot abort a whole-column .apply().
    if not isinstance(review, str) or not review.strip():
        return 3  # No usable text -> neutral 3 star rating

    score = analyzer.polarity_scores(review)['compound']
    if score >= 0.05:
        return 5  # Positive review -> 5 star rating
    elif score <= -0.05:
        return 1  # Negative review -> 1 star rating
    else:
        return 3  # Neutral review -> 3 star rating


In [None]:
# Derive a 1/3/5 star rating for every review from the sentiment of its text
reviews_df['generated_rating'] = reviews_df['review'].map(analyze_sentiment)

In [None]:
# Attach product details to each review; rows without a match on product_id are dropped
merged_df = reviews_df.merge(products_df, how='inner', on='product_id')


In [None]:
# Prepare data for recommendation system:
# label encoders that turn raw user / product identifiers into integer indices
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()


In [None]:
# Encode users and products as integer indices (used as the rows / columns of
# the interaction matrix).
# NOTE(review): 'Unnamed: 0' is the name pandas gives to an unnamed CSV column,
# so this is presumably a saved row index rather than a real user identifier --
# confirm that this column actually identifies reviewers.
merged_df['user_id_encoded'] = user_encoder.fit_transform(merged_df['Unnamed: 0'])
merged_df['product_id_encoded'] = item_encoder.fit_transform(merged_df['product_id'])


In [None]:
# Create interaction matrix for the BPR model.
# Several reviews can map to the same (user, product) pair. A COO matrix keeps
# such duplicates as separate stored entries and they are *summed* when the
# matrix is converted to CSR or a dense array, which inflates the ratings and
# the per-user interaction counts. Collapse every pair to its mean rating first
# (the rating values are unchanged when there are no duplicates).
pair_ratings = merged_df.groupby(
    ['user_id_encoded', 'product_id_encoded'], as_index=False
)['generated_rating'].mean()

# Pass the shape explicitly so it always matches the encoders' vocabularies
# instead of being inferred from the largest index present.
interaction_matrix = coo_matrix(
    (pair_ratings['generated_rating'].to_numpy(),
     (pair_ratings['user_id_encoded'].to_numpy(),
      pair_ratings['product_id_encoded'].to_numpy())),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
)

In [None]:
# Initialize the Bayesian Personalized Ranking (BPR) model.
# Seed it so the random initialisation is repeatable between runs.
# NOTE(review): multi-threaded fitting may still not be bit-for-bit
# reproducible -- confirm if exact reproducibility matters.
bpr_model = LightFM(loss='bpr', random_state=42)

In [None]:
# Split data into train and test sets.
# sklearn's train_test_split splits along the first axis, i.e. it would hold out
# whole user rows and renumber the remaining ones, so the test users would never
# be seen by the model. LightFM's splitter instead holds out individual
# interactions and keeps both matrices at the full (users x items) shape.
train_matrix, test_matrix = random_train_test_split(
    interaction_matrix,
    test_percentage=0.2,
    random_state=np.random.RandomState(42),
)


In [None]:
# Train the BPR model on the training split (30 epochs, 4 threads)
bpr_model.fit(train_matrix, epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x79240d606410>

In [None]:
# Function to recommend similar products
def recommend_products(product_id, model, interaction_matrix, n=5, user_id=0):
    """Return up to ``n`` top-scoring product ids, excluding ``product_id``.

    NOTE(review): the scores are the model's predictions for ``user_id`` and do
    not depend on ``product_id``; the input product is only used for exclusion.

    Parameters
    ----------
    product_id
        Identifier of the product to exclude; must be known to ``item_encoder``.
    model
        Fitted model exposing ``predict(user_ids, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; only its number of columns is used.
    n : int
        Maximum number of recommendations to return.
    user_id : int
        Encoded user whose scores are used (defaults to user 0, as before).

    Returns
    -------
    list
        Original (decoded) product ids, best score first.

    Raises
    ------
    ValueError
        Raised by ``item_encoder.transform`` when ``product_id`` is unknown.
    """
    # Also validates that the product is known to the encoder.
    product_encoded = item_encoder.transform([product_id])[0]

    # Score every item for the chosen user. (The original call passed item_ids
    # both positionally and by keyword, which raises TypeError.)
    scores = model.predict(user_id, np.arange(interaction_matrix.shape[1]))

    ranked = np.argsort(-scores)
    # Drop the input product *before* truncating so n results are still
    # returned, and compare encoded indices so a str-vs-number mismatch in the
    # raw ids cannot defeat the filter.
    ranked = ranked[ranked != product_encoded][:n]

    return list(item_encoder.inverse_transform(ranked))

In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from scipy.sparse import coo_matrix

# Prepare the interaction matrix.
# Several reviews can map to the same (user, product) pair. A COO matrix keeps
# such duplicates as separate stored entries and they are *summed* when the
# matrix is converted to CSR or a dense array, so collapse every pair to its
# mean rating first (the rating values are unchanged without duplicates).
pair_ratings = merged_df.groupby(
    ['user_id_encoded', 'product_id_encoded'], as_index=False
)['generated_rating'].mean()

# Explicit shape keeps the matrix aligned with the encoders' vocabularies.
interaction_matrix = coo_matrix(
    (pair_ratings['generated_rating'].to_numpy(),
     (pair_ratings['user_id_encoded'].to_numpy(),
      pair_ratings['product_id_encoded'].to_numpy())),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
)

# Initialize the Bayesian Personalized Ranking (BPR) model (seeded so the
# random initialisation is repeatable between runs)
bpr_model = LightFM(loss='bpr', random_state=42)

# Train the BPR model (fitting the full interaction matrix)
bpr_model.fit(interaction_matrix, epochs=30, num_threads=4)

# Function to recommend similar products
import numpy as np

def recommend_products(product_id, model, interaction_matrix, n=5, user_id=0):
    """Return up to ``n`` top-scoring product ids, excluding ``product_id``.

    NOTE(review): the scores are the model's predictions for ``user_id`` and do
    not depend on ``product_id``; the input product is only used for exclusion.

    Parameters
    ----------
    product_id
        Identifier of the product to exclude; must be known to ``item_encoder``.
    model
        Fitted model exposing ``predict(user_ids, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; only its number of columns is used.
    n : int
        Maximum number of recommendations to return.
    user_id : int
        Encoded user whose scores are used (defaults to user 0, as before).

    Returns
    -------
    list
        Original (decoded) product ids, best score first.
    """
    product_encoded = item_encoder.transform([product_id])[0]  # Get encoded product_id

    # Predict scores for all items for the chosen user
    scores = model.predict(user_id, np.arange(interaction_matrix.shape[1]))
    ranked = np.argsort(-scores)  # Best score first

    # Exclude the input product by its *encoded* index. Comparing raw labels
    # silently fails when the caller passes a str and the encoder holds
    # numbers, which is the case in the notebook's own example.
    ranked = ranked[ranked != product_encoded][:n]

    # Get the original product_ids from encoded ones
    return list(item_encoder.inverse_transform(ranked))


# Example: recommendations for one product.
# NOTE(review): the id is passed as a string, while the recorded output shows
# the recommended ids as floats -- confirm the intended product_id type.
product_id_to_recommend = '3935400000000'
recommendations = recommend_products(product_id_to_recommend, bpr_model, interaction_matrix)
print(f"Recommended products for {product_id_to_recommend}: {recommendations}")

# Evaluate the model.
# The model above was fitted on this same interaction_matrix, so this is an
# in-sample (training) score rather than a held-out estimate.
precision = precision_at_k(bpr_model, interaction_matrix, k=5).mean()
print(f'Precision@5: {precision}')


Recommended products for 3935400000000: [4098700000000.0, 4347570000000.0, 4567340000000.0, 4337070000000.0, 4498900000000.0]
Precision@5: 0.17741939425468445


In [None]:
def recommend_products(product_id, model, interaction_matrix, n=5, user_id=0):
    """Return product-list rows for the top-scoring products, excluding ``product_id``.

    NOTE(review): the scores are the model's predictions for ``user_id`` and do
    not depend on ``product_id``; the input product is only used for exclusion.

    Parameters
    ----------
    product_id
        Identifier of the product to exclude; must be known to ``item_encoder``.
    model
        Fitted model exposing ``predict(user_ids, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; only its number of columns is used.
    n : int
        Number of product ids to select and maximum number of rows returned.
    user_id : int
        Encoded user whose scores are used (defaults to user 0, as before).

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows of ``products_df``, ordered best score first.
    """
    product_encoded = item_encoder.transform([product_id])[0]  # Get encoded product_id

    # Predict scores for all items for the chosen user
    scores = model.predict(user_id, np.arange(interaction_matrix.shape[1]))
    ranked = np.argsort(-scores)  # Best score first

    # Exclude the input product by its *encoded* index. Comparing raw labels
    # silently fails when the caller passes a str and the encoder holds numbers.
    ranked = ranked[ranked != product_encoded][:n]

    # Get the original product_ids from encoded ones
    recommended_product_ids = list(item_encoder.inverse_transform(ranked))

    # Get all details from the productlist dataset for the recommended products.
    # isin() returns rows in product-list order, so re-order them by
    # recommendation rank before truncating; a stable sort keeps the list order
    # among rows that share a product_id.
    rank_of = {pid: rank for rank, pid in enumerate(recommended_product_ids)}
    matches = products_df[products_df['product_id'].isin(recommended_product_ids)]
    order = matches['product_id'].map(rank_of).to_numpy().argsort(kind='stable')
    return matches.iloc[order].head(n)

# Example usage: print the product-list details of the recommendations for one product.
# NOTE(review): the id is passed as a string, while the recorded output shows
# product_id as a float column -- confirm the intended product_id type.
product_id_to_recommend = '3935400000000'
recommendations = recommend_products(product_id_to_recommend, bpr_model, interaction_matrix)
print(f"Recommended products for {product_id_to_recommend}:\n{recommendations}")


Recommended products for 3935400000000:
     Unnamed: 0    product_id                                  product_name  \
27           33  4.098700e+12                         Living Cleansing Balm   
45           55  4.098700e+12                    Soothing Tea Cleansing Gel   
59           69  4.567340e+12                  Green Tangerine Vita C Serum   
112         139  4.567340e+12              Green Tangerine Vita C Toner Pad   
113         140  4.567340e+12  Green Tangerine Vita C Serum Mask (5 sheets)   

      product_brand   price  \
27   Then I Met You     $38   
45   Then I Met You     $36   
59           GOODAL  $19.99   
112          GOODAL     $24   
113          GOODAL     $15   

                                   product_description product_type  
27   Exclusive to Soko Glam!This 12x award-winning ...     Cleanser  
45   Exclusive to Soko Glam!Founder Charlotte Cho's...     Cleanser  
59   A fast favorite, this gel-like serum is packed...        Serum  
112  In a quick sw

In [None]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357284 sha256=0e29fa684a4dad941cf53d59f125e29ac3e391cfe084227523cb054cd8b5a5fc
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np
from lightfm.evaluation import precision_at_k, recall_at_k
from scipy.sparse import csr_matrix
from sklearn.metrics import roc_auc_score


# Precision@K and Recall@K Evaluation
# Convert the interaction matrix to CSR format first. The conversion merges
# duplicate (user, item) entries, which a COO matrix stores (and counts)
# separately, and CSR gives efficient row indexing for the per-user loops below.
# NOTE(review): the recorded Recall@5 (~0.02) next to Hit Rate@5 (~0.89)
# suggests duplicate entries were inflating recall's per-user denominator --
# confirm after re-running.
interaction_matrix_csr = interaction_matrix.tocsr()

# The model was fitted on this same matrix, so these are in-sample scores.
precision = precision_at_k(bpr_model, interaction_matrix_csr, k=5).mean()
recall = recall_at_k(bpr_model, interaction_matrix_csr, k=5).mean()

print(f'Precision@5: {precision}')
print(f'Recall@5: {recall}')

def mean_average_precision(model, interaction_matrix):
    """Mean average precision (MAP) over users with at least one interaction.

    For each user, all items are ranked by predicted score and the precision is
    averaged over the rank positions of that user's relevant items (entries
    > 0 in ``interaction_matrix``).

    Parameters
    ----------
    model
        Object exposing ``predict(user_id, item_ids)`` returning one score per item.
    interaction_matrix
        scipy sparse user x item matrix in any format.

    Returns
    -------
    float
        Mean of the per-user average precisions; ``0.0`` when no user has a
        relevant item.
    """
    # Row slicing needs CSR; converting here lets callers pass COO as well.
    interactions = interaction_matrix.tocsr()
    n_users, n_items = interactions.shape
    item_ids = np.arange(n_items)
    rank_positions = np.arange(1, n_items + 1)

    ap_scores = []
    for user_id in range(n_users):
        # Get the relevant items for this user
        relevant = interactions[user_id].toarray().ravel() > 0
        if not relevant.any():
            # No ground truth: average precision is undefined for this user, so
            # skip it instead of dragging the mean down with a 0.
            continue

        # Predict scores for all items and walk them in descending score order
        predictions = model.predict(user_id, item_ids)
        hit_mask = relevant[np.argsort(-predictions)]

        # Precision at each rank where a relevant item appears
        precision_at_hits = np.cumsum(hit_mask)[hit_mask] / rank_positions[hit_mask]
        ap_scores.append(precision_at_hits.mean())

    return float(np.mean(ap_scores)) if ap_scores else 0.0

# Compute MAP with the CSR-formatted interaction matrix
map_score = mean_average_precision(bpr_model, interaction_matrix_csr)
print(f'MAP: {map_score}')





Precision@5: 0.17741939425468445
Recall@5: 0.020119000446041778
MAP: 0.8583524151899898


In [None]:
from sklearn.metrics import ndcg_score

def compute_ndcg(model, interaction_matrix, k=5):
    """Compute NDCG@k of the model's item rankings, averaged over users.

    The stored interaction values are used as the true relevance and the
    model's predicted scores as the ranking scores.

    Parameters
    ----------
    model
        Object exposing ``predict(user_id, item_ids)``.
    interaction_matrix
        Sparse user x item matrix of true ratings.
    k : int
        Rank cut-off passed to ``ndcg_score``.

    Returns
    -------
    float
        The value returned by ``ndcg_score``.
    """
    n_users, n_items = interaction_matrix.shape
    all_items = np.arange(n_items)
    users = range(n_users)

    # One dense row of true relevance and one row of predicted scores per user
    relevance = np.array(
        [interaction_matrix.getrow(user).toarray().flatten() for user in users]
    )
    predicted = np.array([model.predict(user, all_items) for user in users])

    return ndcg_score(relevance, predicted, k=k)

# Compute NDCG at K (e.g., k=5).
# NOTE(review): assuming the cells run top to bottom, bpr_model was fitted on
# this same interaction_matrix, so this is an in-sample score.
ndcg_score_value = compute_ndcg(bpr_model, interaction_matrix, k=5)
print(f'NDCG@5: {ndcg_score_value}')


NDCG@5: 0.862412855616766


In [None]:
def compute_hit_rate_at_k(model, interaction_matrix, k=5):
    """Fraction of users with at least one interacted item in their top-k.

    Parameters
    ----------
    model
        Object exposing ``predict(user_id, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; entries > 0 mark interacted items.
    k : int
        Number of top-scored items inspected per user.

    Returns
    -------
    float
        Number of users with a hit divided by the total number of users.
    """
    n_users, n_items = interaction_matrix.shape
    candidates = np.arange(n_items)

    def _has_hit(user):
        # True when any of the user's k best-scored items has a positive rating
        observed = interaction_matrix.getrow(user).toarray().flatten()
        ranking = np.argsort(-model.predict(user, candidates))
        return bool((observed[ranking[:k]] > 0).any())

    users_with_hit = sum(1.0 for user in range(n_users) if _has_hit(user))
    return users_with_hit / n_users

# Compute Hit Rate at K (e.g., k=5).
# NOTE(review): assuming the cells run top to bottom, bpr_model was fitted on
# this same interaction_matrix, so this is an in-sample score.
hit_rate_value = compute_hit_rate_at_k(bpr_model, interaction_matrix, k=5)
print(f'Hit Rate@5: {hit_rate_value}')


Hit Rate@5: 0.8870967741935484


In [None]:
def compute_mrr_at_k(model, interaction_matrix, k=5):
    """Mean reciprocal rank of the first interacted item within the top-k.

    Users with no interacted item among their k best-scored items contribute 0.

    Parameters
    ----------
    model
        Object exposing ``predict(user_id, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; entries > 0 mark interacted items.
    k : int
        Number of top-scored items inspected per user.

    Returns
    -------
    float
        Sum of the reciprocal ranks divided by the total number of users.
    """
    n_users, n_items = interaction_matrix.shape
    item_ids = np.arange(n_items)

    reciprocal_rank_total = 0.0
    for user in range(n_users):
        observed = interaction_matrix.getrow(user).toarray().flatten()
        top_k = np.argsort(-model.predict(user, item_ids))[:k]

        # Positions (0-based) inside the top-k list that hold an interacted item
        hit_positions = np.flatnonzero(observed[top_k] > 0)
        if hit_positions.size == 0:
            continue

        # The earliest hit determines the reciprocal rank (ranks are 1-based)
        reciprocal_rank_total += 1 / (int(hit_positions[0]) + 1)

    return reciprocal_rank_total / n_users

# Compute MRR at K (e.g., k=5).
# NOTE(review): assuming the cells run top to bottom, bpr_model was fitted on
# this same interaction_matrix, so this is an in-sample score.
mrr_at_k_value = compute_mrr_at_k(bpr_model, interaction_matrix, k=5)
print(f'MRR@5: {mrr_at_k_value}')


MRR@5: 0.8539426523297492


In [None]:
def compute_coverage_at_k(model, interaction_matrix, k=5):
    """Share of all items that appear in at least one user's top-k list.

    Parameters
    ----------
    model
        Object exposing ``predict(user_id, item_ids)``.
    interaction_matrix
        Sparse user x item matrix; only its shape is used.
    k : int
        Number of top-scored items taken per user.

    Returns
    -------
    float
        Number of distinct recommended items divided by the number of items.
    """
    n_users, n_items = interaction_matrix.shape
    item_ids = np.arange(n_items)

    # Union of every user's k best-scored items
    recommended = {
        int(item)
        for user in range(n_users)
        for item in np.argsort(-model.predict(user, item_ids))[:k]
    }

    return len(recommended) / n_items

# Compute Coverage at K (e.g., k=5): the share of items that show up in at
# least one user's top-5 list
coverage_value = compute_coverage_at_k(bpr_model, interaction_matrix, k=5)
print(f'Coverage@5: {coverage_value}')


Coverage@5: 0.8207547169811321
