In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import string
import nltk
import random
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings("ignore") #, category=DeprecationWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pd.set_option('display.max_colwidth', 500)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load Office Product Amazon Reviews as pandasDF
def parse(path):
    g = gzip.open(path, "rb")
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient="index")
# For colab
data_path = "/content/drive/MyDrive/NLP/Office_Products_5.json.gz" #Change1: "../Office_Products_5.json.gz"
data = getDF(data_path)

data.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

In [4]:
# Considered only required columns for further analysis
df = data[["reviewerID","asin","overall","reviewText","summary"]]

print("before", df[df["reviewText"].isnull()].overall.value_counts())

# Initial trail deleting the rows with null values
df.dropna(subset=['reviewText'], inplace=True)

print("after", df[df["reviewText"].isnull()].overall.value_counts())
df["overall"].value_counts()

before 5.0    184
4.0     25
3.0      2
2.0      2
Name: overall, dtype: int64
after Series([], Name: overall, dtype: int64)


5.0    570809
4.0    122864
3.0     50700
1.0     31697
2.0     24074
Name: overall, dtype: int64

In [5]:
sentiment_values = []
for i in df['overall'].values:
    if (i >= 4.):
        sentiment_values.append('positive')
    elif (i < 4. or i > 2.):
        sentiment_values.append('neutral')
    elif (i <= 2.):
        sentiment_values.append('negative')

df['sentiment'] = sentiment_values
df.head()

Unnamed: 0,reviewerID,asin,overall,reviewText,summary,sentiment
0,A2NIJTYWADLK57,140503528,4.0,kids like story BUT while i really wanted a board book this one is just so small. wish larger,"good story, small size book though",positive
1,A2827D8EEURMP4,140503528,4.0,"Bought this used and it came in great condition, almost like it had never been read. The story isn't as good as the original corduroy but my husband had this version as a child so had to add it to our daughter collection",Good,positive
2,APB6087F4J09J,140503528,5.0,Every story and book about Corduroy is Fantastic. This book is great and I bought all the Corduroy books for my 2 boys and now for their total of 5 children. You have to buy a Corduroy bear for everyone who has the books. Love to hold them while the stories are read.,Best Books for All Children,positive
3,A2DHERRZIPFU7X,140503528,5.0,"I purchased this book for my first grade classroom. I read the book to the students during a math lesson. The first day, I split the students into groups and had them count how many pockets they had all together. They recorded it on a chart and answered questions like: Who had the most? Who had the least? Choose two numbers and compare using <,>,=. The second day we referenced the book and compared pockets of the boys vs the girls. They again answered questions about the numbers. They loved ...",Great for Math!,positive
4,A2XCLJRGFANRC,140503528,5.0,"Having spent numerous years in an elementary school library, I can say with all honesty that the Corduroy books were great favorites for all the years I was there. A Pocket for Corduroy is a charming addition to anyone's collection. It is a very sweet story about the owner of Corduroy who leaves him in the laundry and has some difficulties retrieving him. Children can relate to that quite well. Getting him back is a good lesson in itself. Sewing the pocket on for him is delightful. It has a ...",Love Corduroy,positive


In [6]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    """
    Preprocesses a given text by performing the following steps:
    1. Convert text to lowercase
    2. Remove digits
    3. Remove punctuations
    4. Remove special characters
    5. Remove stop words
    6. Lemmatize the text
    
    Args:
    text (str): The text to preprocess
    
    Returns:
    str: The preprocessed text
    """
    # Convert text to lowercase
    text = text.lower()
    
    # Remove digits
    text = ''.join([i for i in text if not i.isdigit()])
    
    # Remove punctuations
    text = ''.join([i for i in text if i not in string.punctuation])
    
    # Remove special characters
    text = ''.join(e for e in text if e.isalnum() or e.isspace())
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if not word in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join the words back into a single string
    text = ' '.join(words)
    
    return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
from gensim.models import LdaModel, CoherenceModel
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'][:10000], df['sentiment'][:10000], test_size=0.3, stratify=df['sentiment'][:10000], random_state=42)

print("Shape of training data: ", X_train.shape)
print("Shape of testing data: ", X_test.shape)
print("Shape of training labels: ", y_train.shape)
print("Shape of testing labels: ", y_test.shape)
final_df = df[:20000]

Shape of training data:  (7000,)
Shape of testing data:  (3000,)
Shape of training labels:  (7000,)
Shape of testing labels:  (3000,)


In [9]:
from sklearn import svm

# Train the SVC classifier for sentiment analysis
vectorizer = TfidfVectorizer(preprocessor=preprocess_text)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

svc_classifier = svm.SVC(probability=True, random_state = 42)
%time svc_classifier.fit(X_train_vectorized, y_train)

# Test the classifier's performance
y_pred = svc_classifier.predict(X_test_vectorized)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Apply the classifier to the Amazon product review dataset
final_df['sentiment_label'] = svc_classifier.predict(vectorizer.transform(final_df['reviewText']))
final_df[['sentiment_label', 'sentiment']]

CPU times: user 48.4 s, sys: 74.9 ms, total: 48.5 s
Wall time: 57.6 s
Accuracy: 0.903
              precision    recall  f1-score   support

     neutral       0.95      0.06      0.11       308
    positive       0.90      1.00      0.95      2692

    accuracy                           0.90      3000
   macro avg       0.93      0.53      0.53      3000
weighted avg       0.91      0.90      0.86      3000



Unnamed: 0,sentiment_label,sentiment
0,positive,positive
1,positive,positive
2,positive,positive
3,positive,positive
4,positive,positive
...,...,...
20000,positive,positive
20001,positive,positive
20002,positive,positive
20003,positive,positive


In [10]:
# Topic modeling (using NMF)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, preprocessor=preprocess_text)
X = tfidf_vectorizer.fit_transform(final_df['reviewText'])
nmf = NMF(n_components=20, random_state=42).fit(X)
W = nmf.transform(X)

# Map dominant topics to the original dataset
final_df['dominant_topic'] = W.argmax(axis=1)

In [11]:
# User profile building
user_profiles = final_df.groupby('reviewerID').agg({
    'sentiment_label': lambda x: x.value_counts().index[0],  # Most frequent sentiment label
    'reviewText': ' '.join,
    'dominant_topic': lambda x: x.value_counts().index[0]
}).reset_index()

user_profiles

Unnamed: 0,reviewerID,sentiment_label,reviewText,dominant_topic
0,A0220159ZRNBTRKLG08H,positive,"The quality of this note book is excellent.\nI was expecting lines on the paper, but my fault for not fully ready the description.\n\nThis note book is going to be great for drawings a notes.\n\nLove it!",9
1,A0526222H977CBZM4DK7,positive,EXCELLENT,9
2,A1009UWCCRSHY7,positive,love this stuff.,5
3,A100DO844MBA4W,positive,I've always liked this product. It's better than regular double sided tape. The cushion really helps.,15
4,A100WFKYVRPVX7,positive,"Great bookmark. For gothic and vampire fans or anyone who reads, for novelty collectors, this is a pretty good item. Bought mine for Pathfinder game books. Beaded part is a nice add to the vampire art design. Comes with a plastic covering protector that you can either leave inside or use just use the bookmark as is. The bookmark is 7in long by 2 1/2 in wide, art image of vampire is only on one side. Back side is blank. Artist is Victoria Frances. This Sun beaded bookmark is very nice, simple...",18
...,...,...,...,...
15409,AZZ5AADBV2YG5,positive,My son is 3 and just learning to read and these are fabulous I highly recommend them,3
15410,AZZHILYMITLGM,positive,I am just experiencing this product and it is such a time saver! I would recommend this product to anyone.,4
15411,AZZKYP9254H32,positive,"On day three, but love it so far. Very clean and simple, but effective. Sticking to the pomederos has been helpful to me. Love the clean layout. Great pocket notebooks. I do like the Field Notes a little better, but these are great as well. I do like these notebooks, but I like the field notes better.",5
15412,AZZV9PDNMCOZW,positive,"This is my standby packaging tape. Easy to use, great adhesion, and good price.",18


In [12]:
W.argmax(axis=1).shape, user_profiles.shape

((20000,), (15414, 4))

In [13]:
# Product profile building
product_profiles = final_df.groupby('asin').agg({
    'sentiment_label': lambda x: x.value_counts().index[0],  # Most frequent sentiment label
    'reviewText': ' '.join,
    'dominant_topic': lambda x: x.value_counts().index[0] # # Assign the dominant topic for each product
}).reset_index()

product_profiles

Unnamed: 0,asin,sentiment_label,reviewText,dominant_topic
0,0140503528,positive,"kids like story BUT while i really wanted a board book this one is just so small. wish larger Bought this used and it came in great condition, almost like it had never been read. The story isn't as good as the original corduroy but my husband had this version as a child so had to add it to our daughter collection Every story and book about Corduroy is Fantastic. This book is great and I bought all the Corduroy books for my 2 boys and now for their total of 5 children. You have to buy a Cord...",3
1,0310432065,positive,"What I needed perfect for my larger bible NIV Zondervan study bible. Bought for a gift Wonderful My husband loves this for carrying his large Bible, notes and notepad. Perfect! I have a very large , large -print bible which won't fit in anything so tried this one but it is way way way too big Very sturdy Excellent item! Lots of room, pockets etc... Fits the Bible well. I wish it was more secure though. It was too small for my Dake Study Bible and too big for my current Bible but using it an...",3
2,0310520347,positive,"My youngest daughter picked this out and she especially loves the pocket to keep her highlighters and pens. Very sturdy. Good buy. This case was just right for my daughter's Adventure Bible. It is soft, pretty, and stylish. She love it and loves carrying it. grandkids love'em Wonderful Product!!! Okay case for the Adventure Bible. Won't last long because of cheapness.\n\nI bought this for my 7yo's adventure Bible to help keep it clean and protected. The material does seem a bit cheap, but ...",5
3,0310802636,positive,Great product. Perfect size. It's soft. It's easy to carry and his Adventure Bible fits perfectly. perfect fit for the bible I purchased for a child. Nice quality and has a storage spot for a pencil as well as a front pocket. grandkids love'em My great grandson loved it! It has plenty of pockets & the Adventure Bible fits perfectly. Okay case for the Adventure Bible. Won't last long because of cheapness.\n\nI bought this for my 3yo's adventure Bible to help keep it clean and protected. T...,3
4,0310806607,positive,"These Bible Book Covers make great covers for Boy Scout Handbooks.\n\nThe Medium in particular has several extra pockets on the front for scouting doodads.\n\nMy son uses the Medium - perfect size for even the ring bound version of the handbook (highly recommend the ring bound over the perfect bound for handbooks - spend the extra dollar or two) - and far preferable (to us) to the velcro closure BSA one (that doesn't have room for much else either).\n\nMy son thought, from pictures, that the...",3
...,...,...,...,...
711,B00004Z6K4,positive,"Getting, and staying, organized is a big challenge. A few years ago I bought a used label printer that uses these labels. I wore that one out, and purchased a new one that also uses these labels. These are labels are a good value! This was an upgrade from a Turbo330 for me. The product works well for my uses. I use it at home and to make labels for my classroom. I have no problems with it at all. Works great. I use them to hold up ta gets at gun range. They work fine and I don't have to p...",12
712,B00004Z6KT,positive,"As expected it was what it was. Use them all the time, especially at Christmas time. Timely delivery. Just what I needed. I've used these for years. Perfect for my HP printer. Great Avery products, although I got a box of 100 sheets when it should have been 250, which means I lost out on quantity pricing with my printer. Pretty easy and straight forward to use. They have a native client too if you don't want to use the web client. Works on MAC. Very nice item with quick shipping. Thank you!...",6
713,B00004Z6N4,positive,"My husband installed this under my countertop for my computer. Works great. For the price, it is good buy. I went for this instead of fancy flexible ones as most of those under 100 $ were reporting wobble. This one is not that flexible but has no wobble issue. It is wide enough to hold keyboard and mouse. You will need to adjust chair height as Keyboard tray height is fixed at installation time. It does not tilt etc. Works very well, steady. Takes 2 people to install but that is to be expe...",3
714,B00004Z6RY,positive,I use these labels frequently and they are still the best for me. They don't stick in my laser printer and are easy to peel off.,18


In [14]:
# Similarity computation
# Calculate user-user similarity
user_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, preprocessor=preprocess_text)
user_X = user_vectorizer.fit_transform(user_profiles['reviewText'])
user_similarities = cosine_similarity(user_X)

In [15]:
# Calculate product-product similarity
product_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, preprocessor=preprocess_text)
product_X = product_vectorizer.fit_transform(product_profiles['reviewText'])
product_similarities = cosine_similarity(product_X)

In [16]:
# Recommendations and applications
# For the sake of simplicity, we will focus on user-based collaborative filtering
def get_similar_users(user_id, n=10):
    user_idx = user_profiles[user_profiles['reviewerID'] == user_id].index[0]
    similar_users_idx = np.argsort(user_similarities[user_idx])[::-1][:n+1]  # Including the user itself
    similar_users = user_profiles.iloc[similar_users_idx]['reviewerID'].tolist()
    return similar_users[1:]  # Excluding the user itself

def get_recommendations(user_id, n=10):
    similar_users = get_similar_users(user_id)
    similar_users_reviews = final_df[final_df['reviewerID'].isin(similar_users)].groupby('asin')['overall'].mean().reset_index()
    similar_users_reviews = similar_users_reviews.sort_values('overall', ascending=False).head(n)
    return similar_users_reviews['asin'].tolist()

In [17]:
# Test the recommendations
user_id = 'A2NIJTYWADLK57'
recommended_products = get_recommendations(user_id)
print(f'Recommended products for user {user_id}: {recommended_products}')

Recommended products for user A2NIJTYWADLK57: ['1465056610', '8862930003', 'B00000JRRD', 'B00004Z4BR', 'B00004Z5T0', 'B00004Z5TU', '1936266253', '886613645X', 'B00004TS18', '0991557530']


In [18]:
# 7. Evaluation

def get_ground_truth(user_id):
    return final_df[final_df['reviewerID'] == user_id]['asin'].tolist()

def precision_at_k(user_id, k=10):
    recommended_products = get_recommendations(user_id, n=k)
    ground_truth = get_ground_truth(user_id)
    tp = len(set(recommended_products) & set(ground_truth))
    return tp / k

def recall_at_k(user_id, k=10):
    recommended_products = get_recommendations(user_id, n=k)
    ground_truth = get_ground_truth(user_id)
    tp = len(set(recommended_products) & set(ground_truth))
    return tp / len(ground_truth)

In [19]:
# Calculate precision@k and recall@k for a sample of users
sample_users = final_df['reviewerID'].sample(50).tolist()
precisions = [precision_at_k(user_id) for user_id in sample_users]
recalls = [recall_at_k(user_id) for user_id in sample_users]

print(f"Average precision@10: {np.mean(precisions)}")
print(f"Average recall@10: {np.mean(recalls)}")

Average precision@10: 0.068
Average recall@10: 0.515
