In [1]:
import pandas as pd

# Load the review table from a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df = pd.read_pickle("reviews_segment.pkl")
# Display the first five rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,review_id,product_id,customer_id,review_title,review_written_date,customer_name,review_from_title,review_text,helpful_count,out_of_helpful_count,customer_review_rating,number_of_comments,amazon_verified_purchase,amazon_vine_program_review,review_with_metadata
0,'R10019MUX6F9A','B00006881R','AWNC1GQ75W8K8','Works as advertised','2002-12-17','Neil','TeleZapper TZ 900 (Office Product)','I\'ve had this product for about a month and ...,7,7,4,0,0,0,"('R10019MUX6F9A', 'B00006881R', 'AWNC1GQ75W8K8..."
1,'R1002I943QCT20','B00471F0NK','A3SFG0OC59UXL5',"'macintosh version - bad graphics, bad interface'",'2007-06-05','D. Simons','null','I have been using the Macintosh OSX version o...,21,23,2,0,0,0,"('R1002I943QCT20', 'B00471F0NK', 'A3SFG0OC59UX..."
2,'R1003RILN06MX1','B0027U258Q','A2IP26LJGTJXSV','Great Software','2010-12-05','Tex','Paragon Partition Manager 10 Personal Edition...,'The Partition Manager is a great product. It\...,1,1,5,0,1,0,"('R1003RILN06MX1', 'B0027U258Q', 'A2IP26LJGTJX..."
3,'R100523NBIQIEV','B000070MRB','A2DKAPBHZ5DERR','Neutral','2004-06-07','S. Barnes','Game Programming Starter Kit 6.0 (CD-ROM)','If you plan on getting this program go to htt...,-1,-1,3,0,0,0,"('R100523NBIQIEV', 'B000070MRB', 'A2DKAPBHZ5DE..."
4,'R1006KJEGKGV0O','B001B19D7I','AMZ7EO048MCWK',"'Great seat, but don\'t like the buckle'",'2009-07-07','Cyrca','Britax Boulevard 65 TSIP Convertible Car Seat...,'I researched for months (on-line and in store...,1,1,5,0,0,0,"('R1006KJEGKGV0O', 'B001B19D7I', 'AMZ7EO048MCW..."


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Fetch the NLTK resources used by the preprocessing step (stop-word list,
# tokenizer data and WordNet for lemmatization), in a fixed order.
for resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Shared, module-level helpers reused by every later cell.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    Steps: lowercase the text, replace every character that is not a-z or
    whitespace with a space, tokenize with NLTK, drop English stop words and
    lemmatize what remains.

    Args:
        text: The raw review text (a string).

    Returns:
        The surviving lemmas joined by single spaces (may be empty).
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())

    kept = []
    for token in nltk.word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return " ".join(kept)

In [6]:
# Add a 'cleaned_review' column holding the preprocessed text of every review.
# astype(str) coerces non-string cells to text first so preprocess_text can call .lower() on them.
df['cleaned_review'] = df['review_text'].astype(str).apply(preprocess_text)

In [None]:
from collections import defaultdict

# Inverted index: each word of the cleaned reviews maps to the set of review
# IDs whose cleaned text contains that word.
word_index = defaultdict(set)

for raw_id, cleaned_text in zip(df["review_id"], df["cleaned_review"]):
    # Strip leading/trailing single-quote characters from the stored ID.
    document_id = raw_id.strip("'")

    # Visit each distinct word of the review once.
    for word in set(cleaned_text.split()):
        word_index[word].add(document_id)

# Vocabulary size of the index.
print(len(word_index))

153247


In [None]:
def boolean_baseline(aspect, opinion, operator1, operator2, operator3):
    """Baseline Boolean retrieval over the inverted index ``word_index``.

    Only the first two whitespace-separated words of ``aspect`` and of
    ``opinion`` are used; each is lowercased and lemmatized before lookup so
    that it matches the vocabulary of the index.

    Args:
        aspect: Aspect phrase of one or two words (e.g. "audio quality").
        opinion: Opinion phrase of one or two words (e.g. "poor").
        operator1: "AND" / "OR" to combine the two aspect words; any other
            value (e.g. None) uses the first aspect word only.
        operator2: "AND" / "OR" to combine the two opinion words; any other
            value uses the first opinion word only.
        operator3: "AND" / "OR" to combine the aspect and opinion result
            sets; any other value returns the aspect documents only.

    Returns:
        A new set of review IDs. The set is always a copy, so callers may
        mutate it without touching the index.

    Notes:
        * A one-word phrase combined with "AND"/"OR" yields the documents of
          that single word (previously "AND" silently produced an empty set
          because the missing second word was looked up as ``None``).
        * An empty phrase contributes an empty set instead of raising
          ``IndexError``.
    """
    def _docs_for(token):
        # Copy the posting set so the caller never aliases the index's own set.
        return set(word_index.get(lemmatizer.lemmatize(token), ()))

    def _combine_phrase(phrase, operator):
        tokens = phrase.lower().split()
        if not tokens:
            return set()

        first_docs = _docs_for(tokens[0])
        # Without a second word (or without a valid operator) there is
        # nothing to combine: the first word alone defines the result.
        if len(tokens) < 2 or operator not in ("AND", "OR"):
            return first_docs

        second_docs = _docs_for(tokens[1])
        if operator == "AND":
            return first_docs & second_docs
        return first_docs | second_docs

    aspect_docs = _combine_phrase(aspect, operator1)
    opinion_docs = _combine_phrase(opinion, operator2)

    if operator3 == "AND":
        return aspect_docs & opinion_docs
    if operator3 == "OR":
        return aspect_docs | opinion_docs
    # We return docs having the aspect if no 3rd operator is given
    return aspect_docs

In [None]:
# Example query against the baseline: aspect "audio quality", opinion "poor", every operator set to "AND".
# NOTE(review): the opinion is a single word, so operator2 has no second opinion word to combine with.
results = boolean_baseline("audio quality", "poor", operator1="AND", operator2="AND", operator3="AND")
# Print one matching review ID per line.
for val in results:
    print(val)

In [None]:
def load_opinion_lexicon(positive_file="positive-words.txt", negative_file="negative-words.txt"):
    """Read the positive and negative opinion word lists from disk.

    Each file is read as UTF-8 (undecodable bytes are ignored), one entry per
    line. Lines are stripped of surrounding whitespace; blank lines and lines
    starting with ';' (comments) are skipped.

    Args:
        positive_file: Path of the positive word list.
        negative_file: Path of the negative word list.

    Returns:
        A ``(positive_words, negative_words)`` tuple of sets of strings.
    """
    def _read_terms(path):
        # Collect every non-blank, non-comment line of one lexicon file.
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            stripped_lines = (raw.strip() for raw in handle)
            return {term for term in stripped_lines if term and not term.startswith(';')}

    positive_words = _read_terms(positive_file)
    negative_words = _read_terms(negative_file)
    return positive_words, negative_words

In [None]:
# Load both opinion word lists using the default file names.
positive_words, negative_words = load_opinion_lexicon()

# Report how many entries each list contains.
positive_total, negative_total = len(positive_words), len(negative_words)
print(f"Length of positive words: {positive_total}")
print(f"Length of negative words: {negative_total}")

# Show up to ten entries of each list (sets are unordered, so the sample is arbitrary).
positive_sample = list(positive_words)[:10]
negative_sample = list(negative_words)[:10]
print("Sample of positive words:", positive_sample)
print("Sample of negative words:", negative_sample)

In [None]:
def get_opinion_sentiment(opinion):
    """Classify an opinion word using the loaded opinion word lists.

    Args:
        opinion: The word to look up.

    Returns:
        "positive" if the word is in ``positive_words``, otherwise
        "negative" if it is in ``negative_words``, otherwise "neutral".
    """
    # The positive list is consulted first, so it wins if a word is in both.
    if opinion in positive_words:
        label = "positive"
    else:
        label = "negative" if opinion in negative_words else "neutral"
    return label

In [None]:
def m1(aspect, opinion, operator1, operator2, operator3):
    """Boolean retrieval followed by a rating filter driven by the opinion's polarity.

    The candidate documents are exactly those returned by
    ``boolean_baseline`` for the same arguments. The sentiment of the opinion
    (taken from its second word when there are two words, otherwise from its
    only word, after lemmatization) then decides which candidates are kept:

    * "positive" -> keep reviews whose rating is greater than 3
    * "negative" -> keep reviews whose rating is 3 or lower
    * "neutral"  -> keep every candidate

    Args:
        aspect: Aspect phrase of one or two words.
        opinion: Opinion phrase of one or two words.
        operator1: Operator joining the aspect words ("AND" / "OR" / other).
        operator2: Operator joining the opinion words ("AND" / "OR" / other).
        operator3: Operator joining aspect and opinion sets ("AND" / "OR" / other).

    Returns:
        A set of review IDs that pass both the Boolean query and the rating filter.
    """
    # Delegate the Boolean part so this method and the baseline cannot drift apart.
    candidate_docs = boolean_baseline(aspect, opinion, operator1, operator2, operator3)

    opinion_tokens = opinion.lower().split()
    if opinion_tokens:
        sentiment_token = opinion_tokens[1] if len(opinion_tokens) > 1 else opinion_tokens[0]
        sentiment = get_opinion_sentiment(lemmatizer.lemmatize(sentiment_token))
    else:
        sentiment = "neutral"

    # Build the id -> rating lookup once. The previous per-document DataFrame
    # filter rescanned the whole table for every candidate and produced a
    # Series, whose use in `... and rating > 3` raises
    # "The truth value of a Series is ambiguous".
    ratings_by_id = dict(zip(df['review_id'].str.strip("'"), df['customer_review_rating']))

    filtered_results = set()
    for document_id in candidate_docs:
        if document_id not in ratings_by_id:
            continue
        rating = ratings_by_id[document_id]  # scalar rating of this review

        if sentiment == "neutral":
            filtered_results.add(document_id)
        elif sentiment == "positive" and rating > 3:
            filtered_results.add(document_id)
        elif sentiment == "negative" and rating <= 3:
            filtered_results.add(document_id)

    return filtered_results

In [None]:
# Same example query as the baseline, run through m1 (Boolean match plus rating filter).
# NOTE(review): the opinion is a single word, so operator2 has no second opinion word to combine with.
results = m1("audio quality", "poor", operator1="AND", operator2="AND", operator3="AND")
# Print one matching review ID per line.
for val in results:
    print(val)

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from torch import cosine_similarity

def m2(aspect, opinion, threshold=1.0):
    """Embedding-based retrieval followed by a rating filter on the opinion's polarity.

    Several short query sentences are built from ``aspect`` and ``opinion``
    and encoded with the 'all-MiniLM-L6-v2' sentence-transformer. A document
    matches when its cosine similarity to at least one query sentence is
    greater than or equal to ``threshold``. Matches are then filtered by
    rating exactly as in ``m1``: positive opinion -> rating > 3, negative
    opinion -> rating <= 3, neutral or missing opinion -> keep all.

    Args:
        aspect: Aspect phrase.
        opinion: Opinion phrase; may be empty, in which case no rating
            filter is applied.
        threshold: Minimum cosine similarity for a match. The default keeps
            the original hard-coded value.
            NOTE(review): 1.0 only admits (near-)identical vectors, so it
            will almost never match - tune this for real use.

    Returns:
        A set of document IDs taken from the 'document_id' column of data.pkl.
    """
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    # NOTE(review): the embeddings and the model are reloaded on every call.
    # Assumes data.pkl has an 'embedding' column of equal-length vectors produced
    # by the same model, and a 'document_id' column - TODO confirm.
    bert_embeddings = pd.read_pickle("data.pkl")
    if len(bert_embeddings) == 0:
        return set()

    embeddings = np.stack(bert_embeddings['embedding'].values)
    bert_model = SentenceTransformer('all-MiniLM-L6-v2')

    possible_templates = [
        f"{aspect} {opinion}",
        f"The {aspect} is {opinion}",
        f"I think the {aspect} is {opinion}",
        f"{aspect} is {opinion}",
    ]

    query_embeddings = bert_model.encode(possible_templates, convert_to_numpy=True)

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows are left as zeros.
        matrix = np.asarray(matrix, dtype=np.float64)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms == 0, 1.0, norms)

    # Pairwise cosine similarity, shape (n_documents, n_templates). The former
    # torch.cosine_similarity call does not accept NumPy arrays and is
    # element-wise rather than pairwise.
    similarity = _unit_rows(embeddings) @ _unit_rows(query_embeddings).T
    # Best score per document, so positions keep referring to document rows.
    best_similarity = similarity.max(axis=1)
    matched_indices = np.flatnonzero(best_similarity >= threshold)

    opinion_tokens = opinion.lower().split()
    if opinion_tokens:
        sentiment_token = opinion_tokens[1] if len(opinion_tokens) > 1 else opinion_tokens[0]
        sentiment = get_opinion_sentiment(lemmatizer.lemmatize(sentiment_token))
    else:
        # The user did not provide an opinion.
        sentiment = "neutral"

    # Scalar rating per review ID, built once instead of filtering the
    # DataFrame (and getting an ambiguous Series) for every match.
    ratings_by_id = dict(zip(df['review_id'].str.strip("'"), df['customer_review_rating']))

    result_docs = set()
    for document_id in bert_embeddings['document_id'].iloc[matched_indices]:
        if document_id not in ratings_by_id:
            continue
        rating = ratings_by_id[document_id]

        if sentiment == "neutral":  # If the opinion is actually neutral OR the user did not provide an opinion
            result_docs.add(document_id)
        elif sentiment == "positive" and rating > 3:
            result_docs.add(document_id)
        elif sentiment == "negative" and rating <= 3:
            result_docs.add(document_id)

    return result_docs

In [None]:
# Example query for the embedding-based method: aspect "audio quality", opinion "poor".
results = m2("audio quality", "poor")
# Print one matching document ID per line.
for val in results:
    print(val)