In [2]:
import pandas as pd

# Load the pickled reviews table; the relative path is resolved against the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle("../reviews_segment.pkl")
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

Unnamed: 0,review_id,product_id,customer_id,review_title,review_written_date,customer_name,review_from_title,review_text,helpful_count,out_of_helpful_count,customer_review_rating,number_of_comments,amazon_verified_purchase,amazon_vine_program_review,review_with_metadata
0,'R10019MUX6F9A','B00006881R','AWNC1GQ75W8K8','Works as advertised','2002-12-17','Neil','TeleZapper TZ 900 (Office Product)','I\'ve had this product for about a month and ...,7,7,4,0,0,0,"('R10019MUX6F9A', 'B00006881R', 'AWNC1GQ75W8K8..."
1,'R1002I943QCT20','B00471F0NK','A3SFG0OC59UXL5',"'macintosh version - bad graphics, bad interface'",'2007-06-05','D. Simons','null','I have been using the Macintosh OSX version o...,21,23,2,0,0,0,"('R1002I943QCT20', 'B00471F0NK', 'A3SFG0OC59UX..."
2,'R1003RILN06MX1','B0027U258Q','A2IP26LJGTJXSV','Great Software','2010-12-05','Tex','Paragon Partition Manager 10 Personal Edition...,'The Partition Manager is a great product. It\...,1,1,5,0,1,0,"('R1003RILN06MX1', 'B0027U258Q', 'A2IP26LJGTJX..."
3,'R100523NBIQIEV','B000070MRB','A2DKAPBHZ5DERR','Neutral','2004-06-07','S. Barnes','Game Programming Starter Kit 6.0 (CD-ROM)','If you plan on getting this program go to htt...,-1,-1,3,0,0,0,"('R100523NBIQIEV', 'B000070MRB', 'A2DKAPBHZ5DE..."
4,'R1006KJEGKGV0O','B001B19D7I','AMZ7EO048MCWK',"'Great seat, but don\'t like the buckle'",'2009-07-07','Cyrca','Britax Boulevard 65 TSIP Convertible Car Seat...,'I researched for months (on-line and in store...,1,1,5,0,0,0,"('R1006KJEGKGV0O', 'B001B19D7I', 'AMZ7EO048MCW..."


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [4]:
# Download the NLTK data packages one by one, in the same order as before.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

# Shared helpers used by the text-cleaning and query functions in later cells.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Steps, in order: lower-case the text, replace every character that is not
    ``a``-``z`` or whitespace with a space, tokenise with ``nltk.word_tokenize``,
    drop tokens found in the module-level ``stop_words`` set, and lemmatise the
    remaining tokens with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z\s]", " ", lowered)

    lemmas = []
    for token in nltk.word_tokenize(letters_only):
        # Stop words are removed before lemmatisation, so the check is on the raw token.
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [6]:
# Cast every review to str before cleaning (astype(str) turns missing values into
# text rather than skipping them), then store the result in a new
# 'cleaned_review' column on df.
df['cleaned_review'] = df['review_text'].astype(str).apply(preprocess_text)

In [7]:
from collections import defaultdict

# Inverted index: cleaned term -> set of review ids (surrounding quotes stripped).
word_index = defaultdict(set)

# Iterate over just the two columns that are needed. DataFrame.iterrows() builds a
# full Series for every row, which is far slower and adds nothing here.
for raw_review_id, cleaned_review in zip(df["review_id"], df["cleaned_review"]):
    document_id = raw_review_id.strip("'")

    # De-duplicate the terms of one review so each term triggers a single index update.
    for word in set(cleaned_review.split()):
        word_index[word].add(document_id)

# Vocabulary size (number of distinct indexed terms).
print(len(word_index))

153247


In [35]:
def boolean_baseline(aspect, opinion, operator1, operator2, operator3, filepath):
    """Baseline Boolean retrieval over the module-level inverted index ``word_index``.

    Only the first two whitespace-separated words of ``aspect`` and of ``opinion``
    are used; any further words are ignored. Each word is lower-cased and
    lemmatised with the module-level ``lemmatizer`` before it is looked up.

    Parameters
    ----------
    aspect : str
        Aspect phrase. An empty phrase matches no documents.
    opinion : str
        Opinion phrase. An empty phrase matches no documents.
    operator1 : str
        ``"AND"`` / ``"OR"`` combines the two aspect words; any other value uses
        only the first aspect word.
    operator2 : str
        Same as ``operator1`` but for the two opinion words.
    operator3 : str
        ``"AND"`` / ``"OR"`` combines the aspect result with the opinion result;
        any other value returns the aspect result alone.
    filepath : str
        File name, relative to ``../Outputs/Baseline/``, that receives one
        document id per line (sorted, so repeated runs give identical files).

    Returns
    -------
    set
        Matching document ids. This is always a fresh set, never one owned by
        ``word_index``.

    Notes
    -----
    With ``"AND"`` and a one-word phrase the absent second word matches nothing,
    so that side of the query is empty; pass ``""`` for one-word phrases.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    def postings(term):
        # Copy the posting set: handing out the index's own set would let a caller
        # corrupt word_index by mutating the returned result.
        return set(word_index.get(term, ())) if term else set()

    def first_two_lemmas(phrase):
        # Pad with None so a missing word is looked up as "matches nothing"
        # instead of raising IndexError on an empty phrase.
        lemmas = [lemmatizer.lemmatize(token) for token in phrase.lower().split()[:2]]
        return lemmas + [None] * (2 - len(lemmas))

    def combine(terms, operator):
        first, second = terms
        if operator == "AND":
            return postings(first) & postings(second)
        if operator == "OR":
            return postings(first) | postings(second)
        return postings(first)

    aspect_docs = combine(first_two_lemmas(aspect), operator1)
    opinion_docs = combine(first_two_lemmas(opinion), operator2)

    if operator3 == "AND":
        result_docs = aspect_docs & opinion_docs
    elif operator3 == "OR":
        result_docs = aspect_docs | opinion_docs
    else:
        result_docs = aspect_docs

    output_path = f"../Outputs/Baseline/{filepath}"
    # Create the output folder on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{document_id}\n" for document_id in sorted(result_docs))

    return result_docs

In [36]:
# One row per query: aspect, opinion, operator2, and the output files for the
# three operator3 settings ("", "AND", "OR") in that order.
baseline_runs = [
    ("audio quality", "poor", "",
     ("audio_quality_test1.txt", "audio_quality_test2.txt", "audio_quality_test3.txt")),
    ("wifi signal", "strong", "",
     ("wifi_signal_test1.txt", "wifi_signal_test2.txt", "wifi_signal_test3.txt")),
    ("mouse button", "click problem", "OR",
     ("mouse_button_test1.txt", "mouse_button_test2.txt", "mouse_button_test3.txt")),
    ("gps map", "useful", "",
     ("gps_map_test1.txt", "gps_map_test2.txt", "gps_map_test3.txt")),
    ("image quality", "sharp", "",
     ("image_quality_test1.txt", "image_quality_test2.txt", "image_quality_test3.txt")),
]

for query_aspect, query_opinion, opinion_operator, output_files in baseline_runs:
    for combine_operator, output_file in zip(("", "AND", "OR"), output_files):
        # `results` keeps the outcome of the most recent call, as before.
        results = boolean_baseline(
            query_aspect,
            query_opinion,
            operator1="OR",
            operator2=opinion_operator,
            operator3=combine_operator,
            filepath=output_file,
        )

In [10]:
def _read_lexicon_words(path):
    """Return the set of non-blank, non-comment lines of one lexicon file."""
    def parse(encoding):
        with open(path, 'r', encoding=encoding) as handle:
            stripped = (line.strip() for line in handle)
            # Lines starting with ';' are comments/header lines in the lexicon files.
            return {word for word in stripped if word and not word.startswith(';')}

    try:
        # utf-8-sig reads plain UTF-8 and also drops a leading BOM, which would
        # otherwise hide the ';' of a first-line comment.
        return parse('utf-8-sig')
    except UnicodeDecodeError:
        # Not valid UTF-8: decode as Latin-1 rather than silently dropping bytes
        # (errors='ignore' would turn a Latin-1 "naïve" into "nave").
        return parse('latin-1')


def load_opinion_lexicon(positive_file="positive-words.txt", negative_file="negative-words.txt"):
    """Load the positive and negative opinion word lists.

    Each file is read line by line; surrounding whitespace is stripped, and blank
    lines and lines starting with ``;`` are skipped. Files are decoded as UTF-8
    when valid and as Latin-1 otherwise, so no characters are lost.

    Parameters
    ----------
    positive_file : str
        Path to the positive word list.
    negative_file : str
        Path to the negative word list.

    Returns
    -------
    tuple[set, set]
        ``(positive_words, negative_words)``.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    positive_words = _read_lexicon_words(positive_file)
    negative_words = _read_lexicon_words(negative_file)

    return positive_words, negative_words

In [11]:
# Load both word lists from the default file names (resolved against the
# working directory) into module-level sets used by get_opinion_sentiment.
positive_words, negative_words = load_opinion_lexicon()

# Size of each lexicon.
print(f"Length of positive words: {len(positive_words)}")
print(f"Length of negative words: {len(negative_words)}")

# Peek at ten entries of each; sets are unordered, so which ten appear is arbitrary.
print("Sample of positive words:", list(positive_words)[:10])
print("Sample of negative words:", list(negative_words)[:10])

Length of positive words: 2006
Length of negative words: 4783
Sample of positive words: ['glimmer', 'courage', 'amazement', 'applaud', 'luxury', 'eye-catching', 'concise', 'meticulously', 'civilize', 'pepped']
Sample of negative words: ['unskilled', 'repulsing', 'carnage', 'unkindly', 'fickle', 'glum', 'outrages', 'bewilder', 'unrealistic', 'irked']


In [12]:
def get_opinion_sentiment(opinion):
    """Classify an opinion word using the module-level lexicon sets.

    Parameters
    ----------
    opinion : hashable
        Word to look up; it is matched exactly as given (no case folding).

    Returns
    -------
    str
        ``"positive"`` if the word is in ``positive_words``, otherwise
        ``"negative"`` if it is in ``negative_words``, otherwise ``"neutral"``.
    """
    label = "neutral"
    # The positive lexicon is consulted first, so it wins if a word is in both.
    if opinion in positive_words:
        label = "positive"
    elif opinion in negative_words:
        label = "negative"
    return label

In [34]:
def m1(aspect, opinion, operator1, operator2, operator3, filepath):
    """Boolean retrieval followed by a rating filter driven by opinion sentiment.

    The Boolean stage works on the module-level ``word_index`` using the first
    two lower-cased, lemmatised words of ``aspect`` and ``opinion`` (further
    words are ignored). The sentiment of the last of those opinion words is
    looked up with ``get_opinion_sentiment`` and printed. A matching document is
    kept when:

    * sentiment is ``"positive"`` and its ``customer_review_rating`` is > 3,
    * sentiment is ``"negative"`` and its rating is <= 3,
    * sentiment is ``"neutral"`` (any rating).

    Documents whose id is not present in ``df`` are dropped.

    Parameters
    ----------
    aspect, opinion : str
        Query phrases. An empty phrase matches no documents.
    operator1, operator2 : str
        ``"AND"`` / ``"OR"`` combine the two aspect / opinion words; any other
        value uses only the first word.
    operator3 : str
        ``"AND"`` / ``"OR"`` combine the aspect and opinion results; any other
        value keeps the aspect result alone.
    filepath : str
        File name, relative to ``../Outputs/AdvancedModel/``, that receives one
        document id per line (sorted, so repeated runs give identical files).

    Returns
    -------
    set
        Document ids that passed the rating filter.

    Notes
    -----
    With ``"AND"`` and a one-word phrase the absent second word matches nothing,
    so that side of the query is empty; pass ``""`` for one-word phrases.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    def postings(term):
        # Copy so the sets stored inside word_index are never exposed or mutated.
        return set(word_index.get(term, ())) if term else set()

    def first_two_lemmas(phrase):
        # Pad with None so a missing word is looked up as "matches nothing"
        # instead of raising IndexError on an empty phrase.
        lemmas = [lemmatizer.lemmatize(token) for token in phrase.lower().split()[:2]]
        return lemmas + [None] * (2 - len(lemmas))

    def combine(terms, operator):
        first, second = terms
        if operator == "AND":
            return postings(first) & postings(second)
        if operator == "OR":
            return postings(first) | postings(second)
        return postings(first)

    def rating_agrees(rating):
        if sentiment == "positive":
            return rating > 3
        if sentiment == "negative":
            return rating <= 3
        return sentiment == "neutral"

    aspect_terms = first_two_lemmas(aspect)
    opinion_terms = first_two_lemmas(opinion)

    aspect_docs = combine(aspect_terms, operator1)
    opinion_docs = combine(opinion_terms, operator2)

    if operator3 == "AND":
        result_docs = aspect_docs & opinion_docs
    elif operator3 == "OR":
        result_docs = aspect_docs | opinion_docs
    else:
        result_docs = aspect_docs

    # The second opinion word decides the sentiment when there is one, else the first.
    sentiment_word = opinion_terms[1] if opinion_terms[1] is not None else opinion_terms[0]
    sentiment = get_opinion_sentiment(sentiment_word)

    print(f"Sentiment: {sentiment}")

    # Resolve all ratings in one pass over df instead of re-scanning the whole
    # review_id column once per matching document.
    review_ids = df['review_id'].str.strip("'")
    in_results = review_ids.isin(result_docs)
    rating_by_id = {}
    for review_id, rating in zip(review_ids[in_results], df.loc[in_results, 'customer_review_rating']):
        # setdefault keeps the first row for an id, matching the former values[0].
        rating_by_id.setdefault(review_id, rating)

    filtered_results = {
        review_id
        for review_id, rating in rating_by_id.items()
        if rating_agrees(int(rating))
    }

    output_path = f"../Outputs/AdvancedModel/{filepath}"
    # Create the output folder on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{document_id}\n" for document_id in sorted(filtered_results))

    return filtered_results

In [33]:
# One row per query: aspect, opinion, operator2, output file.
m1_runs = [
    ("audio quality", "poor", "", "audio_quality_m1_test4.txt"),
    ("wifi signal", "strong", "", "wifi_signal_m1_test4.txt"),
    ("mouse button", "click problem", "AND", "mouse_button_m1_test4.txt"),
    ("gps map", "useful", "", "gps_map_m1_test4.txt"),
    ("image quality", "sharp", "", "image_quality_m1_test4.txt"),
]

for query_aspect, query_opinion, opinion_operator, output_file in m1_runs:
    # `results` keeps the outcome of the most recent call, as before.
    results = m1(
        query_aspect,
        query_opinion,
        operator1="AND",
        operator2=opinion_operator,
        operator3="AND",
        filepath=output_file,
    )

Sentiment: negative
Sentiment: positive
Sentiment: negative
Sentiment: positive
Sentiment: positive


In [37]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_m2_resources(embeddings_path="../data.pkl", model_name='all-MiniLM-L6-v2'):
    """Load the embedding table and sentence encoder once and reuse them.

    Results are cached per argument combination for the life of the kernel; call
    ``_load_m2_resources.cache_clear()`` after replacing the pickle on disk.

    NOTE(review): unpickling can execute arbitrary code, so the pickle file must
    come from a trusted source.

    Returns
    -------
    tuple
        ``(bert_embeddings, embeddings, bert_model, device)`` where
        ``embeddings`` is the stacked ``'embedding'`` column.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    bert_embeddings = pd.read_pickle(embeddings_path)
    embeddings = np.stack(bert_embeddings['embedding'].values)
    bert_model = SentenceTransformer(model_name, device=device)
    return bert_embeddings, embeddings, bert_model, device


def m2(aspect, opinion, filepath, threshold=0.70):
    """Embedding-similarity retrieval followed by a sentiment-driven rating filter.

    The query sentence ``"The {aspect} is {opinion}"`` is encoded and compared by
    cosine similarity with every stored embedding; rows scoring at least
    ``threshold`` supply candidate document ids. The sentiment of the last of the
    first two lower-cased, lemmatised opinion words is looked up with
    ``get_opinion_sentiment``. A candidate is kept when:

    * sentiment is ``"positive"`` and its ``customer_review_rating`` is > 3,
    * sentiment is ``"negative"`` and its rating is <= 3,
    * sentiment is ``"neutral"`` (any rating).

    Candidates whose id is not present in ``df`` are dropped.

    Parameters
    ----------
    aspect, opinion : str
        Query phrases.
    filepath : str
        File name, relative to ``../Outputs/AdvancedModel/``, that receives one
        document id per line (sorted, so repeated runs give identical files).
    threshold : float, default 0.70
        Minimum cosine similarity for a stored embedding to count as a match.

    Returns
    -------
    set
        Document ids that passed both the similarity and the rating filter.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    def rating_agrees(rating):
        if sentiment == "positive":
            return rating > 3
        if sentiment == "negative":
            return rating <= 3
        return sentiment == "neutral"

    # The pickle and the model are loaded once and reused by later calls.
    bert_embeddings, embeddings, bert_model, device = _load_m2_resources()

    target_sentence = f"The {aspect} is {opinion}"
    query_embeddings = bert_model.encode([target_sentence], convert_to_numpy=True, device=device)

    cos_similarities = cosine_similarity(embeddings, query_embeddings).ravel()
    matched_indices = np.where(cos_similarities >= threshold)[0]

    # Positional selection, equivalent to .iloc[index]['document_id'] per match.
    candidate_ids = set(bert_embeddings['document_id'].to_numpy()[matched_indices])

    opinion_terms = [lemmatizer.lemmatize(token) for token in opinion.lower().split()[:2]]
    # The second opinion word decides the sentiment when there is one, else the first.
    sentiment = get_opinion_sentiment(opinion_terms[-1] if opinion_terms else None)

    # Resolve all ratings in one pass over df instead of re-scanning the whole
    # review_id column once per matched embedding.
    review_ids = df['review_id'].str.strip("'")
    in_candidates = review_ids.isin(candidate_ids)
    rating_by_id = {}
    for review_id, rating in zip(review_ids[in_candidates], df.loc[in_candidates, 'customer_review_rating']):
        # setdefault keeps the first row for an id, matching the former values[0].
        rating_by_id.setdefault(review_id, rating)

    result_docs = {
        review_id
        for review_id, rating in rating_by_id.items()
        if rating_agrees(int(rating))
    }

    output_path = f"../Outputs/AdvancedModel/{filepath}"
    # Create the output folder on first use instead of failing with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{document_id}\n" for document_id in sorted(result_docs))

    return result_docs

In [38]:
# One row per query: aspect, opinion, output file.
m2_runs = [
    ("audio quality", "poor", "audio_quality_m2_test4.txt"),
    ("wifi signal", "strong", "wifi_signal_m2_test4.txt"),
    ("mouse button", "click problem", "mouse_button_m2_test4.txt"),
    ("gps map", "useful", "gps_map_m2_test4.txt"),
    ("image quality", "sharp", "image_quality_m2_test4.txt"),
]

for query_aspect, query_opinion, output_file in m2_runs:
    # `results` keeps the outcome of the most recent call, as before.
    results = m2(query_aspect, query_opinion, filepath=output_file)