# Initialize

In [73]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
from llm_helper import extract_imp_aspects, extract_aspect_ratings3
import pandas as pd
import time

In [74]:
import os
from dotenv import load_dotenv
# Run python-dotenv's loader first, then read the OpenAI key from the process
# environment so the secret never has to be written into the notebook.
# The value is None when the variable is not set.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [75]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import json
import uuid
import re

In [76]:
def format_input(info):
    """Render one review as a single prompt string.

    Args:
        info: mapping with the keys 'title', 'content' and 'rating'.

    Returns:
        A string of the form "<title>: <content>[Rating - <rating>/5]".

    Raises:
        KeyError: if any of the three keys is missing from ``info``.
    """
    title, content, rating = info['title'], info['content'], info['rating']
    return f"{title}: {content}[Rating - {rating}/5]"

In [80]:
# Input selection for this run.
# FILENAME is the stem of the saved product page, opened as html/{FILENAME}.html.
# CATEGORY is stored in the product metadata and is the key used to look up the
# aspect definitions loaded from category_aspects.json.
FILENAME = "blackstove_36_3"
CATEGORY = "grills"

# Alternative input kept for reference:
# FILENAME = "product"
# CATEGORY = "grills"

In [81]:
# Parse the saved product page. Reading inside a context manager closes the
# file handle as soon as the markup is in memory instead of leaving it open.
with open(f'html/{FILENAME}.html') as fp:
    soup = BeautifulSoup(fp.read(), "lxml")

In [82]:
# Every review wrapper on the page, matched by its exact generated class string.
reviews = soup.find_all('div', attrs={'class' : 'ReviewRowstyles__ReviewRowWrapper-sc-1v62ao2-1 jcytau'})
# First <meta> tag found inside each review wrapper (None when there is none).
rating = [review.find('meta') for review in reviews]

# Main product image. Read the attribute directly from the tag: a greedy regex
# over str(image) would run to the last double quote in the tag whenever another
# attribute follows src, and would return HTML-escaped text (e.g. "&amp;").
image = soup.find('img', attrs={'id':'imgZoom0'})
image_link = image['src']

# Product title from the page's <h1>.
title = soup.find('h1', {"class" :"styles__H1-sc-11vpuyu-0 krJSUv typography variant--h1 align--left product-brand-description"}).text

# The item id is the run of digits following '#' in this paragraph's text.
item_id_str = soup.find('p', {"class": "styles__ParagraphRegular-sc-1ljw3tp-0 doewXW typography variant--body_small align--left body_2"}).text
item_id = re.search(r'#(\d+)', item_id_str).group(1)

In [83]:
# The price is read from two separate spans (dollar part, then cent part);
# their text is joined and surrounding whitespace removed.
price_dollar = soup.find('span', attrs={"class": "item-price-dollar"}).text
price_cents = soup.find('span', attrs={"class": "PriceUIstyles__Cent-sc-14j12uk-0 bktBXX item-price-cent"}).text
price = f"{price_dollar}{price_cents}".strip()

In [84]:
# Load the list of product records that have already been saved.
with open("product_meta.json", 'r') as fp:
    data = json.loads(fp.read())

# Metadata record for the page currently loaded in `soup`.
product_meta = dict(
    product_id=item_id,
    product_title=title,
    product_image=image_link,
    number_of_reviews=len(reviews),
    product_category=CATEGORY,
    product_price=price,
)

# Append and persist the record only when this product id is not stored yet;
# an existing entry is left untouched.
p_id = [product["product_id"] for product in data]
if item_id not in p_id:
    data.append(product_meta)
    with open("product_meta.json", 'w') as fp:
        json.dump(data, fp, indent=2)

In [85]:
# Display the metadata record assembled in the previous cell as a visual check.
product_meta

{'product_id': '2582823',
 'product_title': 'Blackstone 36" Culinary Griddle with Hood 4-Burner Liquid Propane Flat Top Grill',
 'product_image': 'https://mobileimages.lowes.com/productimages/c6fce411-41c3-4879-b402-7fc6c7ffe08f/67037864.png?size=mpdhi',
 'number_of_reviews': 2220,
 'product_category': 'grills',
 'product_price': '$499.00'}

# Extract reviews

In [87]:
extracted_reviews = []   # dicts with the keys "title", "content", "rating"
rejected_reviews = []    # indexes into `reviews` whose fields could not be read

# Only every second wrapper is read.
# NOTE(review): presumably each review matches the wrapper selector twice —
# confirm against the page markup.
for i in range(0, len(reviews), 2):
    try:
        # Loop-local names are prefixed with review_ so the product-level
        # `title` and `rating` variables are not overwritten by this cell.
        review_title = reviews[i].find('p', attrs={'class': 'h1'}).text
        review_content = reviews[i].find('p', attrs={'itemprop': 'reviewBody'}).text
        review_rating = reviews[i].find('meta', attrs={'itemprop': 'ratingValue'})['content']
    except (AttributeError, TypeError, KeyError):
        # find() returned None (AttributeError on .text, TypeError on
        # subscripting) or the <meta> tag has no "content" attribute (KeyError).
        rejected_reviews.append(i)
        continue

    extracted_reviews.append({
        "title": review_title,
        "content": review_content,
        "rating": review_rating
    })
len(extracted_reviews)

661

In [88]:
# Peek at the first extracted review to check its title/content/rating fields.
extracted_reviews[0]

{'title': 'Grill = great but issues with Lowe’s assembly x2',
 'content': 'Love my Blackstone grill (eating 5/5) and the Lowe’s delivery service was punctual and provided great service (rating 5/5). \nThe challenges with both deliveries were related to assembly. First grill was just sloppy assembly with bolts not fully tightened (they were actually falling out upon inspection) which caused damage to the frame assembly. Second delivery was better but grill was missing the stoppers which protect/eliminate direct, metal-on-metal contact between lid and body which caused heat resistant paint to chip. Rating for assembly and follow up service = 2/5\nLowe’s promised a credit which has yet to be processed or confirmed. \nMoral of the story…inspect your delivery closely and understand that the delivery team has nothing to do with assembly.',
 'rating': '3'}

In [89]:
# Load the stored aspect definitions (category -> {aspect name: description})
# and display them.
with open("category_aspects.json") as fp:
    data = json.loads(fp.read())
data

{'grills': {'cooking_surface': 'The size and material of the cooking surface, including its heat distribution and non-stick properties.',
  'portability': 'The ease of moving and transporting the griddle, including its weight, size, and any additional features like wheels or handles.',
  'heat_control': 'The ability to adjust and control the temperature of the griddle for different cooking needs, including even heat distribution and consistent cooking results.',
  'durability': 'The sturdiness and longevity of the griddle, including its materials, construction, and resistance to rust or corrosion.',
  'accessories': "The additional items included in the bundle, such as the cover, side shelves, or other tools that enhance the griddle's functionality and convenience."},
 'lawn_movers': {'cutting_performance': 'The ability of the lawn mower to effectively cut grass with precision and consistency.',
  'ease_of_use': 'How user-friendly and convenient the lawn mower is to operate, including 

# Review scoring

In [90]:
# First time a category is seen: ask the helper for its aspects, using this
# product's title as the prompt input, and persist them so later products in
# the same category reuse the same aspect set.
if CATEGORY not in data:
    data[CATEGORY] = extract_imp_aspects(product_meta['product_title'])
    with open("category_aspects.json", 'w') as fp:
        json.dump(data, fp, indent=2)
        print("saved")

# Aspect definitions used for scoring every review of this product.
aspects = data[CATEGORY]

In [91]:
# Folder that receives one temporary CSV per processed batch.
FILE_FOLDER = "temp_csv"
# Create it up front: the batch CSVs are written only after the aspect ratings
# for a whole batch have been computed, so a missing folder would otherwise
# surface as a failure after that work is already done.
os.makedirs(FILE_FOLDER, exist_ok=True)

In [92]:
def make_batch(reviews:list, items_per_batch=10):
    """Split ``reviews`` into consecutive slices of at most ``items_per_batch`` items.

    Only non-empty batches are produced: a list whose length is an exact
    multiple of ``items_per_batch`` gets no trailing empty batch, and an empty
    list yields no batches at all.

    Args:
        reviews: items to split; order is preserved.
        items_per_batch: maximum number of items per batch (positive integer).

    Returns:
        A list of lists; every batch except possibly the last has exactly
        ``items_per_batch`` items.

    Raises:
        ValueError: if ``items_per_batch`` is smaller than 1.
    """
    if items_per_batch < 1:
        raise ValueError("items_per_batch must be a positive integer")

    # Stepping the start offset by the batch size covers the list exactly once.
    batches = [
        reviews[start:start + items_per_batch]
        for start in range(0, len(reviews), items_per_batch)
    ]
    print(len(batches), "batches created")
    return batches

In [93]:
def merge_batches(files:list):
    """Concatenate the per-batch CSV files into one CSV for the current product.

    Each entry of ``files`` is a file stem; the file read is
    ``FILE_FOLDER/<stem>.csv``. Files that are missing, unreadable, empty or
    unparsable are skipped with a printed notice so the remaining batches are
    still merged. The result is written to ``<product_id>.csv`` in the working
    directory, taking the id from the module-level ``product_meta``.

    Args:
        files: stems of the batch CSV files to merge.
    """
    frames = []
    for file in files:
        path = FILE_FOLDER+f"/{file}.csv"
        try:
            frames.append(pd.read_csv(path))
        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            # Best effort: report the skipped file instead of hiding the problem.
            print(f"Skipping {path}: {err}")

    # Concatenate once; pd.concat raises on an empty list, so fall back to an
    # empty frame when nothing could be read.
    compiled_df = pd.concat(frames) if frames else pd.DataFrame()

    compiled_df.to_csv(f"{product_meta['product_id']}.csv", index=False)
    print("Files merged and saved")

In [94]:
def process_batch(batch:list):
    """Score every review of one batch and write the results to a temporary CSV.

    Each review is formatted with ``format_input`` and passed to
    ``extract_aspect_ratings3`` together with the module-level ``aspects`` and
    the review's own rating. A call that raises ``ConnectionError`` is retried
    after a short pause; a review that still fails after ``max_attempts`` tries
    is left out of the batch output and reported.

    Args:
        batch: list of review dicts with the keys 'title', 'content', 'rating'.

    Returns:
        The stem of the CSV written to ``FILE_FOLDER`` (``<stem>.csv``).
    """
    max_attempts = 3
    batch_review_ratings = []
    unique_temp_filename = str(uuid.uuid1())

    for review in batch:
        for _attempt in range(max_attempts):
            try:
                batch_review_rating = extract_aspect_ratings3(format_input(review), aspects, review['rating'])
            except ConnectionError:
                print("Connection lost")
                # Back off briefly before the next request.
                time.sleep(2)
            else:
                batch_review_ratings.append(batch_review_rating)
                break
        else:
            # Every attempt failed: make the dropped review visible.
            print(f"Giving up on a review after {max_attempts} attempts")

    df_batch = pd.DataFrame(batch_review_ratings)

    df_batch.to_csv(FILE_FOLDER+f"/{unique_temp_filename}.csv", index=False, header=True)
    print("Batch complete")
    return unique_temp_filename

In [95]:
# Will hold the temp-file stems returned by process_batch, one per batch.
# The cell that runs the thread pool rebinds this name to a new list.
result = []

In [96]:
# Split the extracted reviews into batches of 30 and score them on a pool of
# 6 worker threads. exe.map yields results in the order of `batches`, so
# `result` ends up with one temp-file stem per batch, in batch order.
batches = make_batch(extracted_reviews,items_per_batch=30)
with ThreadPoolExecutor(max_workers=6) as exe:
    result = list(exe.map(process_batch, batches))

23 batches created
'NoneType' object has no attribute 'group'
ASPECT: accessories
1. The overall review is POSITIVE.
2. The reviewer mentions the lid, the box, the basket for seasoning, and the assembling process as accessories.
3. The reviewer's opinion on the accessories is MIXED, with some components being iffy but overall still functional.
Batch complete
Batch complete
Batch complete
Batch complete
Batch complete
Batch complete
'NoneType' object has no attribute 'group'
ASPECT: heat_control
1. The overall review is NEGATIVE.
2. The reviewer did not specifically mention the heat_control aspect of the product.
3. The reviewer did not provide an opinion on the heat_control aspect.
'NoneType' object has no attribute 'group'
ASPECT: accessories
1. The overall review is POSITIVE
2. The reviewer does not mention any specific accessories included with the grill.
3. The reviewer does not provide an opinion on the accessories.
'NoneType' object has no attribute 'group'
ASPECT: portability
1.

In [97]:
# Combine the per-batch CSVs named in `result` into one CSV for this product.
merge_batches(result)

Files merged and saved


In [2]:
import pandas as pd

In [3]:
# Load the merged ratings for product 2582823.
# NOTE(review): merge_batches writes <product_id>.csv to the working directory,
# while this reads from data_files/ — presumably the file was moved by hand; confirm.
df = pd.read_csv("data_files/2582823.csv")

In [4]:
# List the column names of the loaded frame.
df.columns

Index(['review_content', 'review_rating', 'raw_cooking_surface',
       'overall_cooking_surface', 'cooking_surface_opinion', 'cooking_surface',
       'raw_portability', 'overall_portability', 'portability_opinion',
       'portability', 'raw_heat_control', 'overall_heat_control',
       'heat_control_opinion', 'heat_control', 'raw_durability',
       'overall_durability', 'durability_opinion', 'durability',
       'raw_accessories', 'overall_accessories', 'accessories_opinion',
       'accessories'],
      dtype='object')

In [5]:
def change_mistake(response):
    """Extract the text of the second non-empty line of a raw model response.

    The response is split on newlines, empty lines are dropped, and the first
    three characters of the second remaining line (its "2. " list prefix in the
    responses stored by this notebook) are removed.

    Args:
        response: raw response text; non-string values such as NaN or None are
            tolerated.

    Returns:
        The second line without its three-character prefix, or "NA" when
        ``response`` is not a string or has fewer than two non-empty lines.
    """
    try:
        lines = [line for line in response.split("\n") if line != '']
        return lines[1][3:]
    except (AttributeError, TypeError, IndexError):
        # AttributeError/TypeError: not a str (e.g. NaN from an empty CSV cell).
        # IndexError: fewer than two non-empty lines in the response.
        return "NA"

In [6]:
# Rebuild every "<aspect>_opinion" column from the matching "raw_<aspect>"
# column by running change_mistake over each stored raw response.
for aspect in ['cooking_surface', 'portability', 'heat_control', 'durability', 'accessories']:
    df[f'{aspect}_opinion'] = [change_mistake(raw_text) for raw_text in df[f'raw_{aspect}']]

In [7]:
# Write the frame back to the same path it was read from, replacing the file
# on disk with the version that has the recomputed opinion columns.
df.to_csv("data_files/2582823.csv", index=False)

In [5]:
import pandas as pd

In [14]:
# Load another ratings file (5188090 — presumably a product id, matching the
# <product_id>.csv naming; confirm) and show its row count. This rebinds `df`.
df = pd.read_csv('data_files/5188090.csv')
len(df)

600

In [15]:
# Keep only the rows without any missing value, then show how many remain.
df = df.dropna()
len(df)

566

In [37]:
# Collect the rows (as Series, not index labels) whose formatted review text is
# shorter than 50 characters.
index = [
    record
    for _label, record in df.iterrows()
    if len(record['review_content']) < 50
]

In [39]:
# Inspect the first of the collected short-review rows.
index[0]

review_content                          : Definitely worth it![Rating - 5/5]
review_rating                                                              5
raw_cooking_surface        1. The overall review is POSITIVE\n2. The revi...
overall_cooking_surface                                             POSITIVE
cooking_surface_opinion    The reviewer mentions that the cooking_surface...
cooking_surface                                                     POSITIVE
raw_portability            1. The overall review is POSITIVE\n2. The revi...
overall_portability                                                 POSITIVE
portability_opinion        The reviewer does not specifically mention por...
portability                                                    NOT MENTIONED
raw_heat_control           1. The overall review is POSITIVE\n2. The revi...
overall_heat_control                                                POSITIVE
heat_control_opinion       The reviewer does not specifically mention hea...

In [40]:
# Check the type of a collected entry: each one is a row yielded by iterrows().
type(index[0])

pandas.core.series.Series