In [None]:
!pip install anthropic

### Set up your own API key
* Log in and create your API key here (https://console.anthropic.com/settings/keys)
* Paste your API key between the quotation marks in the cell below

In [None]:
import os

# Publish the key as an environment variable of this notebook's process.
os.environ.update({"ANTHROPIC_API_KEY": "your_api_key"})

***The following code is just a sample; your task is to encapsulate it in a function with arguments of your choice so that the function can be reused.***

#### The following link is for reference: https://docs.anthropic.com/en/docs/initial-setup#next-steps

In [4]:
import anthropic
# importing all the necessary libraries
# for data handling
import numpy as np
import pandas as pd
import json

# for web scraping
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

import time
from datetime import date

In [78]:
# Shared Anthropic API client; queryFun sends every request through it.
# NOTE(review): no key is passed explicitly, so the client presumably reads
# ANTHROPIC_API_KEY from the environment — confirm against the SDK docs.
client = anthropic.Anthropic()

# System prompt that sets up the categorization task and the tab-separated
# answer format (concerns, relevant sentence parts, date) that the parsing
# code later splits on.
# NOTE: this is not a raw string, so each \t below is an actual tab character
# in the text sent to the model.
_REVIEW_SYSTEM_PROMPT = """You are an AI assistant trained to categorize user feedback into predefined categories, 
            along with sentiment analysis for each category. Your goal is to analyze each piece of feedback, 
            assign the most relevant categories Predefined Categories: Food, 
            Staff/Service and Environment. Firstly, analyze all the things is about. Make sure the answer is tab separated 
            first telling all the things it was about then tab then was it good or bad then tab then reason then 
            tab and then review itself. If the review was about more than 1 thing separate each concern with / tab then their
            relevant part of sentence also being separated with a tab
            For example for query: "I enjoyed the food but the service was poor"
            output: "Food/Service   I enjoyed the food/but the service was poor"
            Also only variables we're analyzing are food, service/staff and environment. Rest of things like prices, personal information,
            and anything tied to a specific person should be ignored. Also Staff/Service will be singled out as Service
            And lastly put another tab and after it the date but if the date is something like "N days ago". Do not change the year
            subtact N days from today's date and return date in format DD/MM/YYYY also for "Dined on this date" convert date into 
            DD/MM/YY. Also if 2 or more sentences talk about the same thing only keep one part of it and for compound sentences 
            separate concerns around the coordinate conjunctions (for, and, nor, but, or, yet, so). Hence the
            number of concerns will always be the same as the number of relevant sentences
            So final output format will be
            Concern relevant part of sentence   date
            For example:
            input: "I liked the food but it took too long to arrive Dined today (today's date = 12/12/2024)"
            output: "Food/Service\tI liked the food/but it took too long to arrive\t12/12/2024
            input: "I liked the food but it took too long to arrive Dined on March 23rd, 2021 (today's date = 12/12/2024)"
            output: "Food/Service\tI liked the food/but it took too long to arrive\t23/03/2021
            where \t is tab
            format of output: concerns\trelevant part of sentence\tdate
            """


def queryFun(query, model="claude-3-5-sonnet-20241022", max_tokens=500, temperature=0,
             system=_REVIEW_SYSTEM_PROMPT):
    """Send one review to the model and return the text of its answer.

    Args:
        query: The review text (plus date information) passed as the user message.
        model: Model identifier forwarded to the API.
        max_tokens: Upper bound on the length of the response.
        temperature: Amount of randomness in the output (0 = least random).
        system: System prompt that sets up the context; defaults to the
            review-categorization prompt defined above.

    Returns:
        The `text` of the first content block of the response.

    Raises:
        ValueError: If the response contains no content blocks.
        Any exception raised by `client.messages.create` is propagated unchanged.
    """
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        system=system,
        # The user message carries the review itself.
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": query}],
            }
        ],
    )
    # Fail with a clear message instead of an IndexError on an empty answer.
    if not response.content:
        raise ValueError("The model returned an empty response")
    return response.content[0].text

In [None]:
# ---- class names used to locate elements on the review page ----
reveiw_div = "afkKaa-4T28-"        # list of reviews (list tag)
reviewer_info = "_0Uufw15R3a4-"    # info about the reviewers (section tag)
review_class = "MpiILQAMSSg-"      # info about the reviews (div tag)

# ---- metadata getters ----
reviewer_place = "POyqzNMT21k- C7Tp-bANpE4-"   # branch that was reviewed (p tag)
review_info_div = "_6rFG6U7PA6M-"              # review (div tag)

review_whole_rating = "yuQYBV659bs-"
rating_info = "R9NyqT2lRqw-"       # stars div (div tag)
review_stars = "tSiVMQB9es0-"      # number of stars (div tag)

dates_class = "iLkEeQbexGs-"

# Today's date, also kept as a DD/MM/YYYY string.
today = date.today()
formatted_date = f"{today:%d/%m/%Y}"

In [28]:
# URL of the OpenTable restaurant page whose reviews are scraped.
# NOTE(review): the query string contains p=2, which looks like a page number —
# confirm whether scraping is meant to start on the second page.
link = r"https://www.opentable.com/r/nobu-downtown-new-york?corrid=6d9decfc-52bf-4a13-b971-bc8d9cddf57e&avt=eyJ2IjoyLCJtIjoxLCJwIjowLCJzIjowLCJuIjowfQ&p=2&sd=2024-12-01T19%3A00%3A00"

# Start a Chrome session and load the page; `driver` is used by the scraping cells.
driver = webdriver.Chrome()
driver.get(link)

In [6]:
def scrapeData(driver, reviews:list, cities:list, dates: list, stars:list, ratings: list, names:list, max_reloads: int = 10):
    """Scrape the currently loaded page and append its short reviews to the given lists.

    Only reviews that do not contain "Read more" and are shorter than 150
    characters are kept. For every kept review one entry is appended to each
    of the six output lists, which are modified in place.

    Args:
        driver: Selenium WebDriver positioned on a review page.
        reviews, cities, dates, stars, ratings, names: Output lists.
        max_reloads: How many times the page may be refreshed while waiting
            for the review elements to appear before giving up.

    Returns:
        True if the page had to be refreshed at least once, otherwise False.
    """
    # Selectors are built once; they do not change between reload attempts.
    xpath_cities = rf"//section[@class='{reviewer_info}']//p[@class='{reviewer_place}']"
    xpath_reviews = rf"//div[@class='{review_class}']//div[@class='{review_info_div}']"
    xpath_dates = f"//p[@class='{dates_class}']"
    css_stars = f".{rating_info} .{review_stars}"
    # NOTE(review): `all_ratings` and `reviewer_name` are not defined in any cell
    # visible in this notebook; they must exist in the kernel namespace or the
    # two lines below raise NameError.
    css_ratings = f".{all_ratings}"
    css_names = f".{reviewer_name}"

    reloaded = False
    reload_count = 0
    while True:
        city_elements = driver.find_elements(By.XPATH, xpath_cities)  # getting cities
        if city_elements:
            break
        if reload_count >= max_reloads:
            # Give up on this page instead of refreshing forever.
            print(f"Warning: no reviews found after {reload_count} reloads, skipping page")
            print("Reloaded: ", reloaded)
            return reloaded
        driver.refresh()  # reload the page if things haven't been loaded
        time.sleep(1)
        reloaded = True
        reload_count += 1

    columns = (
        driver.find_elements(By.XPATH, xpath_reviews),        # getting reviews
        city_elements,
        driver.find_elements(By.XPATH, xpath_dates),          # getting dates
        driver.find_elements(By.CSS_SELECTOR, css_stars),     # getting stars
        driver.find_elements(By.CSS_SELECTOR, css_ratings),   # getting ratings
        driver.find_elements(By.CSS_SELECTOR, css_names),     # getting names
    )
    counts = [len(column) for column in columns]
    if len(set(counts)) > 1:
        # The lists are matched by position; zip() below stops at the shortest
        # one so that a mismatch cannot raise IndexError.
        print("Warning: element counts differ (reviews, cities, dates, stars, ratings, names):", counts)

    for review_el, city_el, date_el, star_el, rating_el, name_el in zip(*columns):
        text = review_el.text  # read once; each access queries the browser
        if "Read more" in text or len(text) >= 150:
            continue
        reviews.append(text)
        cities.append(city_el.text)
        dates.append(date_el.text)
        stars.append(star_el.get_attribute("innerHTML"))
        ratings.append(rating_el.text.replace('\n', ' '))
        names.append(name_el.text)

    print("Reloaded: ", reloaded)
    return reloaded

    # element = driver.find_elements(By.CSS_SELECTOR, f".{review_class} .{review_info_div}") # commenting for future referencing 
def _find_next_button(driver):
    """Return the pagination link to click for the next page, or None if there is none."""
    next_button_class = "ojKcSDzr190- y4S9mw-uCFI- g-dxt-fQ2ZU- C7Tp-bANpE4-"
    candidates = driver.find_elements(By.XPATH, f"//div[@class='TkpxbcBbu80-']//a[@class='{next_button_class}']")
    if not candidates:
        return None
    # When more than one link matches, the second one is used.
    # NOTE(review): presumably the "previous" and "next" links share the class — confirm.
    return candidates[1] if len(candidates) > 1 else candidates[0]


def scrapeResturant(driver):
    """Scrape every review page of the restaurant currently open in `driver`.

    The current page is scraped with scrapeData, then the "next" link is
    looked up again and clicked; this repeats until no displayed "next" link
    is found, so the final page is scraped as well.

    Args:
        driver: Selenium WebDriver already positioned on the first page to scrape.

    Returns:
        The tuple (reviews, cities, dates, stars, ratings, names) of lists.
    """
    reviews = []
    cities = []
    dates = []
    stars = []
    ratings = []
    names = []

    while True:
        scrapeData(driver, reviews, cities, dates, stars, ratings, names)

        # Look the button up after scraping so the element belongs to the
        # page that is currently loaded.
        next_button = _find_next_button(driver)
        if next_button is None or not next_button.is_displayed():
            break

        # A JavaScript click is used because the regular WebDriver click()
        # was not working for this element.
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(1)
    return (reviews, cities, dates, stars, ratings, names)


In [34]:
# One fresh, independent list per scraped column.
reviews, stars, dates, cities, names, ratings = ([] for _ in range(6))



def save_to_tsv(file_name: str, content):
    """Write `content` to `file_name` as a UTF-8, tab-separated file without the index column.

    `content` may be anything accepted by the pandas DataFrame constructor.
    """
    frame = pd.DataFrame(content)
    frame.to_csv(file_name, index=False, sep='\t', encoding='utf-8')

def save_content(names, dates, cities, reviews, stars, ratings=None):
    """Ask the user whether to save the scraped data and, if so, write it to data.tsv.

    Nothing is written unless the user answers Y/y AND there is at least one
    review, so an existing data.tsv is never replaced by an empty table.

    Args:
        names, dates, cities, reviews, stars: Equal-length sequences, one entry per review.
        ratings: Ratings column. When omitted, the module-level `ratings`
            variable is used, which keeps existing five-argument calls working.
    """
    if ratings is None:
        # The parameter shadows the global of the same name, hence globals().
        ratings = globals()["ratings"]

    choice = input("Do you want to save you data? (Y/N)")
    # Both conditions must hold; previously `or` bound looser than `and`, so
    # an upper-case "Y" saved even when there were no reviews.
    if choice.strip().lower() == 'y' and len(reviews) > 0:
        content = {
            "Names": names,
            "dates": dates,
            "cities": cities,
            "reviews": reviews,
            "ratings": ratings,
            "stars": stars
        }

        save_to_tsv('data.tsv', pd.DataFrame(content))
def readData():
    """Fill the module-level review lists from data.tsv or by scraping.

    Asks the user to choose between reading the local file (L) and scraping
    (S). Afterwards the globals reviews, cities, stars, dates, names and
    ratings hold plain Python lists in both cases. Any other answer leaves
    the globals unchanged and prints a message.
    """
    global reviews, cities, stars, dates, names, ratings
    choice = input("Read Local (L) or Scrape Data (S)").strip().lower()
    if choice == 'l':
        df = pd.read_csv('data.tsv', sep='\t', encoding='utf-8')
        # Convert to lists so both branches produce the same types.
        reviews = df['reviews'].tolist()
        stars = df['stars'].tolist()
        dates = df['dates'].tolist()
        cities = df['cities'].tolist()
        names = df['Names'].tolist()
        ratings = df['ratings'].tolist()

    elif choice == 's':
        # NOTE(review): the requested count is collected but not applied;
        # scrapeResturant takes no limit and scrapes every page.
        approx_count = int(input("Enter approx number of reviews to scrape: "))
        (reviews, cities, dates, stars, ratings, names) = scrapeResturant(driver)

    else:
        print(f"Unrecognised choice {choice!r}: nothing was loaded.")


readData()
# Offer to save only after the lists have been filled; asking before readData()
# could only ever see the freshly emptied lists.
save_content(names, dates, cities, reviews, stars)
# quit() ends the whole browser session rather than only closing the window.
driver.quit()
    

In [None]:
# Accumulator for the parsed reviews: maps a date string to a list of review summaries.
complete_review = dict()


In [91]:

back_off = 1
MAX_TRIES_PER_QUERY = 3
# Slice of `reviews` processed by this run; adjust to resume where a previous run stopped.
START_INDEX = 674
END_INDEX = 950
RATE_LIMIT_PAUSE = 12  # seconds to wait after every answered query


def _parse_answer(answer):
    """Split a model answer into ({category: reason}, date), or return None if malformed.

    Expected shape: concerns, reasons and date separated by tabs, with the
    individual concerns and reasons separated by '/'. For example:
    Food/Service<TAB>Food was amazing/but Jan was rude to me<TAB>11/11/2023
    """
    fields = answer.split('\t')
    if len(fields) < 3:
        return None
    print(fields)
    concerns = fields[0].split('/')
    reasons = fields[1].split('/')
    if len(concerns) != len(reasons):
        return None

    categories = {}
    for concern, reason in zip(concerns, reasons):
        # The first matching label wins, in this order.
        for label in ("Food", "Service", "Environment"):
            if label in concern:
                categories[label] = reason
                break
    # Strip stray whitespace so the same date is not stored under two keys.
    return categories, fields[2].strip()


# Clamp the upper bound so a short `reviews` list cannot raise IndexError.
for i in range(START_INDEX, min(END_INDEX, len(reviews))):
    print(f"Processing review ({i}): {reviews[i]}")
    prompt = f"{reviews[i]} {dates[i]} (today's date = {formatted_date})"
    answer = None
    tries = 0

    while tries < MAX_TRIES_PER_QUERY:
        try:
            answer = queryFun(prompt)  # Attempt the query
            back_off = 1  # Reset backoff on success
            print("Reviewed!")
            break  # Exit retry loop
        except Exception as e:
            print(f"Error in review ({i}): {reviews[i]} - Attempt {tries + 1} failed. Error: {e}")
            tries += 1
            back_off = min(back_off * 2, 64)  # Cap the backoff to 64 seconds
            time.sleep(back_off)  # Wait before retrying

    # If the query failed after all retries
    if answer is None:
        print(f"Skipping review ({i}) after {MAX_TRIES_PER_QUERY} attempts.")
        back_off = 1
        continue

    parsed = _parse_answer(answer)
    if parsed is not None:
        # `review_date` rather than `date`, so the imported datetime.date
        # class is not overwritten.
        categories, review_date = parsed
        summary = {
            'name': names[i],
            "categories": categories,
            "feedback": reviews[i],
            'rating': ratings[i],
            "stars": float(stars[i]),
            "city": cities[i],
        }
        complete_review.setdefault(review_date, []).append(summary)

    # Pause after every answered query, including malformed answers, so the
    # next request still respects the rate limit.
    time.sleep(RATE_LIMIT_PAUSE)

# Output the completed review dictionary
# print(len(complete_review), '\n', complete_review)
# with open('review_analysis.json', 'w') as file:
#     json.dump(complete_review, file, indent=4)

# Output the completed review dictionary
# print(len(complete_review), '\n', complete_review)
# with open('review_analysis.json', 'w') as file:
#     json.dump(complete_review, file, indent=4)

Processing review (674): Too expensive. Service was not that good. Quality is only ok.
Reviewed!
['Service/Food', 'Service was not that good/Quality is only ok', '10/09/2022']
Processing review (675): Great place and food
Reviewed!
['Environment/Food', 'Great place/and food', '09/09/2022']
Processing review (676): Fantastic, highly recommended.
Reviewed!
['Food', 'Fantastic, highly recommended', '07/09/2022']
Processing review (677): Amazing evening! Nobu never disappoints. Fantastic service and food.
Reviewed!
['Food/Service', 'Fantastic service and food', '06/09/2022']
Processing review (678): Send off dinner
Reviewed!
['Service', 'Send off dinner', '05/09/2022']
Processing review (679): The Ambience, Cuisine and Service was top Tier. Went for a bday dinner and had a great experience.
Reviewed!
['Environment/Food/Service', 'The Ambience, Cuisine and Service was top Tier', '04/09/2022']
Processing review (680): Great food!! Service was very good and great atmosphere!
Reviewed!
['Food/

In [92]:
# Show how many distinct dates were collected, followed by the full dictionary.
print(len(complete_review), '\n', complete_review)
# next time append, do not overwrite
with open('review_analysis.json', 'w') as out_file:
    serialized = json.dumps(complete_review, indent=4)
    out_file.write(serialized)

492 
 {'11/12/2024': [{'name': 'Jeff', 'categories': {'Food': 'Food incredible'}, 'feedback': 'Always amazing dining experience.\nFood incredible. A must stop in NYC.', 'rating': 'Overall 5 Food 5 Service 5 Ambience 5', 'stars': 4.7, 'city': 'Charleston'}], '10/12/2024': [{'name': 'Stacey', 'categories': {'Food': 'Nobu is excellent'}, 'feedback': 'Nobu is excellent as always! Always my favorite restaurant.', 'rating': 'Overall 5 Food 5 Service 5 Ambience 5', 'stars': 4.7, 'city': 'Orlando'}, {'name': 'CC', 'categories': {'Food': 'Great dinner', 'Service': 'and great, fast service'}, 'feedback': 'Great dinner and great, fast service. The food was excellent.', 'rating': 'Overall 5 Food 5 Service 5 Ambience 5', 'stars': 5.0, 'city': 'New York City'}], '06/12/2024': [{'name': 'Yvonne', 'categories': {'Service': 'She said two very obnoxious things on the way out'}, 'feedback': 'It was all excellent except for the rude coat check girl. She said two very obnoxious things on the way out. Not a

In [93]:
# Bare expression: the notebook displays the dictionary as this cell's output.
complete_review

{'11/12/2024': [{'name': 'Jeff',
   'categories': {'Food': 'Food incredible'},
   'feedback': 'Always amazing dining experience.\nFood incredible. A must stop in NYC.',
   'rating': 'Overall 5 Food 5 Service 5 Ambience 5',
   'stars': 4.7,
   'city': 'Charleston'}],
 '10/12/2024': [{'name': 'Stacey',
   'categories': {'Food': 'Nobu is excellent'},
   'feedback': 'Nobu is excellent as always! Always my favorite restaurant.',
   'rating': 'Overall 5 Food 5 Service 5 Ambience 5',
   'stars': 4.7,
   'city': 'Orlando'},
  {'name': 'CC',
   'categories': {'Food': 'Great dinner',
    'Service': 'and great, fast service'},
   'feedback': 'Great dinner and great, fast service. The food was excellent.',
   'rating': 'Overall 5 Food 5 Service 5 Ambience 5',
   'stars': 5.0,
   'city': 'New York City'}],
 '06/12/2024': [{'name': 'Yvonne',
   'categories': {'Service': 'She said two very obnoxious things on the way out'},
   'feedback': 'It was all excellent except for the rude coat check girl. She

In [None]:
# Run one hand-written review through queryFun; the returned text is shown as the cell output.
queryFun(f" Great dinner and great, fast service. The food was excellent. Dined on November 11, 2023 (today's date = {formatted_date})")