## Part 1: Pull Yelp user reviews based on location and category

In [1]:
# Import libraries
import requests
import pandas as pd
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
import random

from dotenv import load_dotenv, find_dotenv
import os

# Locate .env file and load it
dotenv_path = find_dotenv()
if dotenv_path:
    load_dotenv(dotenv_path)
else:
    raise FileNotFoundError(".env file not found!")

API_KEY = os.getenv("YELP_API_KEY")
# print("API_KEY:", API_KEY)
if not API_KEY:
    raise ValueError("API_KEY is missing! Check your .env file.")

print("API Key loaded successfully!")  # Remove this after debugging

API Key loaded successfully!


In [2]:
def search_businesses(term, location, food_style=None, limit=50, max_results=200):
    """Searches for businesses using pagination while respecting Yelp API limits."""
    url = "https://api.yelp.com/v3/businesses/search"
    headers = {"Authorization": f"Bearer {API_KEY}"}
    businesses = []
    offset = 0

    search_variations = [term, f"{term} {random.choice('ABCDEF')}", f"{term} - best", f"{term} near me"]
    sort_options = ["best_match", "rating", "review_count"]

    while len(businesses) < max_results:
        # Ensure we do not exceed Yelp's max offset limit of 240
        remaining_results = min(max_results - len(businesses), 50)
        if offset + remaining_results > 240:
            break  # Stop fetching if the limit exceeds 240

        params = {
            "term": random.choice(search_variations),
            "location": location,
            "limit": remaining_results,
            "offset": offset,
            "sort_by": random.choice(sort_options)  # Use different sort orders
        }

        # Add food_style to params if provided
        if food_style:
            params["categories"] = food_style

        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            data = response.json()
            new_businesses = data.get("businesses", [])
            if not new_businesses:
                break  # No more businesses to fetch

            businesses.extend(new_businesses)
            offset += limit  # Move to the next page
            
            # Check rate limit headers and handle the rate limit properly
            rate_limit_remaining = int(response.headers.get('X-RateLimit-Remaining', 0))
            if rate_limit_remaining == 0:
                reset_time = int(response.headers.get('X-RateLimit-Reset', time.time()))
                wait_time = reset_time - time.time()
                if wait_time > 0:
                    print(f"Rate limit exceeded. Sleeping for {wait_time} seconds until reset...")
                    time.sleep(wait_time)  # Sleep until reset time
            else:
                time.sleep(random.uniform(3, 5))  # Random sleep to avoid hitting rate limit
        elif response.status_code == 429:
            print("Rate limit hit! Sleeping before retrying...")
            # Handle rate limit by checking reset time
            reset_time = int(response.headers.get('X-RateLimit-Reset', time.time()))
            wait_time = reset_time - time.time()
            if wait_time > 0:
                print(f"Rate limit hit! Sleeping for {wait_time} seconds until reset...")
                time.sleep(wait_time)  # Wait until reset time
        else:
            print(f"Error searching businesses: {response.status_code}, {response.json()}")
            break

    return [
        {
            "id": business["id"],
            "name": business["name"],
            "location": ", ".join(business["location"]["display_address"]),
            "category": business["categories"][0]["title"] if business.get("categories") else "Unknown",
            "rating": business.get("rating", 0),
            "review_count": business.get("review_count", 0)
        }
        for business in businesses
    ]

def get_reviews(business_id):
    """Retrieves up to 3 reviews for a given business ID."""
    url = f"https://api.yelp.com/v3/businesses/{business_id}/reviews"
    headers = {"Authorization": f"Bearer {API_KEY}"}
    
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.json().get("reviews", [])
    elif response.status_code == 429:
        print("Rate limit hit while fetching reviews! Sleeping before retrying...")
        # Implementing exponential backoff to manage retry attempts
        reset_time = int(response.headers.get('X-RateLimit-Reset', time.time()))
        wait_time = reset_time - time.time()
        if wait_time > 0:
            print(f"Rate limit hit while fetching reviews! Sleeping for {wait_time} seconds...")
            time.sleep(wait_time)
        return get_reviews(business_id)  # Retry
    else:
        print(f"Error fetching reviews: {response.status_code}, {response.json()}")
        return []

def extract_review_data(reviews, business):
    """Extracts relevant data from reviews, including category."""
    return [
        {
            "review_id": review["id"],
            "user_id": review["user"]["id"],
            "rating": review["rating"],
            "text": review["text"],
            "time_created": review["time_created"],
            "business_id": business["id"],
            "business_name": business["name"],
            "location": business["location"],
            "category": business["category"]
        }
        for review in reviews
    ]

def get_filtered_reviews(term, location, rating_threshold=4, min_reviews=2000, max_results_per_location=500, food_style=None):
    """Gets reviews for businesses until we reach at least min_reviews count, filtering by food_style if provided."""
    all_reviews = []
    attempts = 0

    while len(all_reviews) < min_reviews and attempts < 10:  # Limit retries to avoid infinite loop
        print(f"Fetching businesses in {location} (attempt {attempts + 1})...")
        businesses = search_businesses(term, location, max_results=max_results_per_location, food_style=food_style)

        for business in businesses:
            reviews = get_reviews(business["id"])
            filtered_reviews = [r for r in extract_review_data(reviews, business) if r["rating"] >= rating_threshold]
            all_reviews.extend(filtered_reviews)

            time.sleep(random.uniform(3, 5))  # Randomized sleep to prevent rate limits

            if len(all_reviews) >= min_reviews:
                break  # Stop early if we've collected enough

        print(f"Total reviews collected so far: {len(all_reviews)}")
        attempts += 1

    df = pd.DataFrame(all_reviews)
    
    if len(df) < min_reviews:
        print(f"Warning: Only {len(df)} reviews collected, below the target of {min_reviews}.")

    df.to_csv("./Resources/yelp_filtered_reviews.csv", index=False)
    return df

# EXAMPLE USAGE
location = "Atlanta, GA"
term = "Restaurants"
food_style = "Italian"  # User input for food style
rating_threshold = 4  # Only get reviews with rating 4 or higher
review_df = get_filtered_reviews(term, location, rating_threshold=rating_threshold, food_style=food_style)

print(f"Saved {len(review_df)} reviews to yelp_filtered_reviews.csv")
display(review_df.head())

Fetching businesses in Atlanta, GA (attempt 1)...
Total reviews collected so far: 508
Fetching businesses in Atlanta, GA (attempt 2)...
Total reviews collected so far: 1039
Fetching businesses in Atlanta, GA (attempt 3)...
Total reviews collected so far: 1529
Fetching businesses in Atlanta, GA (attempt 4)...
Total reviews collected so far: 2001
Saved 2001 reviews to yelp_filtered_reviews.csv


Unnamed: 0,review_id,user_id,rating,text,time_created,business_id,business_name,location,category
0,a0xawbSfMoWU86xR367ysg,TTtgJzRUbOON32xVQsjErg,5,Fantastic service and a great atmosphere! If y...,2025-02-27 18:24:22,eG-UO83g_5zDk70FIJbm2w,South City Kitchen Midtown,"1144 Crescent Ave NE, Atlanta, GA 30309",Southern
1,_96BtfdQTpMMPA72cgl_9A,r2Cbc1xIuJxctEJQhUozFg,5,"As I searched for someplace great to eat, reas...",2025-03-26 06:40:55,eG-UO83g_5zDk70FIJbm2w,South City Kitchen Midtown,"1144 Crescent Ave NE, Atlanta, GA 30309",Southern
2,-PR8KeyLBF8cDo48ziVudA,4om9-6STuScS1BbLyYS36Q,4,"Delicious food, good drinks, fast service. I w...",2025-03-11 16:26:41,eG-UO83g_5zDk70FIJbm2w,South City Kitchen Midtown,"1144 Crescent Ave NE, Atlanta, GA 30309",Southern
3,w-k3D7aJOsKYh0Wdz9CzbQ,nuO42bJn2Mu498bbalMNqw,5,Whiskey Bird is your typical neighborhood rest...,2025-03-30 12:22:17,dfL1KYHtcs6YaFVx-nZTdQ,Whiskey Bird,"1409 North Highland Ave NE, Atlanta, GA 30306",New American
4,-XnUa3lVPM4HOF3uS9SD2w,Q7z79xji4Jube-VuaEe32A,5,Came here for a birthday brunch and we were im...,2025-03-12 10:45:19,dfL1KYHtcs6YaFVx-nZTdQ,Whiskey Bird,"1409 North Highland Ave NE, Atlanta, GA 30306",New American


# Get user_ids from yelp_reviews_with_user_info.csv

In [4]:
import pandas as pd

# Load your labeled dataset
df = pd.read_csv("./Resources/yelp_reviews_with_mood_labels.csv")

# Extract unique user_ids and save to CSV
user_ids_df = pd.DataFrame(df["user_id"].unique(), columns=["user_id"])
user_ids_df.to_csv("./Resources/unique_user_ids.csv", index=False)