In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from geopy.distance import geodesic
from geopy.point import Point
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
import folium

In [77]:
#upload sheets with businessess and reviews
file_path_business = './data/yelp_dataset/yelp_academic_dataset_business.json'

business_df = pd.read_json(file_path_business, lines=True)

business_df = business_df[business_df['categories'].str.contains("Restaurants", na=False)]

business_ids_set = set(business_df['business_id'])

file_path_review = './data/yelp_dataset/yelp_academic_dataset_review.json'

filtered_reviews = []
#chunk loading needed as json is too heavy
chunk_size = 100000

for chunk in pd.read_json(file_path_review, lines=True, chunksize=chunk_size)
    filtered_chunk = chunk[chunk['business_id'].isin(business_ids_set)]
    filtered_reviews.append(filtered_chunk)

review_df = pd.concat(filtered_reviews, ignore_index=True)


In [251]:
#attributes consist of nested dictionaries, we have to faltten them to turn into plain text later
business_df['attributes']

3         {'RestaurantsDelivery': 'False', 'OutdoorSeati...
5         {'BusinessParking': 'None', 'BusinessAcceptsCr...
8         {'Caters': 'True', 'Alcohol': 'u'full_bar'', '...
9         {'RestaurantsAttire': ''casual'', 'Restaurants...
11        {'Alcohol': ''none'', 'OutdoorSeating': 'None'...
                                ...                        
150325    {'BikeParking': 'True', 'BusinessAcceptsCredit...
150327    {'WiFi': ''free'', 'RestaurantsGoodForGroups':...
150336    {'WheelchairAccessible': 'False', 'Restaurants...
150339    {'RestaurantsDelivery': 'False', 'BusinessAcce...
150340    {'Ambience': '{'touristy': False, 'hipster': F...
Name: attributes, Length: 52268, dtype: object

In [253]:
#THIS DATASET REQUIRES SERIOUS DATA CLEANING

In [247]:
import re
import ast
import json
import pandas as pd

def try_parse_dict(value):
    if isinstance(value, dict):
        return value
    if value is None or (isinstance(value, float) and pd.isna(value)) or str(value).strip() == '':
        return {}
    #unify format as lowercase
    s = str(value).strip()
    s = re.sub(r"\btrue\b", "True", s, flags=re.IGNORECASE)
    s = re.sub(r"\bfalse\b", "False", s, flags=re.IGNORECASE)
    s = re.sub(r"\bnone\b", "None", s, flags=re.IGNORECASE)
    try:
        parsed = ast.literal_eval(s)
        return parsed if isinstance(parsed, dict) else {}
    except:
        json_str = s.replace("'", '"')
        try:
            parsed_json = json.loads(json_str)
            return parsed_json if isinstance(parsed_json, dict) else {}
        except:
            return {}
#flatten nested dicts, keep hierarchies
def flatten_nested_dict(d):
    flattened = {}
    for key, value in d.items():
        if isinstance(value, str) and value.strip().startswith("{") and value.strip().endswith("}"):
            nested = try_parse_dict(value)
            for nk, nv in nested.items():
                flattened[f"{key}_{nk}"] = nv
        else:
            flattened[key] = value
    return flattened

#leave attribute only if it is true
def clean_attributes(attributes):
    cleaned = {}
    for k, v in attributes.items():
        if v is None or v is False:
            continue
        if isinstance(v, str) and v.strip().lower() in ['false', 'none', '']:
            continue
        cleaned[k] = v
    return cleaned

def parse_and_flatten_attributes(attr_value):
    parsed = try_parse_dict(attr_value)
    flattened = flatten_nested_dict(parsed)
    cleaned = clean_attributes(flattened)
    return cleaned

def transform_key_value(k, v):
    k = k.lower()
    v_str = str(v).lower().strip()

    # Remove 'u'' prefix if present
    if v_str.startswith("u'"):
        v_str = v_str[2:].strip()

    # skip parking handling here (we do it in combine_features)
    if k.startswith("businessparking_"):
        return ""

    # handle alcohol special case
    if k == 'alcohol':
        if v_str == "'full_bar'":
            # If full_bar is present, also mention beer_and_wine
            return 'alcohol_beer_and_wine alcohol_full_bar'
    
    # if value is none or no, skip
    if v_str in ("none", 'no'):
        return ''
    # if value is true or yes, just return the key
    if v_str in ('true', 'yes'):
        return k
    # otherwise return key_value where it is not binary
    return f"{k}_{v_str}".replace(' ', '')

def combine_features(row):
    # process categories: split by comma, strip, remove spaces
    categories_str = row['categories'].lower() if pd.notna(row['categories']) else ''
    categories_list = [cat.strip() for cat in categories_str.split(',')]
    processed_cats = [cat.replace(' ', '') for cat in categories_list]
    categories_str = ' '.join(processed_cats)

    attrs = row['flattened_attributes']

    # determine parking presence, concat all parking options
    parking_attrs = {k.lower(): str(v).lower().strip()
                     for k, v in attrs.items()
                     if k.lower().startswith("businessparking_")}

    true_count = sum(1 for val in parking_attrs.values() if val == "true")
    validated_true = (parking_attrs.get('businessparking_validated') == 'true')

    # parking present logic
    # if no parking attributes are True -> no parking
    # if only validated is True (and nothing else) -> no parking
    # otherwise parking is present
    if true_count == 0:
        parking_present = False
    elif true_count == 1 and validated_true:
        parking_present = False
    else:
        parking_present = (true_count > 0)

    transformed_attrs = []
    for k, v in attrs.items():
        token = transform_key_value(k, v)
        if token:
            transformed_attrs.append(token)

    # if parking is present, add 'parking' once to keep it clean
    if parking_present:
        transformed_attrs.append("parking")

    attributes_str = ' '.join(transformed_attrs)
    
    combined = f"{categories_str} {attributes_str}".strip()
    
    # final cleanup
    combined = re.sub(r'[^\w\s]', '', combined)    # Remove punctuation
    combined = re.sub(r'_+', '_', combined)         # Collapse multiple underscores
    combined = re.sub(r'\s+', ' ', combined)        # Collapse multiple spaces
    combined = combined.strip()

    return combined

#run example
business_df['flattened_attributes'] = business_df['attributes'].apply(parse_and_flatten_attributes)
business_df['combined_features'] = business_df.apply(combine_features, axis=1)
print("Sample combined_features:\n", business_df['combined_features'].iloc[9])


Sample combined_features:
 coffeetea food cafes bars winebars restaurants nightlife caters restaurantsgoodforgroups restaurantspricerange2_2 restaurantstakeout wifi_free noiselevel_average restaurantsattire_casual alcohol_beer_and_wine ambience_hipster ambience_trendy parking


In [245]:
#check how it turned out
business_df['attributes'].iloc[9]

{'OutdoorSeating': 'False',
 'Caters': 'True',
 'RestaurantsDelivery': 'False',
 'RestaurantsGoodForGroups': 'True',
 'RestaurantsPriceRange2': '2',
 'RestaurantsReservations': 'False',
 'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': True, 'valet': False}",
 'HasTV': 'False',
 'GoodForKids': 'False',
 'RestaurantsTakeOut': 'True',
 'WiFi': "u'free'",
 'NoiseLevel': "u'average'",
 'RestaurantsAttire': "u'casual'",
 'Alcohol': "u'beer_and_wine'",
 'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': True, 'touristy': False, 'trendy': True, 'upscale': False, 'casual': False}"}

In [209]:
#create the tfidf matrix
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(business_df['combined_features'])

In [211]:
#calculate cosine similarity for every place in regard to others
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [255]:
#function to make a recommendation for some user
def get_user_recommendations(user_id, top_n=10):
    # Get the user's highly rated restaurants
    user_reviews = review_df[review_df['user_id'] == user_id]
    high_rated = user_reviews[user_reviews['stars'] >= 5]
    high_rated_business_ids = high_rated['business_id'].unique()
    
    # combine the features of these restaurants
    user_pref = business_df[business_df['business_id'].isin(high_rated_business_ids)]['combined_features'].str.cat(sep=' ')
    
    # transform user preferences into TF-IDF vector
    user_tfidf = tfidf.transform([user_pref])
    
    # compute cosine similarity between user vector and all restaurants
    cosine_sim_user = linear_kernel(user_tfidf, tfidf_matrix)
    
    # get similarity scores
    sim_scores = list(enumerate(cosine_sim_user[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Exclude restaurants the user has already rated
    rated_business_ids = set(user_reviews['business_id'])
    recommendations = []
    for idx, score in sim_scores:
        business_id = business_df.iloc[idx]['business_id']
        if business_id not in rated_business_ids:
            recommendations.append((business_id, score))
        if len(recommendations) >= top_n:
            break
    
    # Return the top recommendations
    recommended_business_ids = [rec[0] for rec in recommendations]
    return business_df[business_df['business_id'].isin(recommended_business_ids)]

In [259]:
# seems like it works, but the recommendations should be close to person
# here we have 2 problems, businesses may be suitable according to attributes, but
# 1) recommendations may be too far away
# 2) overall scores of some places are terrible

# we will address it further
get_user_recommendations('6I1jouo2T1R-X2ykY2T65w', top_n = 20)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,flattened_attributes,combined_features
11439,F3n0sB5Dqn-N2fAA5TgmFQ,China Kitchen,"6280 Sharlands Ave, Ste 102",Reno,NV,89523,39.51713,-119.88598,3.5,141,1,"{'BikeParking': 'True', 'BusinessParking': '{'...","Restaurants, Chinese","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'...","{'BikeParking': 'True', 'NoiseLevel': 'u'avera...",restaurants chinese bikeparking noiselevel_ave...
11658,AtDtwv66pG52TehGR3f6tQ,Randazzo's Pizzeria,3648 Welsh Rd,Willow Grove,PA,19090,40.152574,-75.13743,4.0,40,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsGoo...","Pizza, Restaurants","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...","{'RestaurantsTakeOut': 'True', 'RestaurantsGoo...",pizza restaurants restaurantstakeout restauran...
26520,sCo3HqBO91ycE37jiKwq5A,Maurizio's Bistro,33 E Main St,Moorestown,NJ,8057,39.9647,-74.945674,3.5,78,1,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...","Italian, Restaurants","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...","{'RestaurantsAttire': 'u'casual'', 'Restaurant...",italian restaurants restaurantsattire_casual r...
27425,7m9QY137WtxvdhgSUdetaw,River Des Peres Yacht Club,7832 Ivory Ave,Saint Louis,MO,63111,38.548272,-90.264757,4.5,44,1,"{'Caters': 'True', 'WiFi': ''no'', 'NoiseLevel...","Restaurants, Sandwiches","{'Monday': '10:0-19:0', 'Tuesday': '10:0-19:0'...","{'Caters': 'True', 'WiFi': ''no'', 'NoiseLevel...",restaurants sandwiches caters wifi_no noiselev...
34411,6-5hbU746kMNgim0YOOg6w,Urban Cookhouse,1907 Broadway,Nashville,TN,37203,36.150426,-86.79683,4.0,356,1,"{'GoodForMeal': '{'dessert': False, 'latenight...","American (New), Automotive, Caterers, American...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-20:30',...","{'GoodForMeal_lunch': True, 'RestaurantsReserv...",americannew automotive caterers americantradit...
37561,TWwN0AzD_GhAw9_ikBE7dQ,Romano's Macaroni Grill,5100 E Broadway Blvd,Tucson,AZ,85711,32.221247,-110.883279,3.0,175,0,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Italian, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...","{'RestaurantsGoodForGroups': 'True', 'Restaura...",italian restaurants restaurantsgoodforgroups r...
37637,UU9jmX08ZRU9DLI36wsKnA,Luke's,"6741 N Thornydale Rd, Ste 157",Tucson,AZ,85741,32.329324,-111.046807,3.5,111,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsPri...","Sandwiches, Restaurants, Italian","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...","{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",sandwiches restaurants italian restaurantstake...
40808,7VrD6atyAIOPJAEdAvsA1g,Capriotti's Sandwich Shop,120 Lantana Dr,Hockessin,DE,19707,39.772729,-75.710244,3.0,55,1,"{'BikeParking': 'True', 'RestaurantsReservatio...","Sandwiches, Delis, Restaurants","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'...","{'BikeParking': 'True', 'RestaurantsGoodForGro...",sandwiches delis restaurants bikeparking resta...
41592,jzAN3WQf-iJY3AURfh1cYw,Mantis Gourmet Chinese Food,8250 N Cortaro Rd,Marana,AZ,85743,32.356341,-111.091798,4.5,217,1,"{'RestaurantsReservations': 'False', 'Alcohol'...","Chinese, Restaurants","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'...","{'Alcohol': 'u'none'', 'HasTV': 'True', 'GoodF...",chinese restaurants alcohol_none hastv goodfor...
46498,D7ypVwmT8535TsPslvVUQw,Port of Subs,"720 S Meadows Pkwy, Ste 4",Reno,NV,89521,39.437943,-119.757845,2.5,60,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Delis, Restaurants, Sandwiches","{'Monday': '8:0-20:0', 'Tuesday': '8:0-20:0', ...","{'Ambience_casual': True, 'OutdoorSeating': 'T...",delis restaurants sandwiches ambience_casual o...


In [40]:
# Helper Function: Haversine Distance
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points on the Earth.

    Parameters:
    - lat1, lon1: Latitude and longitude of point 1.
    - lat2, lon2: Latitude and longitude of point 2.

    Returns:
    - distance (float): Distance between the two points in kilometers.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    km = 6371 * c  # Radius of Earth in kilometers
    return km


In [42]:
# Function 1: Estimate User's Preferred Location(s)
def estimate_user_preferred_location(user_id, business_df, review_df, eps_km=5, min_samples=3):
    """
    Estimate the user's preferred location(s) based on their historical reviews using DBSCAN.

    Parameters:
    - user_id (str): The ID of the user.
    - business_df (DataFrame): DataFrame containing business information including latitude and longitude.
    - review_df (DataFrame): DataFrame containing review information.
    - eps_km (float): Maximum distance between points in kilometers for DBSCAN (default is 5 km).
    - min_samples (int): Minimum number of points to form a cluster for DBSCAN (default is 3).

    Returns:
    - user_locations (list of tuples): List of (latitude, longitude) representing user's preferred locations.
    """
    # Get all reviews made by the user
    user_reviews = review_df[review_df['user_id'] == user_id]

    # Merge with business data to get location information
    user_business_reviews = user_reviews.merge(business_df, on='business_id', how='left')

    # Ensure the 'date' column is in datetime format
    user_business_reviews['date'] = pd.to_datetime(user_business_reviews['date'])

    # Extract the latitude and longitude of user's visited restaurants
    coords = user_business_reviews[['latitude', 'longitude']].dropna().values

    if len(coords) == 0:
        # If no location data is available, return an empty list
        return []

    # Convert coordinates to radians for haversine metric
    coords_rad = np.radians(coords)

    # Convert eps from kilometers to radians (Earth's radius ~6371 km)
    earth_radius_km = 6371
    eps_rad = eps_km / earth_radius_km

    # Apply DBSCAN clustering
    db = DBSCAN(eps=eps_rad, min_samples=min_samples, metric='haversine').fit(coords_rad)
    clusters = db.labels_

    # Append cluster labels to user_business_reviews
    user_business_reviews['cluster'] = clusters

    # Exclude noise points (cluster = -1)
    cluster_counts = user_business_reviews['cluster'].value_counts()
    if (cluster_counts.index == -1).any():
        cluster_counts = cluster_counts[cluster_counts.index != -1]
    
    if cluster_counts.empty:
        # If no clusters found, return the centroid of all visited locations
        centroid = (coords[:, 0].mean(), coords[:, 1].mean())
        return [centroid]

    # Identify the densest cluster
    densest_cluster = cluster_counts.idxmax()
    cluster_points = user_business_reviews[user_business_reviews['cluster'] == densest_cluster][['latitude', 'longitude']]

    # Calculate the geographic center (centroid) of the densest cluster
    cluster_center = cluster_points.mean().to_list()

    return [tuple(cluster_center)]


In [44]:
# Function 2: Build TF-IDF Model
def build_tfidf_model(business_df):
    """
    Build the TF-IDF vectorizer and transform the business data into a TF-IDF matrix.

    Parameters:
    - business_df (DataFrame): DataFrame containing business information.

    Returns:
    - tfidf_vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
    - tfidf_matrix (sparse matrix): TF-IDF feature matrix for the businesses.
    """
    # Combine categories and attributes into a single text field
    business_df['combined_features'] = business_df['categories'].fillna('') + ' ' + business_df['attributes'].fillna('').apply(
        lambda x: ' '.join(x) if isinstance(x, list) else str(x)
    )

    # Text preprocessing
    business_df['combined_features'] = business_df['combined_features'].str.lower()
    business_df['combined_features'] = business_df['combined_features'].str.replace('[^\w\s]', ' ', regex=True)

    # Build the TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')

    # Fit and transform the combined features
    tfidf_matrix = tfidf_vectorizer.fit_transform(business_df['combined_features'])

    return tfidf_vectorizer, tfidf_matrix

  business_df['combined_features'] = business_df['combined_features'].str.replace('[^\w\s]', ' ', regex=True)


In [46]:
 #Function 3: Make Recommendations
def make_recommendations(user_id, business_df, review_df, tfidf_vectorizer, tfidf_matrix, user_locations, top_n=10, max_distance=20, content_weight=0.5, distance_weight=0.1, rating_weight=0.4):
    """
    Generate restaurant recommendations for a user, considering content similarity, proximity, and overall ratings.

    Parameters:
    - user_id (str): The ID of the user.
    - business_df (DataFrame): DataFrame containing business information.
    - review_df (DataFrame): DataFrame containing review information.
    - tfidf_vectorizer (TfidfVectorizer): The fitted TF-IDF vectorizer.
    - tfidf_matrix (sparse matrix): TF-IDF feature matrix for the businesses.
    - user_locations (list of tuples): List of user's preferred locations (latitude, longitude).
    - top_n (int): Number of recommendations to return (default is 10).
    - max_distance (float): Maximum distance (in km) to consider for recommendations (default is 20 km).
    - content_weight (float): Weight for content similarity in combined score (default is 0.5).
    - distance_weight (float): Weight for proximity in combined score (default is 0.3).
    - rating_weight (float): Weight for overall rating in combined score (default is 0.2).

    Returns:
    - recommendations_df (DataFrame): DataFrame containing the top recommended businesses.
    """
    # Ensure that the weights sum to 1
    total_weight = content_weight + distance_weight + rating_weight
    content_weight /= total_weight
    distance_weight /= total_weight
    rating_weight /= total_weight

    # Get the user's reviews
    user_reviews = review_df[review_df['user_id'] == user_id]

    # Identify businesses the user has already reviewed
    rated_business_ids = set(user_reviews['business_id'])

    # Get the user's highly rated restaurants
    high_rated = user_reviews[user_reviews['stars'] >= 4]
    high_rated_business_ids = high_rated['business_id'].unique()

    # Fallback: If no positive reviews, recommend nearby places with high ratings
    if len(high_rated_business_ids) == 0:
        print(f"User {user_id} has no positive reviews. Recommending nearby popular places.")
        # Compute distances to user's preferred locations
        def min_distance_to_user(row, user_locations):
            distances = [haversine_distance(loc[0], loc[1], row['latitude'], row['longitude']) for loc in user_locations]
            return min(distances)

        business_df['distance_to_user'] = business_df.apply(
            lambda row: min_distance_to_user(row, user_locations),
            axis=1
        )

        # Filter businesses within max_distance and with high ratings
        recommendations = business_df[
            (business_df['distance_to_user'] <= max_distance) &
            (business_df['stars'] >= 4.0)
        ].copy()

        # Exclude businesses the user has already reviewed
        recommendations = recommendations[~recommendations['business_id'].isin(rated_business_ids)]

        # Sort by stars (descending) and distance_to_user (ascending)
        recommendations = recommendations.sort_values(by=['stars', 'distance_to_user'], ascending=[False, True])

        return recommendations.head(top_n)

    # Combine the features of these restaurants to create the user's preference profile
    user_pref = business_df[business_df['business_id'].isin(high_rated_business_ids)]['combined_features'].str.cat(sep=' ')

    # Transform user preferences into TF-IDF vector
    user_tfidf = tfidf_vectorizer.transform([user_pref])

    # Compute cosine similarity between user vector and all restaurants
    cosine_sim_user = linear_kernel(user_tfidf, tfidf_matrix).flatten()

    # Compute distances to user's preferred locations
    def min_distance_to_user(row, user_locations):
        distances = [haversine_distance(loc[0], loc[1], row['latitude'], row['longitude']) for loc in user_locations]
        return min(distances)

    business_df['distance_to_user'] = business_df.apply(
        lambda row: min_distance_to_user(row, user_locations),
        axis=1
    )

    # Normalize overall ratings
    max_rating = business_df['stars'].max()
    min_rating = business_df['stars'].min()
    business_df['normalized_rating'] = (business_df['stars'] - min_rating) / (max_rating - min_rating)

    # Prepare the recommendations
    recommendations = []

    for idx in range(len(business_df)):
        business_id = business_df.iloc[idx]['business_id']

        # Exclude restaurants the user has already visited
        if business_id in rated_business_ids:
            continue

        # Get content similarity score
        content_sim = cosine_sim_user[idx]

        # Get distance to user's preferred location(s)
        distance = business_df.iloc[idx]['distance_to_user']

        # Exclude restaurants beyond the maximum distance
        if distance > max_distance:
            continue

        # Normalize distance score
        distance_score = 1 - (distance / max_distance)
        distance_score = np.clip(distance_score, 0, 1)

        # Get normalized rating
        normalized_rating = business_df.iloc[idx]['normalized_rating']

        # Combine content similarity, distance score, and overall rating
        combined_score = (
            content_sim * content_weight +
            distance_score * distance_weight +
            normalized_rating * rating_weight
        )

        # Only consider businesses with a positive combined score
        if combined_score > 0:
            recommendations.append({
                'business_id': business_id,
                'name': business_df.iloc[idx]['name'],
                'address': business_df.iloc[idx]['address'],
                'city': business_df.iloc[idx]['city'],
                'stars': business_df.iloc[idx]['stars'],
                'distance_to_user': distance,
                'score': combined_score
            })

    # If no recommendations found, return an empty DataFrame
    if not recommendations:
        print("No recommendations found within the specified parameters.")
        return pd.DataFrame()

    # Sort recommendations by the combined score
    recommendations = sorted(recommendations, key=lambda x: x['score'], reverse=True)

    # Get the top N recommendations
    top_recommendations = recommendations[:top_n]

    # Convert to DataFrame
    recommendations_df = pd.DataFrame(top_recommendations)

    return recommendations_df

In [48]:
# Build the TF-IDF model
tfidf_vectorizer, tfidf_matrix = build_tfidf_model(business_df)

# Specify the user ID
user_id = '6I1jouo2T1R-X2ykY2T65w' # Replace with the actual user ID

# Estimate user's preferred location(s)
user_locations = estimate_user_preferred_location(user_id, business_df, review_df, eps_km=5, min_samples=3)

if not user_locations:
    print(f"No location data available for user {user_id}.")
else:
    print(f"Estimated user preferred locations: {user_locations}")

    # Generate recommendations
    recommendations_df = make_recommendations(
        user_id=user_id,
        business_df=business_df,
        review_df=review_df,
        tfidf_vectorizer=tfidf_vectorizer,
        tfidf_matrix=tfidf_matrix,
        user_locations=user_locations,
        top_n=10,
        max_distance=20,
        content_weight=0.6,
        distance_weight=0.15,
        rating_weight=0.25
    )

    # Display the recommendations
    if not recommendations_df.empty:
        print("Top Recommendations:")
        print(recommendations_df[['name', 'address', 'city', 'stars', 'distance_to_user', 'score']])
    else:
        print("No recommendations could be generated.")

Estimated user preferred locations: [(38.683815984975, -90.435681838325)]
Top Recommendations:
                                     name                address  \
0                             Dino's Deli       510 Westport Plz   
1                            Gioia's Deli    623 N New Ballas Rd   
2            Carreta's Mexican Restaurant    11939 Olive Blvd St   
3                Hilary's Roadhouse Diner       11488 Dorsett Rd   
4                            King Burrito     11084 Midland Blvd   
5                          Sides Of Seoul         10084 Page Ave   
6  Fred and Ricky's Plant Delicious Foods         64 Weldon Pkwy   
7                           Lion's Choice       12010 Olive Blvd   
8                            O! Wing Plus         10094 Page Ave   
9                      La Tejana Taqueria  3157 N Lindbergh Blvd   

               city  stars  distance_to_user     score  
0       Saint Louis    4.5          2.172653  0.931441  
1       Creve Coeur    4.5          1.9816

In [49]:
#look alright but let's consider the price range too

In [71]:
def extract_price_range(business_df):
    """
    Extracts and preprocesses the price range data from the business DataFrame.

    Parameters:
    - business_df (DataFrame): DataFrame containing business information.

    Returns:
    - business_df (DataFrame): Updated DataFrame with a new 'price_range' column.
    """
    # Extract the 'RestaurantsPriceRange2' attribute
    # Some entries may not have 'RestaurantsPriceRange2'
    business_df['price_range'] = business_df['RestaurantsPriceRange2'].fillna('Unknown')
    
    # Convert price range to numeric, handle 'Unknown' or missing values
    business_df['price_range'] = pd.to_numeric(business_df['price_range'], errors='coerce')
    
    # Handle missing values by replacing them with the median price range
    median_price = business_df['price_range'].median()
    business_df['price_range'] = business_df['price_range'].fillna(median_price)
    
    return business_df


In [73]:
def normalize_price_range(business_df):
    """
    Normalizes the price range values to a scale between 0 and 1.

    Parameters:
    - business_df (DataFrame): DataFrame containing business information with 'price_range'.

    Returns:
    - business_df (DataFrame): Updated DataFrame with a new 'normalized_price' column.
    """
    # Normalize price range between 0 and 1
    max_price = business_df['price_range'].max()
    min_price = business_df['price_range'].min()
    business_df['normalized_price'] = (business_df['price_range'] - min_price) / (max_price - min_price)
    
    # Invert the score if lower prices are preferred (optional)
    business_df['normalized_price'] = 1 - business_df['normalized_price']
    
    return business_df


In [75]:
def make_recommendations(
    user_id,
    business_df,
    review_df,
    tfidf_vectorizer,
    tfidf_matrix,
    user_locations,
    top_n=10,
    max_distance=20,
    content_weight=0.4,
    distance_weight=0.3,
    rating_weight=0.2,
    price_weight=0.1
):
    # Ensure that the weights sum to 1
    total_weight = content_weight + distance_weight + rating_weight + price_weight
    content_weight /= total_weight
    distance_weight /= total_weight
    rating_weight /= total_weight
    price_weight /= total_weight

    # Get the user's reviews
    user_reviews = review_df[review_df['user_id'] == user_id]

    # Identify businesses the user has already reviewed
    rated_business_ids = set(user_reviews['business_id'])

    # Get the user's highly rated restaurants
    high_rated = user_reviews[user_reviews['stars'] >= 4]
    high_rated_business_ids = high_rated['business_id'].unique()

    # Fallback: If no positive reviews, recommend nearby places with high ratings
    if len(high_rated_business_ids) == 0:
        print(f"User {user_id} has no positive reviews. Recommending nearby popular places.")
        # Compute distances to user's preferred locations
        def min_distance_to_user(row, user_locations):
            distances = [haversine_distance(loc[0], loc[1], row['latitude'], row['longitude']) for loc in user_locations]
            return min(distances)

        business_df['distance_to_user'] = business_df.apply(
            lambda row: min_distance_to_user(row, user_locations),
            axis=1
        )

        # Filter businesses within max_distance and with high ratings
        recommendations = business_df[
            (business_df['distance_to_user'] <= max_distance) &
            (business_df['stars'] >= 4.0)
        ].copy()

        # Exclude businesses the user has already reviewed
        recommendations = recommendations[~recommendations['business_id'].isin(rated_business_ids)]

        # Sort by stars (descending) and distance_to_user (ascending)
        recommendations = recommendations.sort_values(by=['stars', 'distance_to_user'], ascending=[False, True])

        return recommendations.head(top_n)

    # Combine the features of these restaurants to create the user's preference profile
    user_pref = business_df[business_df['business_id'].isin(high_rated_business_ids)]['combined_features'].str.cat(sep=' ')

    # Transform user preferences into TF-IDF vector
    user_tfidf = tfidf_vectorizer.transform([user_pref])

    # Compute cosine similarity between user vector and all restaurants
    cosine_sim_user = linear_kernel(user_tfidf, tfidf_matrix).flatten()

    # Compute distances to user's preferred locations
    def min_distance_to_user(row, user_locations):
        distances = [haversine_distance(loc[0], loc[1], row['latitude'], row['longitude']) for loc in user_locations]
        return min(distances)

    business_df['distance_to_user'] = business_df.apply(
        lambda row: min_distance_to_user(row, user_locations),
        axis=1
    )

    # Normalize overall ratings
    max_rating = business_df['stars'].max()
    min_rating = business_df['stars'].min()
    business_df['normalized_rating'] = (business_df['stars'] - min_rating) / (max_rating - min_rating)

    # Prepare the recommendations
    recommendations = []

    for idx in range(len(aligned_business_df)):
        business_id = aligned_business_df.iloc[idx]['business_id']
    
        # Exclude restaurants the user has already visited
        if business_id in rated_business_ids:
            continue
    
        # Map the business_id to the row index in tfidf_matrix
        try:
            matrix_idx = train_business_df[train_business_df['business_id'] == business_id].index[0]
        except IndexError:
            continue  # Skip businesses not in the tfidf_matrix
    
        # Get content similarity score
        content_sim = cosine_sim_user[matrix_idx]
    
        # Continue with the rest of the recommendation logic

        # Get distance to user's preferred location(s)
        distance = business_df.iloc[idx]['distance_to_user']

        # Exclude restaurants beyond the maximum distance
        if distance > max_distance:
            continue

        # Normalize distance score
        distance_score = 1 - (distance / max_distance)
        distance_score = np.clip(distance_score, 0, 1)

        # Get normalized rating
        normalized_rating = business_df.iloc[idx]['normalized_rating']

        # Get normalized price
        normalized_price = business_df.iloc[idx]['normalized_price']

        # Combine content similarity, distance score, overall rating, and price range
        combined_score = (
            content_sim * content_weight +
            distance_score * distance_weight +
            normalized_rating * rating_weight +
            normalized_price * price_weight
        )

        # Only consider businesses with a positive combined score
        if combined_score > 0:
            recommendations.append({
                'business_id': business_id,
                'name': business_df.iloc[idx]['name'],
                'address': business_df.iloc[idx]['address'],
                'city': business_df.iloc[idx]['city'],
                'stars': business_df.iloc[idx]['stars'],
                'price_range': business_df.iloc[idx]['price_range'],
                'distance_to_user': distance,
                'score': combined_score
            })

    # If no recommendations found, return an empty DataFrame
    if not recommendations:
        print("No recommendations found within the specified parameters.")
        return pd.DataFrame()

    # Sort recommendations by the combined score
    recommendations = sorted(recommendations, key=lambda x: x['score'], reverse=True)

    # Get the top N recommendations
    top_recommendations = recommendations[:top_n]

    # Convert to DataFrame
    recommendations_df = pd.DataFrame(top_recommendations)

    return recommendations_df


In [77]:
def extract_price_range(business_df):
    """
    Extracts and preprocesses the price range data from the business DataFrame.

    Parameters:
    - business_df (DataFrame): DataFrame containing business information.

    Returns:
    - business_df (DataFrame): Updated DataFrame with a new 'price_range' column.
    """
    # Extract 'RestaurantsPriceRange2' from the 'attributes' column if it exists
    def get_price_range(attr):
        if isinstance(attr, dict) and 'RestaurantsPriceRange2' in attr:
            return attr['RestaurantsPriceRange2']
        elif isinstance(attr, str):
            try:
                attr_dict = eval(attr)
                return attr_dict.get('RestaurantsPriceRange2', None)
            except:
                return None
        return None

    business_df['price_range'] = business_df['attributes'].apply(get_price_range)
    
    # Convert price range to numeric, handle missing values
    business_df['price_range'] = pd.to_numeric(business_df['price_range'], errors='coerce')
    median_price = business_df['price_range'].median()
    business_df['price_range'] = business_df['price_range'].fillna(median_price)
    
    return business_df


In [79]:
# Check if 'RestaurantsPriceRange2' exists in attributes
if 'attributes' in business_df.columns:
    try:
        business_df = extract_price_range(business_df)
        business_df = normalize_price_range(business_df)
    except KeyError:
        print("'RestaurantsPriceRange2' not found. Assigning default price ranges.")
        business_df['price_range'] = 2  # Assign default median value
        business_df['normalized_price'] = 0.5  # Neutral normalized value
else:
    print("'attributes' column not found. Skipping price range handling.")
    business_df['price_range'] = 2  # Assign default median value
    business_df['normalized_price'] = 0.5  # Neutral normalized value

# Build the TF-IDF model
tfidf_vectorizer, tfidf_matrix = build_tfidf_model(business_df)

# Specify the user ID
user_id = '6I1jouo2T1R-X2ykY2T65w'  # Replace with the actual user ID

# Estimate user's preferred location(s)
user_locations = estimate_user_preferred_location(user_id, business_df, review_df, eps_km=5, min_samples=3)

if not user_locations:
    print(f"No location data available for user {user_id}.")
else:
    print(f"Estimated user preferred locations: {user_locations}")

    # Generate recommendations
    recommendations_df = make_recommendations(
        user_id=user_id,
        business_df=business_df,
        review_df=review_df,
        tfidf_vectorizer=tfidf_vectorizer,
        tfidf_matrix=tfidf_matrix,
        user_locations=user_locations,
        top_n=10,
        max_distance=20,
        content_weight=0.4,
        distance_weight=0.3,
        rating_weight=0.2,
        price_weight=0.1  # Adjust as desired
    )

    # Display the recommendations
    if not recommendations_df.empty:
        print("Top Recommendations:")
        print(recommendations_df[['name', 'address', 'city', 'stars', 'price_range', 'distance_to_user', 'score']])
    else:
        print("No recommendations could be generated.")


Estimated user preferred locations: [(38.683815984975, -90.435681838325)]


IndexError: index 52271 is out of bounds for axis 0 with size 52268

In [None]:
def split_train_test(review_df, test_size=0.2):
    """
    Splits the dataset into train and test sets, with the last `test_size` proportion of reviews 
    for each user assigned to the test set.

    Parameters:
    - review_df (DataFrame): DataFrame containing review data.
    - test_size (float): Proportion of data to include in the test set for each user.

    Returns:
    - train_df (DataFrame): Training set.
    - test_df (DataFrame): Test set.
    """
    train_list = []
    test_list = []

    # Group reviews by user
    grouped = review_df.groupby('user_id')

    for user_id, group in grouped:
        # Sort reviews by date for each user
        group = group.sort_values(by='date')

        # Calculate the split index
        split_idx = int(len(group) * (1 - test_size))

        # Split into train and test sets
        train_list.append(group.iloc[:split_idx])
        test_list.append(group.iloc[split_idx:])

    # Combine all users' data back into train and test DataFrames
    train_df = pd.concat(train_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True)

    return train_df, test_df


In [None]:
def evaluate_full_model(test_df, business_df, train_df, tfidf_vectorizer, tfidf_matrix, top_n=10):
    """
    Evaluates the full recommendation model using Precision@N and Recall@N.

    Parameters:
    - test_df (DataFrame): Test set containing user reviews.
    - business_df (DataFrame): DataFrame containing business information.
    - train_df (DataFrame): Training set containing user reviews.
    - tfidf_vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer.
    - tfidf_matrix (sparse matrix): TF-IDF feature matrix for businesses.
    - top_n (int): Number of recommendations to consider for evaluation.

    Returns:
    - precision (float): Precision@N.
    - recall (float): Recall@N.
    """
    precision_list = []
    recall_list = []

    # Get unique users in the test set
    users = test_df['user_id'].unique()

    for user_id in users:
        # Generate recommendations for the user using the full model
        user_locations = estimate_user_preferred_location(user_id, business_df, train_df)
        if not user_locations:
            continue

        recommendations_df = make_recommendations(
            user_id=user_id,
            business_df=business_df,
            review_df=train_df,  # Use training set for recommendations
            tfidf_vectorizer=tfidf_vectorizer,
            tfidf_matrix=tfidf_matrix,
            user_locations=user_locations,
            top_n=top_n,
            max_distance=20,
            content_weight=0.4,
            distance_weight=0.3,
            rating_weight=0.2,
            price_weight=0.1
        )

        # Get the actual businesses reviewed by the user in the test set
        actual_businesses = set(test_df[test_df['user_id'] == user_id]['business_id'])

        # Get the recommended businesses
        recommended_businesses = set(recommendations_df['business_id'])

        # Calculate Precision and Recall
        relevant_recommendations = actual_businesses.intersection(recommended_businesses)
        precision = len(relevant_recommendations) / len(recommended_businesses) if recommended_businesses else 0
        recall = len(relevant_recommendations) / len(actual_businesses) if actual_businesses else 0

        precision_list.append(precision)
        recall_list.append(recall)

    # Average precision and recall across all users
    precision = np.mean(precision_list)
    recall = np.mean(recall_list)

    return precision, recall


In [None]:
# Step 1: Split the dataset
train_df, test_df = split_train_test(review_df, test_size=0.2)

# Step 2: Filter businesses in the training set
train_business_ids = set(train_df['business_id'])
train_business_df = business_df[business_df['business_id'].isin(train_business_ids)]

# Step 3: Train the TF-IDF model using the training businesses
tfidf_vectorizer, tfidf_matrix = build_tfidf_model(train_business_df)

# Step 4: Align business_df for recommendations
valid_business_ids = set(train_business_df['business_id'])
aligned_business_df = business_df[business_df['business_id'].isin(valid_business_ids)].reset_index(drop=True)

# Step 5: Evaluate the model
precision, recall = evaluate_full_model(
    test_df=test_df,
    business_df=aligned_business_df,  # Use aligned business_df
    train_df=train_df,
    tfidf_vectorizer=tfidf_vectorizer,
    tfidf_matrix=tfidf_matrix,
    top_n=10
)

print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")

In [None]:
# Align business_df with the businesses in the tfidf_matrix
valid_business_ids = train_business_df['business_id'].tolist()  # Use business IDs from the training set
aligned_business_df = business_df[business_df['business_id'].isin(valid_business_ids)].reset_index(drop=True)


In [None]:
assert len(aligned_business_df) == tfidf_matrix.shape[0], "Mismatch between businesses and TF-IDF matrix."


In [69]:
# Check for mismatched business IDs
mismatched_ids = set(aligned_business_df['business_id']) - set(train_business_df['business_id'])
print(f"Mismatched Business IDs: {mismatched_ids}")

Mismatched Business IDs: set()
