### Yelp Dataset 10Mar2025 Validations or Proofs of Concept
#### 1. Extract Tar File

In [None]:
#! pip install geopandas

In [None]:
from src import modules as f
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

# Lookup table: two-letter state abbreviation -> full state name
# (the 50 states plus the District of Columbia).
us_state_abbrev_to_name = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona",
    "AR": "Arkansas", "CA": "California", "CO": "Colorado",
    "CT": "Connecticut", "DE": "Delaware", "FL": "Florida",
    "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho",
    "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana",
    "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts",
    "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi",
    "MO": "Missouri", "MT": "Montana", "NE": "Nebraska",
    "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey",
    "NM": "New Mexico", "NY": "New York", "NC": "North Carolina",
    "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma",
    "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee",
    "TX": "Texas", "UT": "Utah", "VT": "Vermont",
    "VA": "Virginia", "WA": "Washington", "WV": "West Virginia",
    "WI": "Wisconsin", "WY": "Wyoming", "DC": "District of Columbia",
}

# Load business data
business_df = pd.read_parquet("./data/business.parquet")

# Upper-case the state code BEFORE grouping. Normalising after the groupby
# leaves one row per original spelling (e.g. "ca" and "CA" both become "CA"),
# and those duplicate rows would duplicate that state's geometry in the merge.
state_counts = (
    business_df
    .assign(state=business_df["state"].str.upper())
    .groupby("state")
    .size()
    .reset_index(name="business_count")
)
# Codes that are not in the lookup table map to NaN and so never match a shape.
state_counts["state_name"] = state_counts["state"].map(us_state_abbrev_to_name)

# Load GeoJSON (read from the remote URL each time this cell runs)
us_states_url = "https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_500k.json"
us_states = gpd.read_file(us_states_url)

# Merge using full state names. The left join keeps every shape in the GeoJSON;
# shapes without a matching count are filled with 0.
merged = us_states.merge(state_counts, left_on="NAME", right_on="state_name", how="left")
merged["business_count"] = merged["business_count"].fillna(0).astype(int)

# Plot: choropleth of merged["business_count"], one polygon per state shape
choropleth_style = dict(
    column="business_count",
    cmap="OrRd",
    edgecolor="black",
    legend=True,
    legend_kwds={'label': "Number of Yelp Businesses", 'shrink': 0.5},
)

fig, ax = plt.subplots(figsize=(15, 10))
merged.plot(ax=ax, **choropleth_style)

ax.set_title("Yelp Business Count by State", fontsize=16)
ax.set_xlim([-180, -60])  # include AK
ax.set_ylim([15, 75])     # include HI
ax.axis("off")            # hide ticks and frame; same call plt.axis("off") makes on the current axes
plt.show()



In [None]:
# Distinct state codes found in the business data. Despite the "Sample" label,
# this prints every unique value of state_counts['state'], not a subset.
print("Sample Yelp states:", state_counts['state'].unique())

In [None]:
# Business Summary Table

# Load all necessary data
review_df = pd.read_parquet("./data/review.parquet")
checkin_df = pd.read_parquet("./data/checkin.parquet")
tip_df = pd.read_parquet("./data/tip.parquet")
photo_df = pd.read_parquet("./data/photo.parquet")

# --- Aggregations ---
# Unique users per business, per source (kept for inspection)
users_from_reviews = review_df.groupby("business_id")["user_id"].nunique()
users_from_tips = tip_df.groupby("business_id")["user_id"].nunique()

# Unique users per business across BOTH sources.
# Adding the two per-source counts double-counts anyone who both reviewed and
# tipped the same business, so stack the (business_id, user_id) pairs from the
# two tables and count each distinct user once.
user_business_pairs = pd.concat(
    [
        review_df[["business_id", "user_id"]],
        tip_df[["business_id", "user_id"]],
    ],
    ignore_index=True,
)
total_users = user_business_pairs.groupby("business_id")["user_id"].nunique()

# Total reviews / tips / photos: number of rows each table holds per business
total_reviews, total_tips, total_photos = (
    source.groupby("business_id").size()
    for source in (review_df, tip_df, photo_df)
)

# Total checkins (count timestamps): "date" is split on commas and the pieces
# are counted per row, then summed per business.
checkin_df["checkin_count"] = [
    len(stamps) for stamps in checkin_df["date"].str.split(",")
]
total_checkins = checkin_df["checkin_count"].groupby(checkin_df["business_id"]).sum()

# --- Combine all metrics ---
# The Series are aligned on their business_id index; a business missing from
# one source gets NaN for that metric, which is replaced by 0 before the int cast.
metric_columns = {
    "total_users": total_users,
    "total_reviews": total_reviews,
    "total_checkins": total_checkins,
    "total_tips": total_tips,
    "total_photos": total_photos,
}
agg_df = (
    pd.DataFrame(metric_columns)
    .fillna(0)
    .astype(int)
    .reset_index()  # make business_id a column
)

# Display
# Preview the first rows of the per-business summary table.
from IPython.display import display
display(agg_df.head())

In [None]:
# Size of the per-business summary as (rows, columns).
display(agg_df.shape)

In [None]:
# User summary table
# --- total_reviews & total_businesses_visited ---
# One row per user_id: how many review_id values they have and how many
# distinct business_id values those reviews cover.
reviews_by_user = review_df.groupby("user_id")
review_stats = reviews_by_user.agg(
    total_reviews=pd.NamedAgg(column="review_id", aggfunc="count"),
    total_businesses_visited=pd.NamedAgg(column="business_id", aggfunc="nunique"),
)

# --- total_checkins ---
# Users are attached to check-ins only through business_id here, so this
# figure is a proxy: the summed check-in counts of the businesses a user reviewed.
# Count check-ins per business
checkin_expanded = checkin_df.copy()
checkin_expanded["checkin_count"] = checkin_expanded["date"].str.split(",").apply(len)

# Join with review_df to associate users to check-ins via business_id.
# Reduce the reviews to distinct (user_id, business_id) pairs first: joining
# against raw review rows adds a business's check-ins once per review, so a
# user with several reviews of one business was counted several times.
reviewer_business_pairs = review_df[["user_id", "business_id"]].drop_duplicates()
checkin_with_users = checkin_expanded.merge(
    reviewer_business_pairs,
    on="business_id",
    how="left"
)

# Sum check-ins by user (rows without a matching reviewer have a NaN user_id
# and are dropped by groupby)
user_checkins = checkin_with_users.groupby("user_id")["checkin_count"].sum()

# --- total_tips_written ---
tips_per_user = tip_df.groupby("user_id").size()
user_tips = tips_per_user.rename("total_tips_written")

# --- Merge all ---
# Both Series are aligned on review_stats' user_id index, so only users that
# appear in review_stats are kept; a user missing from a Series gets NaN,
# which the final cleanup turns into 0 before casting to int.
user_summary = (
    review_stats
    .assign(total_checkins=user_checkins, total_tips_written=user_tips)
    .fillna(0)
    .astype(int)
    .reset_index()
)

# Display
# Preview the first rows of the per-user summary table.
from IPython.display import display
display(user_summary.head())

In [None]:
# Size of the per-user summary as (rows, columns).
display(user_summary.shape)

In [None]:
# Memory snapshot from psutil.virtual_memory(), printed in GB (bytes / 1e9).
import psutil
mem = psutil.virtual_memory()
print(f"Used: {mem.used / 1e9:.2f} GB / Total: {mem.total / 1e9:.2f} GB")

### Recommender Model

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress convergence warnings from NMF
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load business + review data
business_df = pd.read_parquet("./data/business.parquet")
review_df = pd.read_parquet("./data/review.parquet")

# California businesses only, reduced to the id and name columns
is_california = business_df["state"] == "CA"
ca_businesses = business_df.loc[is_california, ["business_id", "name"]]

# Attach the business name to each review. merge defaults to an inner join,
# so reviews of businesses outside ca_businesses drop out here.
ca_reviews = pd.merge(review_df, ca_businesses, on="business_id")

# User x business matrix: rows = user_id, columns = business_id,
# cell = number of reviews that user wrote for that business (0 if none)
pivot_df = pd.pivot_table(
    ca_reviews,
    values="review_id",
    index="user_id",
    columns="business_id",
    aggfunc="count",
    fill_value=0,
)

from sklearn.decomposition import NMF

# Train NMF model
nmf_settings = dict(
    n_components=11,
    init='random',
    random_state=42,
    max_iter=500,  # increase from 200
)
nmf_model = NMF(**nmf_settings)

# user_features rows follow pivot_df's row order; business_features columns
# follow pivot_df's column order (the recommender indexes them that way).
user_features = nmf_model.fit_transform(pivot_df)
business_features = nmf_model.components_

# Save for recommendation
user_index, business_columns = pivot_df.index, pivot_df.columns

import numpy as np
import pandas as pd

# business_id -> business name, for fast lookup of names
business_id_to_name = dict(
    zip(ca_businesses["business_id"], ca_businesses["name"])
)

def recommend_top_5(user_id: str):
    """Recommend up to five not-yet-reviewed businesses for one user.

    The user's row of ``user_features`` is multiplied with
    ``business_features`` to get one predicted score per column of
    ``pivot_df``. Businesses the user already reviewed (cell > 0 in
    ``pivot_df``) are excluded, and the highest-scoring remainder is returned.

    Relies on the notebook globals ``pivot_df``, ``user_features``,
    ``business_features``, ``business_columns`` and ``business_id_to_name``;
    ``business_columns`` must be in the same order as ``pivot_df.columns``.

    Args:
        user_id: Identifier to look up in ``pivot_df.index``.

    Returns:
        A DataFrame with the columns ``business_id``, ``name`` and
        ``predicted_score`` (rounded to 4 decimals), best score first, with at
        most five rows. If ``user_id`` is not in ``pivot_df.index`` a message
        string is returned instead.
    """
    if user_id not in pivot_df.index:
        return f"User {user_id} not found in California data."

    # Direct index lookup; the previous list(pivot_df.index).index(...) copied
    # the whole index into a Python list on every call.
    user_idx = pivot_df.index.get_loc(user_id)

    # Predict all business scores
    user_pred = np.dot(user_features[user_idx], business_features)

    # Column positions of businesses the user has not reviewed yet
    already_reviewed = pivot_df.iloc[user_idx].to_numpy() > 0
    unseen = np.flatnonzero(~already_reviewed)

    # A stable sort on the negated scores keeps the original column order among
    # ties, exactly like sorted(..., reverse=True) did.
    best = unseen[np.argsort(-user_pred[unseen], kind="stable")[:5]]

    # Add names
    rows = [
        {
            "business_id": business_columns[pos],
            "name": business_id_to_name.get(business_columns[pos], "Unknown"),
            "predicted_score": round(user_pred[pos], 4),
        }
        for pos in best
    ]

    # Explicit columns keep the schema even when nothing is left to recommend.
    return pd.DataFrame(rows, columns=["business_id", "name", "predicted_score"])

In [None]:
# Memory snapshot from psutil.virtual_memory(), printed in GB (bytes / 1e9).
import psutil
mem = psutil.virtual_memory()
print(f"Used: {mem.used / 1e9:.2f} GB / Total: {mem.total / 1e9:.2f} GB")

In [None]:
# Fixed seed so the same five rows are shown on every run.
ca_reviews.sample(5, random_state=42)

In [None]:
# Shows a DataFrame of recommendations, or a "not found" message string when
# the id is absent from pivot_df.
recommend_top_5("ptDybsokuV3T_E7phLR28w")  # Replace with real user_id from CA

In [None]:
!pip uninstall tensorflow -y

In [None]:
!pip install tensorflow[and-cuda]

In [None]:
# Sanity check: print how many GPU devices TensorFlow lists.
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

In [None]:
# load for the first time to set up files
# NOTE(review): presumably converts the raw Yelp JSON into the ./data/*.parquet
# files that earlier cells read. If so, on a fresh checkout this cell has to
# run before them. TODO confirm against src/modules.py.
f.json_2_parquet()

In [None]:
import os
from pyspark.sql import SparkSession

# Address Spark should bind to. The value is specific to one machine, so it is
# only a fallback: setdefault leaves an already-exported SPARK_LOCAL_IP
# untouched instead of overwriting it.
os.environ.setdefault("SPARK_LOCAL_IP", "192.168.5.29")

# Build Spark session WITHOUT RAPIDS (local mode, all cores)
spark = (
    SparkSession.builder
    .appName("SparkNoGPU")
    .master("local[*]")
    .getOrCreate()
)

# Set log level to reduce verbosity
spark.sparkContext.setLogLevel("ERROR")

print("Spark session created!")


#### 2. Import Tables (Parquet files converted from JSON)

In [None]:
# Directory holding the Parquet tables. parquet_dir is not defined anywhere in
# the visible notebook, so the read below raised NameError on a fresh kernel;
# "./data" is the location the pandas cells read the same files from.
parquet_dir = "./data"

# Load each Parquet file into a Spark DataFrame (10% sample without replacement, fixed seed)
business_df = (
    spark.read.parquet(os.path.join(parquet_dir, 'business.parquet'))
    .sample(withReplacement=False, fraction=0.1, seed=42)
)
# review_df   = spark.read.parquet(os.path.join(parquet_dir, 'review.parquet')).sample(False, 0.1, seed=42)
# checkin_df  = spark.read.parquet(os.path.join(parquet_dir, 'checkin.parquet')).sample(False, 0.1, seed=42)
# tip_df      = spark.read.parquet(os.path.join(parquet_dir, 'tip.parquet')).sample(False, 0.1, seed=42)
# user_df     = spark.read.parquet(os.path.join(parquet_dir, 'user.parquet')).sample(False, 0.1, seed=42)

# Show a sample from one DataFrame
business_df.show(5)

In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()