# Phase 1

In [1]:
# Pre-analysis data wrangling

import pandas as pd
import ast
from pathlib import Path

# Read the raw venue export into a DataFrame.
# NOTE(review): output further down this notebook shows an "UPDATING CSV" step
# that reports saving back to this same path. If that is what happened,
# re-running this cell will no longer see the original rows. Confirm, and
# consider writing the filtered data to a separate file.
data_path = Path("../data/study1_venues_20250117.csv")
df_raw = pd.read_csv(data_path)

# Dataset size and schema, emitted with a single print call
print(
    f"Total venues in dataset: {len(df_raw)}",
    f"\nColumns: {list(df_raw.columns)}",
    sep="\n",
)

# Section banner for the review-column checks that follow
print(
    f"\n{'='*60}",
    "REVIEW DATA CHECK",
    f"{'='*60}",
    sep="\n",
)

# Count venues with/without reviews
def count_reviews(raw_val):
    """Count the reviews held in one cell of a venue's review data.

    Parameters
    ----------
    raw_val : object
        An already-parsed ``list`` of reviews, a string containing a Python
        literal (parsed with ``ast.literal_eval``), or any other value such
        as NaN/None for a missing entry.

    Returns
    -------
    int
        ``len`` of the review list. ``0`` for missing values, for strings
        that cannot be parsed as a Python literal, and for any value that is
        not (or does not parse to) a ``list``.
    """
    # Handle real lists before anything else: pd.isna() on a list returns an
    # element-wise array, and using that array in an `if` raises ValueError
    # ("truth value of an array ... is ambiguous") for multi-element lists.
    if isinstance(raw_val, list):
        return len(raw_val)

    if isinstance(raw_val, str):
        try:
            reviews = ast.literal_eval(raw_val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed literal: count it as "no reviews", same as a missing
            # value. Only the exceptions literal_eval is documented to raise
            # are caught, so unrelated errors are not hidden.
            return 0
        return len(reviews) if isinstance(reviews, list) else 0

    # NaN, None, numbers, dicts, ...: nothing countable.
    return 0

# Attach a per-venue review tally to the raw frame
df_raw['review_count'] = df_raw['google_reviews'].apply(count_reviews)

# Name the two masks once so the counts and the listing below share them
has_reviews = df_raw['review_count'] > 0
lacks_reviews = df_raw['review_count'] == 0
venues_with_reviews = has_reviews.sum()
venues_without_reviews = lacks_reviews.sum()

# Headline split, with each share expressed as a percentage of all rows
print(
    f"\nVenues WITH reviews: {venues_with_reviews} ({venues_with_reviews/len(df_raw)*100:.1f}%)",
    f"Venues WITHOUT reviews: {venues_without_reviews} ({venues_without_reviews/len(df_raw)*100:.1f}%)",
    sep="\n",
)

# Summary statistics of the per-venue tally
print(f"\nReview count distribution:", df_raw['review_count'].describe(), sep="\n")

# List up to ten venues whose tally is zero, then note how many were omitted
if venues_without_reviews > 0:
    print(f"\n⚠️  Venues without reviews:")
    no_reviews = df_raw.loc[lacks_reviews, ['id', 'name']].head(10)
    print(no_reviews.to_string(index=False))
    if venues_without_reviews > 10:
        print(f"... and {venues_without_reviews - 10} more")

# Ten venues with the highest tallies
print(f"\nTop 10 venues by review count:")
top_reviews = df_raw[['id', 'name', 'review_count']].nlargest(10, 'review_count')
print(top_reviews.to_string(index=False))

# Closing banner
print(
    f"\n{'='*60}",
    "✅ Data check complete",
    f"{'='*60}",
    sep="\n",
)



Total venues in dataset: 596

Columns: ['address', 'latLng', 'allDetails', 'venueStatus', 'googlePlaceId', 'googleMapsPlaceId', 'googleTagsLastSynced', 'admin', 'createdBy_UserID', 'cityId', 'voraaTagThree', 'topVoraaTags', 'images', 'placeIdAddedBy', 'googleMapsTags', 'name', 'voraaTagOne', 'voraaTagTwo', 'updatedAt', 'associatedDeals', 'timezone', 'id', 'setToActiveDate', 'openingHours', 'venueDealsInfo', 'lastUpdated', 'createdDate', 'contactAndURLs', 'partnerAccountID', 'synced', 'allRatings', 'venueVideo', 'description', 'associatedHappyHours', 'venueHappyHoursInfo', 'legalVenueVerification', 'video', 'place_description', 'google_reviews']

REVIEW DATA CHECK

Venues WITH reviews: 555 (93.1%)
Venues WITHOUT reviews: 41 (6.9%)

Review count distribution:
count    596.000000
mean       4.592282
std        1.320761
min        0.000000
25%        5.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: review_count, dtype: float64

⚠️  Venues without reviews:
        

In [2]:
# Check what load_venue_data() actually returns (after filtering)
import sys
from pathlib import Path

# Put the project's src/ directory first on sys.path so gentags is importable
sys.path.insert(0, str(Path("../src").resolve()))

from gentags.data import load_venue_data

# Go through the project loader. The original note on this cell says it
# filters out venues with no reviews; the loader itself is not visible here.
df_loaded = load_venue_data("../data/study1_venues_20250117.csv")

print(
    f"\n{'='*60}",
    "LOAD_VENUE_DATA() RESULTS",
    f"{'='*60}",
    sep="\n",
)
print(f"\nVenues loaded (after filtering): {len(df_loaded)}")

# Per-venue review counts, computed once and reused by every check below
# (assumes each google_reviews entry supports len() — TODO confirm)
review_counts_loaded = df_loaded['google_reviews'].apply(len)
print(f"Venues with reviews in loaded data: {(review_counts_loaded > 0).sum()}")

print(f"\nReview count distribution (loaded data):")
print(review_counts_loaded.describe())

# Side-by-side of the raw CSV tallies and the loader result
print(
    f"\n{'='*60}",
    "COMPARISON",
    f"{'='*60}",
    sep="\n",
)
# The "Difference" figure is measured against raw venues that have reviews
# (venues_with_reviews), not against the raw total.
print(
    f"Raw CSV total venues: {len(df_raw)}",
    f"Raw CSV venues WITH reviews: {venues_with_reviews}",
    f"Raw CSV venues WITHOUT reviews: {venues_without_reviews}",
    f"\nAfter load_venue_data() filtering: {len(df_loaded)} venues",
    f"Difference: {venues_with_reviews - len(df_loaded)} venues filtered out",
    sep="\n",
)

# Sanity check: the loaded frame should contain no empty review lists
empty_after_load = (review_counts_loaded == 0).sum()
if empty_after_load > 0:
    print(f"\n⚠️  WARNING: {empty_after_load} venues in loaded data have empty review lists!")
else:
    print(f"\n✅ All loaded venues have reviews (as expected)")


LOAD_VENUE_DATA() RESULTS

Venues loaded (after filtering): 553
Venues with reviews in loaded data: 553

Review count distribution (loaded data):
count    553.000000
mean       4.880651
std        0.551879
min        1.000000
25%        5.000000
50%        5.000000
75%        5.000000
max        5.000000
Name: google_reviews, dtype: float64

COMPARISON
Raw CSV total venues: 596
Raw CSV venues WITH reviews: 555
Raw CSV venues WITHOUT reviews: 41

After load_venue_data() filtering: 553 venues
Difference: 2 venues filtered out

✅ All loaded venues have reviews (as expected)



UPDATING CSV

Original rows: 596
Rows with reviews: 555
Rows removed: 41

✅ Updated CSV saved to ../data/study1_venues_20250117.csv
✅ CSV now contains only venues with reviews
