# Dataset Curation

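For this demo, we use the [Yelp Open Dataset](https://business.yelp.com/data/resources/open-dataset/) to collect reviews of local cafes. Download the dataset and place its business and review JSON files in the `data/` directory before running the cells below. We then join the reviews to the cafe businesses and take a random sample that's large enough to be representative of the distribution.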

In [None]:
import pandas as pd  
import numpy as np

In [None]:
# Read the line-delimited business file in chunks of 100,000 records, then
# concatenate the chunks into a single DataFrame.
business_chunk_iter = pd.read_json(
    'data/yelp_academic_dataset_business.json',
    lines=True,
    chunksize=100_000,
)
df_business = pd.concat(business_chunk_iter)

# Preview the first rows as the cell output.
df_business.head()

In [None]:
# Read the line-delimited review file in chunks of 100,000 records, then
# concatenate the chunks into a single DataFrame.
review_chunk_iter = pd.read_json(
    'data/yelp_academic_dataset_review.json',
    lines=True,
    chunksize=100_000,
)
df_review = pd.concat(review_chunk_iter)

# Preview the first rows as the cell output.
df_review.head()

In [None]:
# Select the businesses whose `categories` string mentions "Cafes"
# (case-insensitive). Rows with a missing `categories` value count as
# non-matches because of na=False.
is_cafe = df_business['categories'].str.contains('Cafes', case=False, na=False)
cafe_businesses = df_business.loc[
    is_cafe, ['business_id', 'name', 'address', 'city', 'state']
]

# Build one "street, city, state" string per business *before* joining, so the
# work is done once per business instead of once per review. Blank or missing
# parts are skipped: a business without a street address becomes "City, ST"
# instead of ", City, ST" (blank street) or NaN (missing value), which is what
# plain string concatenation would produce.
cafe_businesses = cafe_businesses.assign(
    address=[
        ', '.join(
            str(part).strip()
            for part in parts
            if pd.notna(part) and str(part).strip()
        )
        for parts in cafe_businesses[['address', 'city', 'state']].itertuples(
            index=False, name=None
        )
    ]
)

# Keep only reviews written for those businesses and attach the business name
# and combined address. The merge is an inner join on `business_id`; the
# `isin` pre-filter just shrinks the left side before joining.
df_cafes = (
    df_review[df_review['business_id'].isin(cafe_businesses['business_id'])]
    .merge(cafe_businesses[['business_id', 'name', 'address']], on='business_id')
    .rename(columns={'review_id': 'id'})
)

# Final schema: review id, review text, business name, business address.
df_cafes = df_cafes[['id', 'text', 'name', 'address']]

df_cafes.head()

In [None]:
# Draw a fixed-size random subset of the cafe reviews. The seed pins the draw,
# so re-running the cell on the same input frame selects the same rows.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 22
df_sampled = df_cafes.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

# Preview the first sampled rows as the cell output.
df_sampled.head()

In [None]:
# Write the sample to disk as line-delimited JSON: one record per line.
df_sampled.to_json(
    'data/cafes.json',
    orient='records',
    lines=True,
)