## Unsupervised Experiments on the Yelp Dataset

**Disclaimer**

This project is an independent academic endeavor and has not been endorsed, sponsored, or otherwise authorized by Yelp Inc. It was created for educational purposes under the terms of Yelp's Data Agreement and does not reflect the views or opinions of Yelp. All data from Yelp used in this project is the property of Yelp Inc., and this project's findings and conclusions are solely those of the author(s). Yelp® and its related marks are trademarks of Yelp.

In [36]:
import json
import os
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import folium

from mpl_toolkits import basemap

The following two functions download the Yelp dataset from Kaggle and read the JSON files into pandas DataFrames.

In [ ]:
def download_and_extract_yelp_dataset(destination_dir='data'):
    """
    Downloads and extracts the Yelp dataset from Kaggle into the specified directory.

    The archive is downloaded as ``yelp-dataset.zip`` inside ``destination_dir``,
    unpacked in place, and then deleted so only the extracted files remain.

    Args:
    destination_dir (str): The directory to download and extract the dataset to. Defaults to 'data'.

    Returns:
    None. Progress messages are printed to stdout.
    """
    # NOTE(review): authenticate() presumably needs Kaggle credentials to be
    # configured on this machine -- confirm against the Kaggle API docs.
    api = KaggleApi()
    api.authenticate()

    dataset = 'yelp-dataset/yelp-dataset'
    zip_path = os.path.join(destination_dir, 'yelp-dataset.zip')

    # exist_ok=True creates the directory only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(destination_dir, exist_ok=True)

    # Download the archive as-is; extraction is done explicitly below
    print(f"Downloading Yelp dataset to {zip_path}...")
    api.dataset_download_files(dataset, path=destination_dir, unzip=False)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        print(f"Extracting Yelp dataset to {destination_dir}...")
        zip_ref.extractall(destination_dir)

    # Remove the zip file once its contents have been extracted
    os.remove(zip_path)
    print("Download and extraction complete.")

# Fetch the dataset into the default 'data' directory. This goes through the
# Kaggle API client, so it authenticates first and then downloads the archive.
download_and_extract_yelp_dataset()

In [2]:
def read_json_files(loc='data'):
    """
    Reads the JSON Lines files in the specified location and converts each one into a pandas dataframe.

    Every regular file directly inside ``loc`` is parsed line by line, one JSON
    document per non-blank line. Files that are not valid UTF-8 text or that
    contain a line which is not valid JSON are reported and skipped.
    Sub-directories are ignored.

    The dataframe name is the last underscore-separated token of the file name
    before its first dot, e.g. ``yelp_academic_dataset_review.json`` -> ``review``.

    :param loc: str: The location to read the json files from
    :return: dict: A dictionary containing all the dataframes as values and their name as keys
    """
    dataset = {}
    for file in os.listdir(loc):
        path = os.path.join(loc, file)

        # Directories (e.g. notebook checkpoint folders) cannot be opened as
        # files and would otherwise abort the whole load.
        if not os.path.isfile(path):
            continue

        print('reading file: ', file)
        try:
            # JSON text is UTF-8 by specification (RFC 8259), so decode it
            # explicitly instead of relying on the platform default encoding.
            with open(path, 'r', encoding='utf-8') as f:
                # Blank lines carry no record; skipping them keeps a stray
                # empty line from discarding an otherwise valid file.
                data = [json.loads(line) for line in f if line.strip()]

        # Skip invalid files (binary content or non-JSON text)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print('skipping {}'.format(file))
            continue

        # Extract name of the dataset and create a dataframe from the parsed records
        name = file.split('.')[0].split('_')[-1]
        dataset[name] = pd.DataFrame(data)

    return dataset

# Load every parseable file from the default 'data' directory into a dict of
# DataFrames keyed by dataset name; each file is read fully into memory.
datasets = read_json_files()

reading file:  yelp_academic_dataset_review.json
reading file:  yelp_academic_dataset_business.json
reading file:  yelp_academic_dataset_user.json
reading file:  Dataset_User_Agreement.pdf
skipping Dataset_User_Agreement.pdf
reading file:  yelp_academic_dataset_checkin.json
reading file:  yelp_academic_dataset_tip.json


In [4]:
# List the names of the tables that were loaded.
datasets.keys()

dict_keys(['review', 'business', 'user', 'checkin', 'tip'])

In [26]:
# Keep a direct reference to the business table and preview its first rows.
business = datasets['business']
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
