In [4]:
# prompt: mount my drive

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# !tar -xzf /content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset.tgz -C /content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset

In [6]:
!tar -xzf /content/drive/MyDrive/recommender_dataset/Yelp/yelp_photos.tgz -C /content

^C


## Yelp Dataset JSON

Each file is composed of a single object type, one JSON-object per-line.

Take a look at some examples to get you started: [https://github.com/Yelp/dataset-examples](https://github.com/Yelp/dataset-examples).

Note: the follow examples contain inline comments, which are technically not valid JSON. This is done here to simplify the documentation and explaining the structure, the JSON files you download will not contain any comments and will be fully valid JSON.

### business.json

Contains business data including location data, attributes, and categories.

```json
{
    // string, 22 character unique string business id
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // string, the business's name
    "name": "Garaje",

    // string, the full address of the business
    "address": "475 3rd St",

    // string, the city
    "city": "San Francisco",

    // string, 2 character state code, if applicable
    "state": "CA",

    // string, the postal code
    "postal code": "94107",

    // float, latitude
    "latitude": 37.7817529521,

    // float, longitude
    "longitude": -122.39612197,

    // float, star rating, rounded to half-stars
    "stars": 4.5,

    // integer, number of reviews
    "review_count": 1198,

    // integer, 0 or 1 for closed or open, respectively
    "is_open": 1,

    // object, business attributes to values. note: some attribute values might be objects
    "attributes": {
        "RestaurantsTakeOut": true,
        "BusinessParking": {
            "garage": false,
            "street": true,
            "validated": false,
            "lot": false,
            "valet": false
        },
    },

    // an array of strings of business categories
    "categories": [
        "Mexican",
        "Burgers",
        "Gastropubs"
    ],

    // an object of key day to value hours, hours are using a 24hr clock
    "hours": {
        "Monday": "10:00-21:00",
        "Tuesday": "10:00-21:00",
        "Friday": "10:00-21:00",
        "Wednesday": "10:00-21:00",
        "Thursday": "10:00-21:00",
        "Sunday": "11:00-18:00",
        "Saturday": "10:00-21:00"
    }
}
```

### review.json

Contains full review text data including the user_id that wrote the review and the business_id the review is written for.

```json
{
    // string, 22 character unique review id
    "review_id": "zdSx_SD6obEhz9VrW9uAWA",

    // string, 22 character unique user id, maps to the user in user.json
    "user_id": "Ha3iJu77CxlrFm-vQRs_8g",

    // string, 22 character business id, maps to business in business.json
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // integer, star rating
    "stars": 4,

    // string, date formatted YYYY-MM-DD
    "date": "2016-03-09",

    // string, the review itself
    "text": "Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks.",

    // integer, number of useful votes received
    "useful": 0,

    // integer, number of funny votes received
    "funny": 0,

    // integer, number of cool votes received
    "cool": 0
}
```

### user.json

User data including the user's friend mapping and all the metadata associated with the user.

```json
{
    // string, 22 character unique user id, maps to the user in user.json
    "user_id": "Ha3iJu77CxlrFm-vQRs_8g",

    // string, the user's first name
    "name": "Sebastien",

    // integer, the number of reviews they've written
    "review_count": 56,

    // string, when the user joined Yelp, formatted like YYYY-MM-DD
    "yelping_since": "2011-01-01",

    // array of strings, an array of the user's friend as user_ids
    "friends": [
        "wqoXYLWmpkEH0YvTmHBsJQ",
        "KUXLLiJGrjtSsapmxmpvTA",
        "6e9rJKQC3n0RSKyHLViL-Q"
    ],

    // integer, number of useful votes sent by the user
    "useful": 21,

    // integer, number of funny votes sent by the user
    "funny": 88,

    // integer, number of cool votes sent by the user
    "cool": 15,

    // integer, number of fans the user has
    "fans": 1032,

    // array of integers, the years the user was elite
    "elite": [
        2012,
        2013
    ],

    // float, average rating of all reviews
    "average_stars": 4.31,

    // integer, number of hot compliments received by the user
    "compliment_hot": 339,

    // integer, number of more compliments received by the user
    "compliment_more": 668,

    // integer, number of profile compliments received by the user
    "compliment_profile": 42,

    // integer, number of cute compliments received by the user
    "compliment_cute": 62,

    // integer, number of list compliments received by the user
    "compliment_list": 37,

    // integer, number of note compliments received by the user
    "compliment_note": 356,

    // integer, number of plain compliments received by the user
    "compliment_plain": 68,

    // integer, number of cool compliments received by the user
    "compliment_cool": 91,

    // integer, number of funny compliments received by the user
    "compliment_funny": 99,

    // integer, number of writer compliments received by the user
    "compliment_writer": 95,

    // integer, number of photo compliments received by the user
    "compliment_photos": 50
}
```

### checkin.json

Checkins on a business.

```json
{
	// string, 22 character business id, maps to business in business.json
	"business_id": "tnhfDv5Il8EaGSXZGiuQGg",
	// string which is a comma-separated list of timestamps for each checkin, each with format YYYY-MM-DD HH:MM:SS
	"date": "2016-04-26 19:49:16, 2016-08-30 18:36:57, 2016-10-15 02:45:18"
}
```

### tip.json

Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.

```json
{
    // string, text of the tip
    "text": "Secret menu - fried chicken sando is da bombbbbbb Their zapatos are good too.",

    // string, when the tip was written, formatted like YYYY-MM-DD
    "date": "2013-09-20",

    // integer, how many compliments it has
    "compliment_count": 172,

    // string, 22 character business id, maps to business in business.json
    "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

    // string, 22 character unique user id, maps to the user in user.json
    "user_id": "49JhAJh8vSQ-vM4Aourl0g"
}
```

### photo.json

Contains photo data including the caption and classification (one of "food", "drink", "menu", "inside" or "outside").

```json
{
    // string, 22 character unique photo id
    "photo_id": "_nN_DhLXkfwEkwPNxne9hw",
    // string, 22 character business id, maps to business in business.json
    "business_id" : "tnhfDv5Il8EaGSXZGiuQGg",
    // string, the photo caption, if any
    "caption" : "carne asada fries",
    // string, the category the photo belongs to, if any
    "label" : "food"
}
```


In [2]:
# prompt: Import the appropriate Python libraries to do a thorough exploratory data analysis (EDA) and visualization (no ML yet) on the Yelp Dataset (businesses and reviews data spread across several json files). We should be able to access JSON files, convert some data to pandas dataframes and do some basic preprocessing and plotting

import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt


In [9]:
# prompt: Open "/content/yelp_dataset/yelp_academic_dataset_business.json", convert its content to a pd dataframe and do some basic EDA on it

df_businesses = pd.read_json('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_academic_dataset_business.json', lines=True)

df_businesses.info()

df_businesses.describe()

df_businesses.head()

df_businesses.tail()

df_businesses.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [10]:
# prompt: check for missing values

df_businesses.isnull().sum()


business_id         0
name                0
address             0
city                0
state               0
postal_code         0
latitude            0
longitude           0
stars               0
review_count        0
is_open             0
attributes      13744
categories        103
hours           23223
dtype: int64

In [11]:
# prompt: What are the contents of the 'categories' column ?

df_businesses['categories'].unique()


array(['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists',
       'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
       'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores',
       ...,
       'Shopping, Jewelry, Piercing, Toy Stores, Beauty & Spas, Accessories, Fashion',
       'Fitness/Exercise Equipment, Eyewear & Opticians, Shopping, Sporting Goods, Bikes',
       'Beauty & Spas, Permanent Makeup, Piercing, Tattoo'], dtype=object)

In [12]:
# prompt: Create a 'df_restaurants' dataframe containing only businesses whose category is exactly equal to the string "Restaurants" and do some basic EDA on that dataframe

df_restaurants = df_businesses[df_businesses['categories'] == 'Restaurants']

df_restaurants.info()

df_restaurants.describe()

df_restaurants.head()

df_restaurants.tail()

df_restaurants.columns

df_restaurants.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 114 entries, 3655 to 147109
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   114 non-null    object 
 1   name          114 non-null    object 
 2   address       114 non-null    object 
 3   city          114 non-null    object 
 4   state         114 non-null    object 
 5   postal_code   114 non-null    object 
 6   latitude      114 non-null    float64
 7   longitude     114 non-null    float64
 8   stars         114 non-null    float64
 9   review_count  114 non-null    int64  
 10  is_open       114 non-null    int64  
 11  attributes    109 non-null    object 
 12  categories    114 non-null    object 
 13  hours         11 non-null     object 
dtypes: float64(3), int64(2), object(9)
memory usage: 13.4+ KB


business_id       0
name              0
address           0
city              0
state             0
postal_code       0
latitude          0
longitude         0
stars             0
review_count      0
is_open           0
attributes        5
categories        0
hours           103
dtype: int64

In [13]:
# prompt: Save the new dataframe for later use (since our subsequent data analysis will only focus on restaurants)

df_restaurants.to_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_restaurants.csv', index=False)


In [5]:
# prompt: create df_restaurants from the csv file at that location

df_restaurants = pd.read_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_restaurants.csv')


## Yelp Dataset Photos


### photo.json

Duplicate of photo.json from the [main dataset](/dataset/documentation/main).

```json
{
  // string, 22 character unique photo id
  "photo_id": "_nN_DhLXkfwEkwPNxne9hw",

  // string, 22 character business id, maps to business in business.json
  "business_id": "tnhfDv5Il8EaGSXZGiuQGg",

  // string, the photo caption, if any
  "caption": "carne asada fries",

  // string, the category the photo belongs to, if any
  "label": "food"
}
```

### photos/

Directory containing the photos themselves as "{photo_id}.jpg".

In [16]:
# prompt: Create a dataframe from "photos.json" at  "/content/photos.json" and do some basic EDA on it

df_photos = pd.read_json('/content/photos.json', lines=True)

df_photos.info()

df_photos.describe()

df_photos.head()

df_photos.tail()

df_photos.columns

df_photos.isnull().sum()


df_photos['caption'].unique()


df_photos['label'].unique()


df_photos.to_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_photos.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200100 entries, 0 to 200099
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   photo_id     200100 non-null  object
 1   business_id  200100 non-null  object
 2   caption      200100 non-null  object
 3   label        200100 non-null  object
dtypes: object(4)
memory usage: 6.1+ MB


In [14]:
# prompt: Write a script that creates df_restaurants from "/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_restaurants.csv" if the dataframe doesn't already exist

if not 'df_restaurants' in globals():
    df_restaurants = pd.read_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_restaurants.csv')


In [9]:
# prompt: Write a script that creates df_photos from "/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_photos.csv" if the dataframe doesn't already exist

if not 'df_photos' in globals():
  df_photos = pd.read_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_photos.csv', low_memory=False)


ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [339]:
# prompt: Display the photos associated with the 10 first restaurants in that dataframe captioned by their respective names (use ipywidgets' image components to achieve that). Each photo is stored at "/content/photos/{photo_id}.jpg". We'll also make a join of df_restaurants and df_photos to access restaurant's names. Use image at "/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/imgs/not_found.jpg" when photo_id is NaN

from ipywidgets import Image, HBox, VBox, Layout, Text
import numpy as np


# Merge restaurants and photos

restaurants_photos = df_restaurants.merge(df_photos, how='left', on='business_id')

# Display images
def restaurant_img(restaurant, img_w=100, img_h=100):
    if restaurant['photo_id'] is not np.nan:
        img_path = '/content/photos/' + restaurant['photo_id'] + '.jpg'
    else:
        img_path = '/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/imgs/not_found.jpg'

    with open(img_path, "rb") as f:
            image_bytes = f.read()
            img_widget = Image(value=image_bytes,
                    format='png',
                    )
            img_widget.layout.width = f"{img_w}px"
            img_widget.layout.height = f"{img_h}px"
            return img_widget

for restaurant_index, restaurant in restaurants_photos.head(5).iterrows():
    img = restaurant_img(restaurant)

    vbox = VBox([img, Text(restaurant['name'])])
    display(vbox)


VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00\…

VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00\…

VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\n…

VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\n…

VBox(children=(Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C…

In [42]:
# prompt: Save the dataframe "restaurants_photos" at the usual location

restaurants_photos.to_csv('/content/drive/MyDrive/recommender_dataset/Yelp/yelp_dataset/yelp_restaurants_photos.csv', index=False)


In [44]:
# prompt: Count proportion of missing photos in 'restaurants_photos'

restaurants_photos['photo_id'].isnull().sum() / restaurants_photos.shape[0]


0.8225806451612904

Most restaurant photos are missing from the dataset. Fortunately, images won't be necessary to train our recommender system

In [343]:
# prompt: Copy photos whose id appear in restaurants_photos (and aren't NaN) from "content/photos/{photo_id}.jpg" to "/content/drive/MyDrive/recommender_dataset/Yelp/photos/{photo_id}.jpg" in a parallel and asynchronous fashion

import concurrent.futures as cf
import os

def copy_file(photo_id):
  src_path = os.path.join('/content/photos', f'{photo_id}.jpg')
  dst_path = os.path.join('/content/drive/MyDrive/recommender_dataset/Yelp/photos', f'{photo_id}.jpg')
  os.system(f'cp {src_path} {dst_path}')

with cf.ThreadPoolExecutor() as executor:
  photo_ids = restaurants_photos['photo_id'][restaurants_photos['photo_id'].notnull()].values
  executor.map(copy_file, photo_ids)


In [None]:
# !pip install ipyleaflet bqplot

## Simple star rating component:

In [310]:
def star_rating_static(value=0, num_reviews=0):
  rating_html = ''.join([f"""<span style="color: {'#f90' if i < value else '#ccc'};">&#9733</span>""" for i in range(5)])
  return HTML(f"<h2 style='width: inherit; overflow-wrap: break-word;'>{rating_html} ({num_reviews} reviews)</h2>")

In [197]:
# from ipywidgets import HTML
# import uuid  # For generating unique IDs in order to scope the styles and scripts

# def star_rating(value=0, readonly=False):
#   """Creates a parameterized HTML template string for a star rating widget."""

#   value = int(value)

#   unique_id = str(uuid.uuid4())  # Generate a unique ID for this component

#   stars = ""
#   for i in range(5):
#     stars += f"""
#     <input type="radio" id="{unique_id}_{5 - i}-stars" name="{unique_id}_rating" value="{5 - i}" />
#     <label for="{unique_id}_{5 - i}-stars" class="star">&#9733;</label>
#     """

#   script_lines = [
#   f"document.getElementById('{unique_id}_{value}-stars').click();"
#   ]
#   if readonly:
#     script_lines.append(f"document.getElementById('{unique_id}_{value}-stars').setAttribute('disabled', true);")

#   script = "\n".join(script_lines)

#   return HTML(f"""
#   <style>
#   .star-rating-{unique_id} {{  /* Scope the CSS styles */
#     border: solid 1px #ccc;
#     display: flex;
#     flex-direction: row-reverse;
#     font-size: 1.5em;
#     justify-content: space-around;
#     padding: 0 .2em;
#     text-align: center;
#     width: 5em;
#   }}

#   .star-rating-{unique_id} input:checked ~ label {{  /* Target checked stars within the scope */
#     color: #f90;
#   }}

#   .star-rating-{unique_id} input {{
#     display: none;
#   }}

#   .star-rating-{unique_id} label {{
#     color: #ccc;
#     cursor: pointer;
#   }}

#   .star-rating-{unique_id} label:hover,
#   .star-rating-{unique_id} label:hover ~ label {{
#     color: #fc0;
#   }}
#   </style>

#   <div class="star-rating-{unique_id}">
#   {stars}
#   </div>
#   <script>
#   {script}
#   </script>
#   """)


In [175]:
restaurants_photos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124 entries, 0 to 123
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   124 non-null    object 
 1   name          124 non-null    object 
 2   address       124 non-null    object 
 3   city          124 non-null    object 
 4   state         124 non-null    object 
 5   postal_code   124 non-null    object 
 6   latitude      124 non-null    float64
 7   longitude     124 non-null    float64
 8   stars         124 non-null    float64
 9   review_count  124 non-null    int64  
 10  is_open       124 non-null    int64  
 11  attributes    119 non-null    object 
 12  categories    124 non-null    object 
 13  hours         19 non-null     object 
 14  photo_id      22 non-null     object 
 15  caption       22 non-null     object 
 16  label         22 non-null     object 
dtypes: float64(3), int64(2), object(12)
memory usage: 17.4+ KB


In [342]:
# prompt: use ipyleaflet to display the restaurants on a map

from ipywidgets import HTML, Layout, TwoByTwoLayout, GridspecLayout

from ipyleaflet import Map, Marker, Popup, LayerGroup, SearchControl, ScaleControl, AwesomeIcon

def get_restaurant_popup(restaurant, img_w=125, img_h=200):
  """Returns an HTML string template for a popup displaying the most relevant attributes of a restaurant."""

  thumbnail = restaurant_img(restaurant, img_w=img_w, img_h=img_h)
  description = HTML(f"""
  <div style="width: 125px; overflow-wrap: break-word;">
    <h2>{restaurant['name']}</h2>
    <br>
    <b>{restaurant['address']}, {restaurant['city']}</b>
    <br>
     {'(Open)' if restaurant['is_open'] else '(Closed)'}
  </div>
  """)
  rating = star_rating_static(value=restaurant['stars'], num_reviews=restaurant['review_count'])

  grid = GridspecLayout(6, 6, width='500px',height='300px')
  grid[:3, 2:] = description
  grid[:3, :2] = thumbnail
  grid[4, :] = rating

  return grid

# Create a map
m = Map(center=(39.99395569397331, -75.13861180903687), zoom=8, layout=Layout(width='100%', height='600px'))

marker = Marker(icon=AwesomeIcon(name="check", marker_color='green', icon_color='darkgreen'))

m.add(SearchControl(
  position="topleft",
  url='https://nominatim.openstreetmap.org/search?format=json&q={s}',
  zoom=5,
  marker=marker
))

m.add(ScaleControl(position='bottomleft'))


# Add a layer for the restaurants
restaurants_layer = LayerGroup()
for restaurant in restaurants_photos.head(10).iterrows():
  restaurant_location = (restaurant[1]['latitude'], restaurant[1]['longitude'])
  restaurant_marker = Marker(location=restaurant_location, draggable=False)
  restaurant_marker.popup = get_restaurant_popup(restaurant[1], img_w=200)

  restaurants_layer.add_layer(restaurant_marker)

# Add the layer to the map
m.add_layer(restaurants_layer)

# Display the map
display(m)


Map(center=[39.99395569397331, -75.13861180903687], controls=(ZoomControl(options=['position', 'zoom_in_text',…

In [52]:
# prompt: For the futur purpose of displaying popups on the map, create an HTML string template for a popup displaying the most relevant attributes of each restaurant from 'restaurants_photos'




9.0