In [1]:
# Import dependencies
import pandas as pd
import re
import time
import pymongo
from bs4 import BeautifulSoup
from requests import get
from random import randint
from pymongo import MongoClient

In [2]:
# --- EXTRACT ---

# Accumulates one dict per scraped restaurant (keys: name, stars, price, cuisine).
restaurant_list = []

# Yelp search for restaurants in West Hollywood, CA; the result offset is appended to this URL.
url = "https://www.yelp.com/search?find_desc=Restaurants&find_loc=West%20Hollywood%2C%20CA&start="

# CSS-class patterns, compiled once up front rather than on every loop iteration.
# NOTE: in a regex a trailing "*" only applies to the preceding character
# (e.g. "list*" means "lis" followed by zero or more "t"). The patterns are kept
# exactly as they were so that the same elements are selected.
LIST_CLASS = re.compile("list*")
BORDER_CLASS = re.compile("border*")
CONTAINER_CLASS = re.compile("container*")
NAME_CLASS = re.compile("businessNam*")
STARS_CLASS = re.compile("i-stars*")
PRICE_CLASS = re.compile("priceRange*")
CUISINE_CLASS = re.compile("text__09f24__2NHRu css-1hx6l2b*")

# Seconds to wait for Yelp before giving up; without a timeout a stalled
# connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30


def _first_or_none(elements, extract):
    """Return ``extract(elements[0])``, or None when the data is missing.

    Only the errors that signal "not present in the HTML" are handled:
    IndexError (no matching element), AttributeError (a nested lookup returned
    None) and KeyError (the tag lacks the requested attribute). Anything else
    propagates instead of being silently swallowed.
    """
    try:
        return extract(elements[0])
    except (IndexError, AttributeError, KeyError):
        return None


# Walk the result offsets 0, 10, ..., 100 (11 result pages) and pull the
# restaurant name, cuisine style, price and rating out of each listing.
for page in range(0, 110, 10):
    response = get(url + str(page), timeout=REQUEST_TIMEOUT)

    if response.ok:
        soup = BeautifulSoup(response.text, "html.parser")

        for r_ul in soup.find_all('ul', class_=LIST_CLASS):
            for restaurant in r_ul.find_all('li', class_=BORDER_CLASS):
                # Keep only <li> items holding exactly three "container" divs
                # (presumably the real listings rather than ads/headers - confirm).
                if len(restaurant.find_all('div', class_=CONTAINER_CLASS)) != 3:
                    continue

                restaurant_list.append({
                    'name': _first_or_none(
                        restaurant.find_all('div', class_=NAME_CLASS),
                        lambda tag: tag.find('a').text,
                    ),
                    'stars': _first_or_none(
                        restaurant.find_all('div', class_=STARS_CLASS),
                        lambda tag: tag["aria-label"],
                    ),
                    'price': _first_or_none(
                        restaurant.find_all('span', class_=PRICE_CLASS),
                        lambda tag: tag.text,
                    ),
                    'cuisine': _first_or_none(
                        restaurant.find_all('p', class_=CUISINE_CLASS),
                        lambda tag: tag.text,
                    ),
                })
    else:
        # Make a blocked or failed request visible instead of silently yielding no rows.
        print(f"Skipping results at offset {page}: HTTP {response.status_code}")

    # Random pause between requests to avoid hammering the site.
    time.sleep(randint(1, 5))

# Displays the extracted list
restaurant_list

[{'name': 'GRANVILLE',
  'stars': '4.5 star rating',
  'price': '$$',
  'cuisine': 'American (New)'},
 {'name': 'Soulmate.',
  'stars': '4 star rating',
  'price': '$$$',
  'cuisine': 'Mediterranean'},
 {'name': 'The Front Yard',
  'stars': '4 star rating',
  'price': '$$',
  'cuisine': 'Tapas/Small Plates'},
 {'name': 'Tu Madre - West Hollywood',
  'stars': '4 star rating',
  'price': '$$',
  'cuisine': 'Mexican'},
 {'name': 'Norah',
  'stars': '4 star rating',
  'price': '$$',
  'cuisine': 'American (New)'},
 {'name': 'Employees Only',
  'stars': '4 star rating',
  'price': '$$$',
  'cuisine': 'Cocktail Bars'},
 {'name': 'OSTE',
  'stars': '5 star rating',
  'price': None,
  'cuisine': 'Italian'},
 {'name': 'The Butcher, The Baker, The Cappuccino Maker',
  'stars': '4 star rating',
  'price': '$$',
  'cuisine': 'Breakfast & Brunch'},
 {'name': 'Republique',
  'stars': '4 star rating',
  'price': '$$',
  'cuisine': 'French'},
 {'name': 'Conservatory',
  'stars': '4 star rating',
  'pr

In [3]:
# --- TRANSFORM ---

# Turn the scraped records (one dict per restaurant) into a pandas DataFrame;
# the dict keys become the column names.
restaurant_df = pd.DataFrame(data=restaurant_list)

# Show the raw, uncleaned table
restaurant_df

Unnamed: 0,name,stars,price,cuisine
0,GRANVILLE,4.5 star rating,$$,American (New)
1,Soulmate.,4 star rating,$$$,Mediterranean
2,The Front Yard,4 star rating,$$,Tapas/Small Plates
3,Tu Madre - West Hollywood,4 star rating,$$,Mexican
4,Norah,4 star rating,$$,American (New)
...,...,...,...,...
105,Emilia,4.5 star rating,,Italian
106,Uovo- Mid-Wilshire,4.5 star rating,$$,Italian
107,Beauty & Essex,4 star rating,$$$,Lounges
108,Mama Shelter Rooftop Bar,3.5 star rating,$$,American (New)


In [4]:
# Select the scraped columns. The explicit .copy() makes result_df an independent
# frame, so the column assignments below do not raise SettingWithCopyWarning.
result_df = restaurant_df[["name", "stars", "price", "cuisine"]].copy()

# Drop the 'star rating' text from the rating column so it can be converted to a number
result_df["star rating"] = result_df["stars"].replace("star rating", "", regex=True)

# Translate the '$' price tiers into integers in a single pass.
# Missing prices become NaN; so does any label that is not one of the four tiers.
PRICE_LEVELS = {"$": 1, "$$": 2, "$$$": 3, "$$$$": 4}
result_df["price"] = result_df["price"].map(PRICE_LEVELS)

# Convert the rating column to float so it can be used in calculations
result_df["rating"] = result_df["star rating"].astype(float)

# Create a new dataframe holding only the cleaned columns
result_clean_df = result_df[["name", "cuisine", "price", "rating"]]

# Store the cleaned dataframe as a csv for visualisations and analysis
# (the Resources/ directory must already exist)
result_clean_df.to_csv("Resources/result_clean.csv")

# Display the cleaned dataframe
result_clean_df

Unnamed: 0,name,cuisine,price,rating
0,GRANVILLE,American (New),2.0,4.5
1,Soulmate.,Mediterranean,3.0,4.0
2,The Front Yard,Tapas/Small Plates,2.0,4.0
3,Tu Madre - West Hollywood,Mexican,2.0,4.0
4,Norah,American (New),2.0,4.0
...,...,...,...,...
105,Emilia,Italian,,4.5
106,Uovo- Mid-Wilshire,Italian,2.0,4.5
107,Beauty & Essex,Lounges,3.0,4.0
108,Mama Shelter Rooftop Bar,American (New),2.0,3.5


In [5]:
# --- LOAD ---

# Open a connection to the MongoDB server on localhost
client = MongoClient(host="mongodb://localhost:27017/")

# Handle to the target database
db = client["YELP_Worth_It"]

# Handle to the target collection inside that database
collection = db["CSV Data"]

# Turn the cleaned dataframe into a dict keyed by row index,
# where each value is a {column: value} record for one restaurant
results_dict = result_clean_df.to_dict(orient='index')
results_dict

{0: {'name': 'GRANVILLE',
  'cuisine': 'American (New)',
  'price': 2.0,
  'rating': 4.5},
 1: {'name': 'Soulmate.',
  'cuisine': 'Mediterranean',
  'price': 3.0,
  'rating': 4.0},
 2: {'name': 'The Front Yard',
  'cuisine': 'Tapas/Small Plates',
  'price': 2.0,
  'rating': 4.0},
 3: {'name': 'Tu Madre - West Hollywood',
  'cuisine': 'Mexican',
  'price': 2.0,
  'rating': 4.0},
 4: {'name': 'Norah',
  'cuisine': 'American (New)',
  'price': 2.0,
  'rating': 4.0},
 5: {'name': 'Employees Only',
  'cuisine': 'Cocktail Bars',
  'price': 3.0,
  'rating': 4.0},
 6: {'name': 'OSTE', 'cuisine': 'Italian', 'price': nan, 'rating': 5.0},
 7: {'name': 'The Butcher, The Baker, The Cappuccino Maker',
  'cuisine': 'Breakfast & Brunch',
  'price': 2.0,
  'rating': 4.0},
 8: {'name': 'Republique', 'cuisine': 'French', 'price': 2.0, 'rating': 4.0},
 9: {'name': 'Conservatory',
  'cuisine': 'American (New)',
  'price': 3.0,
  'rating': 4.0},
 10: {'name': 'Rosaline', 'cuisine': 'Peruvian', 'price': 3.0,

In [6]:
# Insert data into MongoDB in a single batch instead of one round trip per row.
# insert_many rejects an empty list, so the call is skipped when there is nothing to load.
documents = list(results_dict.values())
if documents:
    collection.insert_many(documents)