In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from operator import itemgetter
from pprint import pprint
import requests

# 1) Create scraping functions

### The code below builds two dataframes for our Data Analytics submission
#### The first scraper (`restaurant_scraper`) collects restaurant features such as user rating, number of ratings, address and a few keywords listed on netpincer
#### The second scraper (`product_scraper`) collects information on a restaurant's products and their corresponding prices as advertised on the site

In [2]:
link_base = 'https://www.netpincer.hu/'


#create empty lists+dataframes to enrich with user defined functions
link_list=[]
final_df = pd.DataFrame(columns = ['Restaurant', 'Product_Name', 'Price']) 
restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                       , 'Feature2','Feature3', 'Feature4', 'Feature5']) 

In [3]:
#Parse a netpincer listing page (the URL passed in) and collect the links of the restaurants it shows

def generate_restaurant_links(URL):
    global link_base
    global link_list
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 
    restaurants = soup.find('ul', {'class':'vendor-list'}).findAll('a')
    link_list.extend([link_base+link.get('href') for link in restaurants])

In [4]:
def restaurant_scraper(URL):
    
    global restaurant_df
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 

    #Getting name of the restaurant
    restaurant = soup.find('div',{'class':'vendor-info-main-headline item'}).getText()

    #Getting features of restaurant
    features = soup.find('ul',{'class':'vendor-info-main-details-cuisines'}).findAll('li')
    features_list = [values.getText() for values in features[1:]]
    #Adding nan values if feature is missing - the max amount of restaurant features are 5
    features_list.extend((5- len(features_list)) * [np.nan])

    #Getting ratings of restaurants if available - error handling was needed and to replace values with NaNs 
    try:
        ratings = soup.find('div',{'class':'ratings-component'}).findAll('span')
        ratings_list = [values.getText().split()for values in ratings[1:]]
            #Splitting out user rating out of how many
        ratings_list = [nums[0].split('/') for nums in ratings_list]
            #Breaking out lists in list
        ratings_list = [[item] for sub_list in ratings_list for item in sub_list]
    except:
        ratings_list = [[np.NaN],[np.NaN],[np.NaN]]

    #Getting the address of a Restaurant
    address = soup.find('p',{'class':'vendor-location'}).getText()

    #Putting all of the above in a dictionary
    dict_for_restaurant_df = {'Restaurant': restaurant, 'User-Rating':ratings_list[0][0],  'No-Ratings': ratings_list[2][0]
                              , 'Address' : address, 'Feature1': features_list[0], 'Feature2': features_list[1]
                              ,'Feature3': features_list[2], 'Feature4' : features_list[3],'Feature5': features_list[4]}

    #Add to restaurant_df
    restaurant_df = restaurant_df.append(dict_for_restaurant_df, ignore_index = True)

In [5]:
def product_scraper (URL): 
    global final_df
    prices = []
    
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 

    #Getting name of the restaurant
    restaurant = soup.find('div',{'class':'vendor-info-main-headline item'}).getText()

    #Getting name of product and prices - fortunately both are spans
    smaller_soup = soup.find('div', {'class':'menu__items'}).findAll('span')
    temp_list= [elem.getText() for elem in smaller_soup]

    #Name will be every first, price will be every second element in smaller soup
    product_names = temp_list[::2]
    prices_temp = temp_list[1::2]
    restaurant_name = [restaurant for i in range(len(product_names))]

    #Price requires further alterations
    for elem in prices_temp:
        temp_elem = elem.split()
        prices.append("".join(temp_elem))

    #Put vectors into pandas dataframe    
    dict_for_df = {'Restaurant': restaurant_name, 'Product_Name': product_names, 'Price': prices}  
    df = pd.DataFrame(dict_for_df)
    
    final_df = final_df.append(df)

# 2) Scrape restaurants

### We scrape the data and structure it into tidy data tables
####  We first decide on the population of restaurants to include - and their netpincer URLs:
##### - Pizza places that deliver to CEU
##### - Pizza places that can deliver to the city centers of the top five Hungarian cities (excluding Budapest): Debrecen, Szeged, Miskolc, Pecs, Gyor
#### Then we run both restaurant and product scrapers

In [23]:
final_restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                   , 'Feature2','Feature3', 'Feature4', 'Feature5','City']) 

In [67]:
BP_URL = 'https://www.netpincer.hu/restaurants/new?lat=47.501185&lng=19.049364&vertical=restaurants&cuisines=52'
DB_URL = 'https://www.netpincer.hu/restaurants/new?lat=47.5313352&lng=21.624532&vertical=restaurants&cuisines=52'
SZG_URL = 'https://www.netpincer.hu/restaurants/new?lat=46.254233&lng=20.1493499&vertical=restaurants&cuisines=52'
MS_URL = 'https://www.netpincer.hu/restaurants/new?lat=48.10137599999999&lng=20.7306244&vertical=restaurants&cuisines=52'
PCS_URL = 'https://www.netpincer.hu/restaurants/new?lat=46.07605239999999&lng=18.2282426&vertical=restaurants&cuisines=52'

cities = {'Budapest':BP_URL,'Debrecen':DB_URL,'Szeged':SZG_URL,'Miskolc': MS_URL,'Pécs': PCS_URL}

In [63]:
for city in cities:
    link_list = []
    restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                           , 'Feature2','Feature3', 'Feature4', 'Feature5']) 

    generate_restaurant_links(cities[city])
    for link in link_list:
        restaurant_scraper(link)
        print(link+" is ready")

    city_list = len(restaurant_df)*[city]
    restaurant_df["City"] = np.array(city_list)

    final_restaurant_df2 = final_restaurant_df2.append(restaurant_df,ignore_index=True)

https://www.netpincer.hu//restaurant/y4py/salt-and-pepper is ready
https://www.netpincer.hu//restaurant/f8vc/happyhot-pizza-debrecen is ready
https://www.netpincer.hu//restaurant/i1ff/piedone-pizzeria is ready
https://www.netpincer.hu//restaurant/r3ud/casanova-pizza is ready
https://www.netpincer.hu//restaurant/d9un/station-bistro is ready
https://www.netpincer.hu//restaurant/y4br/pomodoro-pizza is ready
https://www.netpincer.hu//restaurant/g2li/bellozzo-debrecen is ready
https://www.netpincer.hu//restaurant/a9rh/pizza-holiday-debrecen is ready
https://www.netpincer.hu//restaurant/g7rq/kiraly-pizza-debrecen is ready
https://www.netpincer.hu//restaurant/t1uf/dirty-dogs is ready
https://www.netpincer.hu//restaurant/c2zn/corleone-ristorante is ready
https://www.netpincer.hu//restaurant/r9qt/pop-art-cafe is ready
https://www.netpincer.hu//restaurant/e1bp/doner-kebab-pizzeria is ready
https://www.netpincer.hu//restaurant/c3qp/manna-etterem is ready
https://www.netpincer.hu//restaurant/a7px/

In [41]:
#final_restaurant_df.to_csv(r'C:\Users\T450s\Python_directory\all_restaurants.csv', 
#                           index = True, sep=',', encoding='utf-8')

In [73]:
#final_restaurant_df = pd.read_csv(r'C:\Users\T450s\Python_directory\all_restaurants.csv', index_col=0)