In [None]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from operator import itemgetter
from pprint import pprint
#import unidecode
import requests

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy import distance

# 1) Create scraping functions

### The code below builds two dataframes for our Data Analytics submission
#### The first function scrapes restaurant features such as the user rating, the number of ratings, the address and a few keywords specified on netpincer
#### The second function collects information on a restaurant's products and their corresponding prices as advertised on the site

In [None]:
link_base = 'https://www.netpincer.hu/'


#create empty lists+dataframes to enrich with user defined functions
link_list=[]
product_df = pd.DataFrame(columns = ['Restaurant', 'Product_Name', 'Price']) 
restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                       , 'Feature2','Feature3', 'Feature4', 'Feature5']) 

In [None]:
#Parse netpincer to get list of html links for restaurants available at BASE_URL

def generate_restaurant_links(URL):
    global link_base
    global link_list
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 
    restaurants = soup.find('ul', {'class':'vendor-list'}).findAll('a')
    link_list.extend([link_base+link.get('href') for link in restaurants])

In [None]:
def restaurant_scraper(URL):
    
    global restaurant_df
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 

    #Getting name of the restaurant
    restaurant = soup.find('div',{'class':'vendor-info-main-headline item'}).getText().replace('\n','')

    #Getting features of restaurant
    features = soup.find('ul',{'class':'vendor-info-main-details-cuisines'}).findAll('li')
    features_list = [values.getText() for values in features[1:]]
    #Adding nan values if feature is missing - the max amount of restaurant features are 5
    features_list.extend((5- len(features_list)) * [np.nan])

    #Getting ratings of restaurants if available - error handling was needed and to replace values with NaNs 
    try:
        ratings = soup.find('div',{'class':'ratings-component'}).findAll('span')
        ratings_list = [values.getText().split()for values in ratings[1:]]
            #Splitting out user rating out of how many
        ratings_list = [nums[0].split('/') for nums in ratings_list]
            #Breaking out lists in list
        ratings_list = [[item] for sub_list in ratings_list for item in sub_list]
    except:
        ratings_list = [[np.NaN],[np.NaN],[np.NaN]]

    #Getting the address of a Restaurant
    address = soup.find('p',{'class':'vendor-location'}).getText()

    #Putting all of the above in a dictionary
    dict_for_restaurant_df = {'Restaurant': restaurant, 'User-Rating':ratings_list[0][0],  'No-Ratings': ratings_list[2][0]
                              , 'Address' : address, 'Feature1': features_list[0], 'Feature2': features_list[1]
                              ,'Feature3': features_list[2], 'Feature4' : features_list[3],'Feature5': features_list[4]}

    #Add to restaurant_df
    restaurant_df = restaurant_df.append(dict_for_restaurant_df, ignore_index = True)

In [None]:
def product_scraper (URL): 
    global product_df
    prices = []
    
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, "html.parser") 

    #Getting name of the restaurant
    restaurant = soup.find('div',{'class':'vendor-info-main-headline item'}).getText().replace('\n','')
    
    #Getting name of product and prices - fortunately both are spans
    product_names = soup.find('div', {'class':'menu__items'}).findAll('h3',{'class':'dish-name fn p-name'})
    product_names = [name.getText().replace('\n','') for name in product_names]
    
    #Getting price of a product - certain formattings are required as many unused characters were stored in spans
    prices_temp = soup.find('div', {'class':'menu__items'}).findAll('span',{'class':'price p-price'})
    
    for price in prices_temp:
        price = price.getText()
        price = price.replace("\n","").replace(" ","").replace("innen","").replace("\xa0","").split('Ft', 1)[0]
        prices.append(price)
   
    #Creating restaurant list which should be as long as other lists that we created for products
    restaurant_name = [restaurant for i in range(len(product_names))]

    #Put vectors into pandas dataframe    
    dict_for_df = {'Restaurant': restaurant_name, 'Product_Name': product_names, 'Price': prices}  
    df = pd.DataFrame(dict_for_df)
    
    product_df = product_df.append(df, ignore_index = True)

# 2) Scrape restaurants

### We scrape the data and structure them into tidy data tables
####  We first decide on the population of restaurants to include - and their netpincer URLs:
##### - Pizza places that deliver to CEU
##### - Pizza places that can deliver to the city centers of the top five Hungarian cities (excluding Budapest): Debrecen, Szeged, Miskolc, Pécs, Győr (note: the `cities` dictionary below does not yet contain a URL for Győr)
#### Then we run both the restaurant and the product scrapers

#### 2.1) Let's start with the restaurant scraper

In [None]:
final_restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                   , 'Feature2','Feature3', 'Feature4', 'Feature5','City']) 

In [None]:
BP_URL = 'https://www.netpincer.hu/restaurants/new?lat=47.501185&lng=19.049364&vertical=restaurants&cuisines=52'
DB_URL = 'https://www.netpincer.hu/restaurants/new?lat=47.5313352&lng=21.624532&vertical=restaurants&cuisines=52'
SZG_URL = 'https://www.netpincer.hu/restaurants/new?lat=46.254233&lng=20.1493499&vertical=restaurants&cuisines=52'
MS_URL = 'https://www.netpincer.hu/restaurants/new?lat=48.10137599999999&lng=20.7306244&vertical=restaurants&cuisines=52'
PCS_URL = 'https://www.netpincer.hu/restaurants/new?lat=46.07605239999999&lng=18.2282426&vertical=restaurants&cuisines=52'

cities = {'Budapest':BP_URL,'Debrecen':DB_URL,'Szeged':SZG_URL,'Miskolc': MS_URL,'Pécs': PCS_URL}

In [None]:
for city in cities:
    link_list = []
    restaurant_df = pd.DataFrame(columns = ['Restaurant', 'User-Rating',  'No-Ratings', 'Address', 'Feature1'
                                           , 'Feature2','Feature3', 'Feature4', 'Feature5']) 

    generate_restaurant_links(cities[city])
    for link in link_list:
        restaurant_scraper(link)
        print(link+" is ready")

    city_list = len(restaurant_df)*[city]
    restaurant_df["City"] = np.array(city_list)

    final_restaurant_df = final_restaurant_df.append(restaurant_df,ignore_index=True)

In [None]:
#final_restaurant_df.to_csv(r'C:\Users\T450s\Python_directory\all_restaurants.csv', 
#                           index = True, sep=',', encoding='utf-8')

In [None]:
#final_restaurant_df = pd.read_csv(r'C:\Users\T450s\Python_directory\all_restaurants.csv', index_col=0)

In [None]:
#Calculating distance

In [None]:
#Concatenating address + city for geolocator input
dist_restaurants_df = final_restaurant_df
dist_restaurants_df['ProperAddress'] = dist_restaurants_df['City'] + ' ' + dist_restaurants_df['Address']

In [None]:
# Getting city center locations #TODO: store in list
locator = Nominatim(user_agent='ba')
location_bud = locator.geocode('Budapest')
location_deb = locator.geocode('Debrecen')
location_szg = locator.geocode('Szeged')
location_ms = locator.geocode('Miskolc')
location_pcs = locator.geocode('Pécs')

In [None]:
# RateLimiter must be used for Nominatim
geocoder = RateLimiter(Nominatim(user_agent='ba').geocode, min_delay_seconds=1)
dist_restaurants_df['Location'] = (dist_restaurants_df['ProperAddress']).apply(geocoder)

# add restaurant latitude and longitude to dataframe
dist_restaurants_df['RestaurantLatitude'] = dist_restaurants_df['Location'].apply(lambda loc: loc.latitude if loc else None)
dist_restaurants_df['RestaurantLongitude'] = dist_restaurants_df['Location'].apply(lambda loc: loc.longitude if loc else None)

#add city center latitude and logitude to dataframe
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Budapest', 'CenterLatitude'] = location_bud.latitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Budapest', 'CenterLongitude'] = location_bud.longitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Debrecen', 'CenterLatitude'] = location_deb.latitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Debrecen', 'CenterLongitude'] = location_deb.longitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Szeged', 'CenterLatitude'] = location_szg.latitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Szeged', 'CenterLongitude'] = location_szg.longitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Miskolc', 'CenterLatitude'] = location_ms.latitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Miskolc', 'CenterLongitude'] = location_ms.longitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Pécs', 'CenterLatitude'] = location_pcs.latitude
dist_restaurants_df.loc[dist_restaurants_df['City'] == 'Pécs', 'CenterLongitude'] = location_pcs.longitude
#TODO use loop

In [None]:
#calculate distance
#drop rows with no proper location
dist_restaurants_df = dist_restaurants_df[dist_restaurants_df['RestaurantLatitude'].notna()]
dist_restaurants_df = dist_restaurants_df[dist_restaurants_df['CenterLatitude'].notna()]
dist_restaurants_df['distance'] = dist_restaurants_df.apply(lambda row: distance.distance((row.RestaurantLatitude, row.RestaurantLongitude), (row.CenterLatitude, row.CenterLongitude)), axis=1 if distance.distance else None)
dist_restaurants_df

In [None]:
#dist_restaurants_df.to_csv(r'/Users/utassydv/Downloads/all_restaurants_w_dist.csv', 
#                           index = True, sep=',', encoding='utf-8')

### 2.2) And now the product scraper

In [None]:
product_df = pd.DataFrame(columns = ['Restaurant', 'Product_Name', 'Price'])

In [None]:
for city in cities:
    link_list = []
    
    generate_restaurant_links(cities[city])
    for link in link_list:
        product_scraper(link)
        print(link+" is ready")

In [None]:
#product_df.to_csv(r'C:\Users\T450s\Python_directory\all_products.csv', 
#                           index = True, sep=',', encoding='utf-8')