### Importing modules

In [42]:
import json
import os
import re
import warnings
from collections import Counter, OrderedDict
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib3 import disable_warnings
from urllib3.exceptions import InsecureRequestWarning

disable_warnings(InsecureRequestWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None

### Helper function used in the project

In [46]:
def format_date(date_str):
    """Turn a date string such as 'Jun 29, 2022' into a datetime object.

    The text must follow the '%b %d, %Y' pattern (abbreviated month name,
    day of month, four-digit year); anything else makes strptime raise
    ValueError.
    """
    return datetime.strptime(date_str, '%b %d, %Y')

### Web scraping

In [1]:
# Scrape BeerAdvocate into df_scraped, one row per beer:
#   styles index -> sub-style listing pages -> beer page -> brewery page.
# Rows are written to df_scraped as soon as they are parsed, so whatever was
# collected is still available if the cell is interrupted part-way through.
# NOTE: every request uses verify=False, i.e. TLS certificate verification is
# switched off for this scrape.

# dataframe init used for collecting data
df_scraped = pd.DataFrame(columns = ['beer_name', 'name of style', 'name_of_substyle', 'beer_country',
                  'beer_state', 'rank_in_style', 'score', 'rank', 'abv [%]', 'avg_score',
                  'deviation_score [%]', 'ratings', 'active', 'date_added', 'who_wants', 'who_have',
                  'brewery_name', 'brewery_city', 'brewery_state', 'brewery_country', 'brewery_map',
                  'brewery_website', 'brewery_type', 'brewery_adress', 'brewery_postal_code',
                  'brewery_telephone', 'brewery_notes'])
i = 0 #index of the next row written to df_scraped

#obtaining soup for web scraping
web = requests.get('https://www.beeradvocate.com/beer/styles/', verify=False).text
soup = BeautifulSoup(web, 'lxml')

#finding main styles of beers
#the first nine 'stylebreak' divs are skipped - presumably they are not style groups (TODO confirm)
beers = soup.find_all('div', class_ = 'stylebreak')[9:]
beers_list = [] #list with main styles of beers
list_beer_subs_name_sublists = [] #list of lists with substyles of beers

for beer in beers:
    beer_s_name = beer.find('b').text #beer style name
    beers_list.append(beer_s_name)

    #looking for a class with beer substyle
    beer_groups = beer.find_all('li')
    beer_subs_name_sublists = [] #list of beer substyle names
    list_beer_subs_name_sublists.append(beer_subs_name_sublists)
    beer_subs_link_sublists = [] #list of beer substyle links

    for beer_group in beer_groups: #loop through substyles
        beer_subs_name = beer_group.text #name of substyle
        beer_subs_name_sublists.append(beer_subs_name)
        beer_subs_link = 'https://www.beeradvocate.com' + beer_group.a['href'] #substyle link for next step of scraping
        beer_subs_link_sublists.append(beer_subs_link)

        #finding first page of beers in subclass and the links to the remaining pages (start=0, 50, 100, ...)
        subs_web = requests.get(beer_subs_link, verify=False).text
        subs_soup = BeautifulSoup(subs_web, 'lxml')
        subs_rows = subs_soup.find_all('tr') #parsed once, reused for the count and the pager
        subs_number = subs_rows[0].text.split(" ")[-1].replace(')',"") #last token of the first row, ')' removed
        if int(subs_number) > 50:
            subs_last_page_record = subs_rows[1]
            subs_last_page = subs_last_page_record.find_all('a', href=True)[-1] #link to the last page
            subs_last_page_link = subs_last_page['href']
            subs_last_page_number = subs_last_page_link.split("=")[-1] #looking for last number of page
            subs_sub_pages_links = [beer_subs_link + "?sort=revsD&start=" + str(number)
                                    for number in range(0, int(subs_last_page_number) + 50, 50)]
        else:
            #50 entries or fewer: a single listing page
            subs_sub_pages_links = [beer_subs_link + "?sort=revsD&start=0"]

        for sub_page in subs_sub_pages_links: #loop through all subpages for each substyles of beer
            subs_sub_page_web = requests.get(sub_page, verify=False).text #get a specific beer and brewery from substyle
            subs_sub_page_soup = BeautifulSoup(subs_sub_page_web, 'lxml')
            #first three and last two rows are dropped - presumably header / pager rows (TODO confirm)
            subs_records = subs_sub_page_soup.find_all('tr')[3:-2]

            for sub_record in subs_records: #loop through records in a first column of substyle
                subs_lcol = sub_record.find('td', class_ = 'hr_bottom_light')
                subs_lcol_items = subs_lcol.find_all('a', href=True) #obtained data from first columns
                subs_names = [item.text for item in subs_lcol_items]
                subs_links = ['https://www.beeradvocate.com' + item['href'] for item in subs_lcol_items]
                #exactly two anchors are expected per record: the beer and its brewery
                subs_beers_name, subs_brews_name = subs_names
                subs_beers_link, subs_brews_link = subs_links

                #getting soup for certain beer on website
                beer_web = requests.get(subs_beers_link, verify=False).text
                beer_soup = BeautifulSoup(beer_web, 'lxml')
                #named beer_stats so that the outer loop variable `beer` is not overwritten
                beer_stats = beer_soup.find('dl', class_ = 'beerstats')
                beer_params = beer_stats.find_all('dd', class_ = 'beerstats')
                beer_params_text = [item.text.strip() for item in beer_params]
                beer_params_text.pop(8) #discard the 9th entry, leaving the 11 values unpacked below
                (beer_brewery_name, beer_country, beer_style_rank_glued, beer_abv,
                 beer_score_rank_glued, beer_avg_dev_glued, beer_ratings, beer_active,
                 beer_added, beer_wants, beer_gots) = beer_params_text

                if 'Ranked #' in beer_style_rank_glued:
                    beer_style = beer_style_rank_glued.split('Ranked #')[0] #Style of beer
                    beer_rank_in_style = beer_style_rank_glued.split('Ranked #')[1] #Rank of beer in his style
                else:
                    beer_style = beer_style_rank_glued
                    beer_rank_in_style = np.nan
                if 'Ranked #' in beer_score_rank_glued:
                    beer_score = beer_score_rank_glued.split('Ranked #')[0] #Score on the website
                    beer_rank = beer_score_rank_glued.split('Ranked #')[1] #Position in the ranking
                elif "Needs more ratings" in beer_score_rank_glued:
                    beer_score = np.nan
                    beer_rank = np.nan
                else:
                    beer_score = beer_score_rank_glued
                    beer_rank = np.nan
                if ',' in beer_country:
                    #"<state>, <country>": first part is the state, last part the country
                    beer_country_split = beer_country.split(',')
                    beer_country = beer_country_split[-1].strip()
                    beer_state = beer_country_split[0].strip()
                else:
                    beer_state = np.nan
                beer_abv = beer_abv[:-1] #drop the last character (presumably the '%' sign)
                beer_avg = beer_avg_dev_glued.split('| pDev:')[0].strip() #Average score across all ratings
                beer_dev_prc = beer_avg_dev_glued.split('| pDev:')[1].strip()[:-1] #Percentage value of deviation from average score

                #all things scraped for beers
                # subs_beers_name #Name of beer
                # beer_s_name #Name of main style of beer
                # beer_style #Name of substyle of beer
                # beer_brewery_name #Name of brewery which the beer comes from
                # beer_country #Name of country which the beer comes from
                # beer_state #Name of state which the beer comes from
                # beer_rank_in_style
                # beer_score
                # beer_rank
                # beer_abv
                # beer_avg
                # beer_dev_prc
                # beer_ratings #Number of ratings for this beer
                # beer_active #Current activity status
                # beer_added #When the beer was added
                # beer_wants #Number of users who want this beer
                # beer_gots #Number of users who have this beer

                #getting a brewery soup (verify=False like every other request in this cell)
                brewery_web = requests.get(subs_brews_link, verify=False).text
                brewery_soup = BeautifulSoup(brewery_web, 'lxml')
                brewery = brewery_soup.find('div', class_ = 'break')

                #scraping breweries by tag 'a'
                brewery_a_tag = brewery.find_all('a', href=True)
                brewery_a_tag_text = [item.text for item in brewery_a_tag]
                brewery_a_tag_links = [item['href'] for item in brewery_a_tag]
                if len(brewery_a_tag_text) < 5:
                    #pad after the first anchor with empty strings so both lists hold five items
                    length = 5 - len(brewery_a_tag_text)
                    for _ in range(length):
                        brewery_a_tag_text.insert(1, "")
                        brewery_a_tag_links.insert(1, "")

                #first three anchor texts + last two anchor hrefs
                brewery_a_tag_scraped = brewery_a_tag_text[:3] + brewery_a_tag_links[-2:]
                (brewery_city, brewery_state, brewery_country,
                 brewery_location, brewery_website) = brewery_a_tag_scraped
                #things scraped from breweries by tag 'a'
                # brewery_city
                # brewery_state
                # brewery_country
                # brewery_location
                # brewery_website

                #scraping rest thing from breweries
                brewery_info_list = brewery.text.split('\n') #split on new line symbol
                brewery_info_list = [x for x in brewery_info_list if x != ''] #deleting empty strings
                brewery_type = brewery_info_list[0].strip().split(',') #kept as a list of strings
                brewery_notes = "".join(brewery_info_list).split("Notes:")[-1].strip()
                brewery_rest = brewery_info_list[1]
                brewery_adress = brewery_rest.split(brewery_city)[0] #text in front of the city name
                brewery_city_state_concat = brewery_city + ", " + brewery_state
                if brewery_country != "" and brewery_city_state_concat != "":
                    #postal code: text between "<city>, <state>" and the country name
                    brewery_postal_code = brewery_rest.split(brewery_city_state_concat)[-1].split(brewery_country)[0].replace(", ","")
                    #telephone: text after the country name, up to the first '|'
                    brewery_telephone = brewery_rest.split(brewery_country)[-1].split("|")[0].strip()
                else:
                    brewery_postal_code = np.nan
                    brewery_telephone = np.nan
                # rest things scraped from breweries
                # brewery_type
                # brewery_adress
                # brewery_postal_code
                # brewery_telephone
                # brewery_notes

                #save data to dataframe (order must match the df_scraped column order)
                data_list = [subs_beers_name, beer_s_name, beer_style, beer_country, beer_state, beer_rank_in_style,
                             beer_score, beer_rank, beer_abv, beer_avg, beer_dev_prc, beer_ratings, beer_active, beer_added,
                             beer_wants, beer_gots, beer_brewery_name, brewery_city, brewery_state, brewery_country,
                             brewery_location, brewery_website, brewery_type, brewery_adress, brewery_postal_code,
                             brewery_telephone, brewery_notes]
                df_scraped.loc[i] = data_list
                i += 1

NameError: name 'pd' is not defined

### Export scraped data to json file

In [169]:
# Serialise the scraped table (row index -> record) and store it on disk.
# NOTE: json_object is already a JSON string, so encoding it again writes a
# JSON-encoded *string* to the file (double encoding); json_parsed is not used
# by the write below.
json_object = df_scraped.to_json(orient="index")
json_parsed = json.loads(json_object)
with open("scraped_data_x.json", "w") as outfile:
    outfile.write(json.dumps(json_object))

### Importing all generated json files into one dataframe

In [36]:
# Because the connection to the site kept dropping, the data was scraped into
# eight separate files; load every *.json file and stack them into one frame.
df_list = []
dirs = sorted(os.listdir('./json_files')) #sorted: os.listdir order is arbitrary

for file in dirs:
    if not file.endswith(".json"):
        continue #ignore anything that is not a scraped JSON file
    with open(os.path.join('./json_files', file), 'r') as f:
        #json.load returns a str here: the export step dumps the already
        #serialised to_json() string, so the real payload is one level deeper
        json_file = json.load(f)
    #wrap in StringIO - passing literal JSON text to read_json is deprecated
    data = pd.read_json(StringIO(json_file)).transpose()
    #append only inside the .json branch, so a stray file can neither duplicate
    #the previous frame nor hit an undefined `data`
    df_list.append(data)

df_json = pd.concat(df_list, ignore_index=True)

### Cleaning scraped data

In [65]:
# Clean the raw scraped table: repair known-bad records, align brewery
# country/state with the beer columns, convert numeric text columns, parse the
# "date added" column and tidy the brewery map / website / postal-code columns.
# All conditional writes use df.loc[mask, column] = value; the chained form
# df[column].loc[mask] = value is not guaranteed to write back into df.
df = df_json.copy() #copy of loaded df from json file
pd.set_option('display.max_columns', 27) #show all columns
pd.set_option('display.max_rows', 50)
df = df.fillna(value = np.nan)
df = df.replace(r'^\s*$', np.nan, regex=True) #empty / whitespace-only strings -> NaN


def _strip_commas(column):
    """Return the text column with every ',' removed (e.g. '12,345' -> '12345')."""
    return column.str.replace(',', '', regex=False)


#cleaning trumer.at records (the website ended up in the city column)
mask_trumer_web = df['brewery_city'] == 'trumer.at'
trumer_values = {'brewery_website': 'http://trumer.at',
                 'brewery_city': 'Obertrum ',
                 'beer_country': 'Austria',
                 'brewery_adress': 'Josef Sigl e.U. Brauhausgasse 2',
                 'brewery_postal_code': 'A-5162',
                 'brewery_telephone': '+43 6219 74 11 – 0',
                 'brewery_map': 'https://goo.gl/maps/3TWfQjgsKWvKrNCR9'}
for column, value in trumer_values.items():
    df.loc[mask_trumer_web, column] = value

#cleaning utahbeers
mask_utahbers_web = df['brewery_website'] == 'http://utahbeers.com'
utahbeers_values = {'beer_country': 'United States',
                    'beer_state': 'Utah',
                    'brewery_city': 'Salt Lake City',
                    'brewery_map': 'https://goo.gl/maps/bVYAnPASmfVFbKgPA',
                    'brewery_adress': '1763 S 300 W',
                    'brewery_postal_code': '84115',
                    'brewery_telephone': '(801) 466-8855'}
for column, value in utahbeers_values.items():
    df.loc[mask_utahbers_web, column] = value

#cleaning countries and states
croatia_mask = df['beer_country'] == 'Croatia (Hrvatska)'
df.loc[croatia_mask, 'beer_state'] = np.nan
df['brewery_country'] = df['beer_country'] #brewery location columns mirror the beer columns
df['brewery_state'] = df['beer_state']

#cleaning bad scraped brewery telephone (contained few informations for certain records)
#NaN is dropped from the list: str.contains cannot search for a non-string pattern
country_list = list(df['beer_country'].dropna().unique())
mask_notes_http = df['brewery_notes'].str.contains("http", na=False)

for s in country_list:
    #a telephone value that contains a country name also holds other fields
    mask_bad_tel = df['brewery_telephone'].str.contains(s, na=False, regex=False)
    df.loc[mask_bad_tel, 'brewery_map'] = df.loc[mask_bad_tel, 'brewery_website']
    df.loc[mask_bad_tel, 'brewery_website'] = np.nan
    #the original double-chained .loc[...].loc[...] assignment wrote to a temporary
    #copy and never changed df; the combined mask applies the intended update
    mask_bad_tel_http = mask_bad_tel & mask_notes_http
    df.loc[mask_bad_tel_http, 'brewery_website'] = df.loc[mask_bad_tel_http, 'brewery_notes']
    #text before the country name -> postal code, text after it -> telephone
    tel_parts = df.loc[mask_bad_tel, 'brewery_telephone'].str.split(s, regex=False)
    df.loc[mask_bad_tel, 'brewery_postal_code'] = _strip_commas(tel_parts.str[0]).str.strip()
    df.loc[mask_bad_tel, 'brewery_telephone'] = _strip_commas(tel_parts.str[-1]).str.strip()

#cleaning rank in style, score and rank (removing ',' from notation like 12,345.6)
for column in ('rank_in_style', 'score', 'rank'):
    df[column] = _strip_commas(df[column]).astype(float)

#cleaning abv
#'Not liste': the scraping step cuts the last character off the abv text
mask_abv = df['abv [%]'].str.contains('Not liste', na=False)
df.loc[mask_abv, 'abv [%]'] = np.nan
df['abv [%]'] = df['abv [%]'].astype(float).round(1)

#avg and deviation from avg of score
df['avg_score'] = df['avg_score'].astype(float)
df['deviation_score [%]'] = df['deviation_score [%]'].astype(float)

#cleaning rating, who wants and who have
for column in ('ratings', 'who_wants', 'who_have'):
    df[column] = _strip_commas(df[column]).astype(int)

#cleaning active ('Active' -> 1, 'Retired' -> 0, anything else left as is)
mask_active = df['active'] == 'Active'
df.loc[mask_active, 'active'] = 1
mask_retired = df['active'] == 'Retired'
df.loc[mask_retired, 'active'] = 0

#cleaning date
mask_minute = df['date_added'].str.contains('minute', na=False, regex=False)
df.loc[mask_minute, 'date_added'] = 'Today'
mask_moment = df['date_added'].str.contains('moment', na=False, regex=False)
df.loc[mask_moment, 'date_added'] = 'Today'

#keep only the text in front of ' at' for values that contain 'at'
mask_data = df['date_added'].str.contains('at', na=False, regex=False)
df.loc[mask_data, 'date_added'] = df.loc[mask_data, 'date_added'].str.split(' at').str[0].str.strip()
#fixed calendar dates substituted for the relative day names
day_dict = {'Today': 'Jun 29, 2022',
            'Yesterday': 'Jun 28, 2022',
            'Monday': 'Jun 27, 2022',
            'Sunday': 'Jun 26, 2022',
            'Saturday': 'Jun 25, 2022',
            'Friday': 'Jun 24, 2022',
            'Thursday': 'Jun 23, 2022',
            'Wednesday': 'Jun 22, 2022',
            'Tuesday': 'Jun 21, 2022'}
df['date_added'] = df['date_added'].replace(day_dict)
df['date_added'] = df['date_added'].apply(format_date)

#cleaning brewery map and website
#a website value containing 'maps' is moved to the map column
maps_in_website_mask = df['brewery_website'].str.contains('maps', na=False, regex=False)
df.loc[maps_in_website_mask, 'brewery_map'] = df.loc[maps_in_website_mask, 'brewery_website']
df.loc[maps_in_website_mask, 'brewery_website'] = np.nan
#a map value without 'maps' is moved to the website column; missing map values are
#excluded so that a NaN map no longer overwrites an existing website with NaN
not_a_map_in_maps = df['brewery_map'].notna() & ~df['brewery_map'].str.contains('maps', na=False, regex=False)
df.loc[not_a_map_in_maps, 'brewery_website'] = df.loc[not_a_map_in_maps, 'brewery_map']
df.loc[not_a_map_in_maps, 'brewery_map'] = np.nan

#cleaning city
df['brewery_city'] = df['brewery_city'].str.title().str.strip()

#cleaning brewery
df['brewery_name'] = df['brewery_name'].str.title().str.strip()

#cleaning postal code (values longer than 20 characters are discarded)
code_mask = df['brewery_postal_code'].str.len() > 20
df.loc[code_mask, 'brewery_postal_code'] = np.nan

### Writing the DataFrame to an HDF file and showing the head of the cleaned DataFrame

In [67]:
# Store the cleaned frame under key 'df' (mode='w' replaces any existing file),
# then preview its first rows as the cell output.
hdf_path = './hdf_files/cleaned_beers_df.h5'
df.to_hdf(hdf_path, mode='w', key='df')
df.head()

Unnamed: 0,beer_name,name of style,name_of_substyle,beer_country,beer_state,rank_in_style,score,rank,abv [%],avg_score,deviation_score [%],ratings,active,date_added,who_wants,who_have,brewery_name,brewery_city,brewery_state,brewery_country,brewery_map,brewery_website,brewery_type,brewery_adress,brewery_postal_code,brewery_telephone,brewery_notes
0,Ayinger Celebrator,Bocks,Bock - Doppelbock,Germany,,1.0,96.0,879.0,6.7,4.34,9.91,6978,1,2001-01-10,623,1142,Ayinger Privatbrauerei,Aying,,Germany,https://maps.google.com/maps?oi=map&q=M%C3%BCn...,http://ayinger-bier.de,"[Brewery, Bar, Eatery]",Münchener Straße 21,85653,08095-90650,
1,Troegenator,Bocks,Bock - Doppelbock,United States,Pennsylvania,48.0,88.0,13560.0,8.2,3.95,11.65,3516,1,2003-02-12,163,845,Tröegs Brewing Company,Hershey,Pennsylvania,United States,https://maps.google.com/maps?oi=map&q=200+East...,http://troegs.com,"[Brewery, Bar, Eatery, Beer-to-go]",200 East,17033,(717) 534-1297,Sunday-Wednesday 11am-9pmThursday - sat 11am-1...
2,Spaten Optimator,Bocks,Bock - Doppelbock,Germany,,75.0,87.0,16263.0,7.6,3.9,12.05,3264,1,1999-04-06,114,518,Spaten-Franziskaner-Bräu,München,,Germany,https://maps.google.com/maps?oi=map&q=Mars+Str...,http://franziskaner-weissbier.de,[Brewery],Mars Strasse 46-48,80335,(089) 51 221,OWNED BY ANHEUSER-BUSCH INBEVSee also: http://...
3,Salvator,Bocks,Bock - Doppelbock,Germany,,57.0,88.0,14654.0,7.9,3.93,11.7,3047,1,2001-04-23,111,511,Paulaner Brauerei,München,,Germany,https://maps.google.com/maps?oi=map&q=Hochstra...,http://paulaner.de,[Brewery],Hochstraße 75,81541,089 / 4 80 051,
4,Weihenstephaner Korbinian,Bocks,Bock - Doppelbock,Germany,,5.0,93.0,2924.0,7.4,4.2,9.05,2969,1,2001-09-12,251,398,Bayerische Staatsbrauerei Weihenstephan,Freising,,Germany,https://maps.google.com/maps?oi=map&q=Alte+Aka...,http://weihenstephaner.de,"[Brewery, Bar, Eatery, Beer-to-go]",Alte Akademie 2,85354,+49 8161 5360,


### Connecting to the SFTP server and uploading the HDF file

In [68]:
import pysftp

# Connection settings for the SFTP upload.
sftpHost = "localhost"
sftpPort = 22
uname = "wojci"
privateKeyFilePath = "./id_rsa"

cnOpts = pysftp.CnOpts()
# SECURITY NOTE(review): hostkeys = None switches off host-key verification, so the
# server's identity is not checked. Kept as in the original; load known host keys
# instead before using this against anything other than a trusted host.
cnOpts.hostkeys = None

# port is passed explicitly so that sftpPort actually takes effect
# (it was defined but never handed to the connection before).
with pysftp.Connection(host=sftpHost, port=sftpPort, username=uname,
                       private_key=privateKeyFilePath, cnopts=cnOpts) as sftp:
    print("Connected to sftp server")
    sftp.cwd("./beer_files") #remote working directory for the upload
    #upload the cleaned HDF file as beer_files.h5, keeping the local modification time
    sftp.put("./hdf_files/cleaned_beers_df.h5", "./beer_files.h5", preserve_mtime=True)
    print(sftp.listdir())

Connected to sftp server
['beer_files.h5']
