In [54]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import numpy as np
import os
from time import time
import smtplib
import locale
# Adopt the user's default locale settings for all categories. locale.atoi is
# called in later cells of this notebook to parse the scraped "ratings" values.
locale.setlocale( locale.LC_ALL, '' ) 

'English_United States.1252'

### 1. Scrape beer style list

In [231]:
# Scrape the list of beer styles from beeradvocate.com.
# The result is cached in styles.csv; the scrape only runs when that file is missing.
if os.path.exists("styles.csv"):
    print("Style list is already scraped")

else:
    url="https://www.beeradvocate.com/beer/style/"
    r = requests.get(url)
    # Fail loudly on an HTTP error. Without this an error page would be parsed
    # and written to styles.csv, and the cache check above would then prevent
    # any later attempt to scrape the real list.
    r.raise_for_status()
    soup = BeautifulSoup(r.content,"lxml")

    # Every element whose href contains /beer/style/.
    styles=soup.find_all(href=re.compile("/beer/style/"))
    names=[style.contents[0] for style in styles]
    links=["https://www.beeradvocate.com"+style.get("href") for style in styles]

    # Drop the first two matches and the last one.
    # NOTE(review): presumably these are non-style links on the page (navigation
    # or similar) - confirm against the live markup.
    links=links[2:-1]
    names=names[2:-1]

    # The style id is the last path segment of the link (links end with "/").
    style_ids=[link.split("/")[-2] for link in links]

    style_table=pd.DataFrame({"style":pd.Series(names),"link":pd.Series(links),"id":pd.Series(style_ids)})
    style_table.to_csv("styles.csv", encoding="utf-8")

Style list is already scraped


### 2. Scrape beer list by style

In [234]:
# Load the cached style table and pull out the three columns used by the scraper below
styles = pd.read_csv("styles.csv", encoding="utf-8")
links, names, ids = (styles[column] for column in ("link", "style", "id"))

In [235]:
# Scrape the beers of each style from beeradvocate.com.
# One CSV ("style_<id>.csv") is written per style; styles whose file already
# exists are skipped, so the cell can be re-run to resume.
# Relies on `links`, `names` and `ids` loaded from styles.csv.
PAGE_SIZE = 50  # the listing's "start" offset advances in steps of 50
BEER_COLUMNS = ["beer_names", "beer_ids", "companys", "company_ids",
                "abvs", "ratings", "scores", "beer_links"]

for style_id in range(len(names)):
    style_url=links[style_id]
    style_name = "style_"+str(ids[style_id])+".csv"
    if os.path.exists(style_name):
        print("{} already existed".format(style_name))
        continue

    r = requests.get(style_url)
    # An error page would otherwise be parsed as "no beers" and cached as a
    # finished style file.
    r.raise_for_status()
    soup = BeautifulSoup(r.content,"lxml")
    print("Start parsing {}".format(names[style_id]))

    # style_url[29:] drops the leading "https://www.beeradvocate.com/" so the
    # pattern matches the site-relative pager links. The path is escaped so it
    # is matched literally.
    pager_pattern = re.compile(re.escape(style_url[29:]) + r"\?sort=revsD&start=")

    # Offset of the final page, read from the pager's "last" link. It defaults
    # to 0 (a single page) when there is no such link, instead of being unbound
    # or carrying over the value from the previous style.
    last_page = 0
    for ind in soup.find_all(href=pager_pattern):
        if ind.contents[0]=="last":
            last_page=int(ind.get("href").split("=")[-1])
            break
    print("Total items: {}".format(last_page))

    rows = []
    # Visit every listing page: start = 0, 50, ..., up to and including last_page.
    for start in range(0, last_page + 1, PAGE_SIZE):
        beer_url=style_url+"?sort=revsD&start="+str(start)
        r = requests.get(beer_url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content,"lxml")

        # Each beer occupies five consecutive "hr_bottom_light" elements:
        # name, company, abv, ratings, score.
        beers=soup.find_all(class_="hr_bottom_light")
        assert len(beers)%5==0, "unexpected table layout on {}".format(beer_url)
        for i in range(0, len(beers), 5):
            name_cell, company_cell, abv_cell, rating_cell, score_cell = beers[i:i+5]

            # The beer link has the form .../<company_id>/<beer_id>/
            beer_anchor = name_cell.contents[0]
            href_parts = beer_anchor.get("href").split("/")
            beer_id = href_parts[-2]
            company_id = href_parts[-3]

            rows.append({
                "beer_names": beer_anchor.contents[0].contents[0],
                "beer_ids": beer_id,
                "companys": company_cell.contents[0].contents[0],
                "company_ids": company_id,
                "abvs": abv_cell.contents[0].contents[0],
                "ratings": rating_cell.contents[0].contents[0],
                "scores": score_cell.contents[0].contents[0],
                "beer_links": "https://www.beeradvocate.com/beer/profile/"+company_id+"/"+beer_id+"/",
            })

    df=pd.DataFrame(rows, columns=BEER_COLUMNS)

    df.to_csv(style_name, encoding="utf-8")
    print("{} completed".format(style_name))
    print("Total beers: {}".format(df.shape[0]))

style_128.csv already existed
style_19.csv already existed
style_175.csv already existed
style_99.csv already existed
style_73.csv already existed
style_94.csv already existed
style_140.csv already existed
style_157.csv already existed
style_116.csv already existed
style_97.csv already existed
style_93.csv already existed
style_159.csv already existed
style_158.csv already existed
style_78.csv already existed
style_171.csv already existed
style_130.csv already existed
style_163.csv already existed
style_6.csv already existed
style_72.csv already existed
style_12.csv already existed
style_60.csv already existed
style_119.csv already existed
style_174.csv already existed
style_54.csv already existed
style_56.csv already existed
style_55.csv already existed
style_141.csv already existed
style_127.csv already existed
style_57.csv already existed
style_15.csv already existed
style_52.csv already existed
style_53.csv already existed
style_14.csv already existed
style_10.csv already existed
s

### 3. Scrape brewery info

In [236]:
# Load the cached style table
styles = pd.read_csv("styles.csv", encoding="utf-8")
names, ids = styles["style"], styles["id"]


# Gather the distinct company ids that appear in any per-style beer file
company_ids = set()

for style_id in range(len(names)):
    style_name = "style_" + str(ids[style_id]) + ".csv"
    df = pd.read_csv(style_name, encoding="utf-8")
    company_ids.update(df["company_ids"].tolist())

# Freeze the set into a list and persist it as a one-column CSV
company_ids = list(company_ids)
df = pd.DataFrame({"company_ids": pd.Series(company_ids)})
df.to_csv("company_ids.csv", encoding="utf-8")

print("{} companies recorded".format(len(company_ids)))

13939 companies recorded


In [239]:
# Scrape one profile page per brewery and store the result in company.csv.
# Rows already present in company.csv are reused: scraping resumes at the first
# company id (by position in company_ids.csv) that has no row yet.
company_ids=pd.read_csv("company_ids.csv",encoding="utf-8")["company_ids"].tolist()

try:
    df=pd.read_csv("company.csv",encoding="utf-8")

    company_urls=df["company_urls"].tolist()
    company_names=df["company_names"].tolist()
    location1s=df["location1s"].tolist()
    location2s=df["location2s"].tolist()

    ba_scores=df["ba_scores"].tolist()
    ba_levels=df["ba_levels"].tolist()
    total_ratings=df["total_ratings"].tolist()

    beer_currents=df["beer_currents"].tolist()
    total_reviews=df["total_reviews"].tolist()

except FileNotFoundError:
    # Nothing scraped yet: start with empty columns.
    company_urls=[]
    company_names=[]
    location1s=[]
    location2s=[]

    ba_scores=[]
    ba_levels=[]
    total_ratings=[]

    beer_currents=[]
    total_reviews=[]


i=len(company_urls)
print("{} companies information already scraped".format(i))

try:
    while i<len(company_ids):
        comp_id=company_ids[i]
        company_url="https://www.beeradvocate.com/beer/profile/"+str(comp_id)+"/"
        r = requests.get(company_url)
        soup = BeautifulSoup(r.content,"lxml")

        # All fields are parsed before anything is appended, so a parse failure
        # never leaves the column lists with different lengths.
        company_info=soup.find_all(attrs={"name": "keywords"}) [0].get("content")
        score_summary=soup.find_all(id="score_box")[0].contents
        score_detail=soup.find_all(id="item_stats")[0].contents[1].contents

        # The keywords meta content is comma-separated: name first, then up to
        # two further parts stored as location1 / location2 ("NA" when absent).
        info_parts=company_info.split(",")
        company_name=info_parts[0]
        location1=info_parts[1] if len(info_parts)>1 else "NA"
        location2=info_parts[2] if len(info_parts)>2 else "NA"

        ba_score=score_summary[6].contents[0].contents[0]
        try:
            ba_level=score_summary[10].contents[0]
        except IndexError:
            ba_level="NA"
        total_rating=score_summary[-1].split("\t")[0]

        beer_current=score_detail[3].contents[0]
        total_review=score_detail[7].contents[0]

        company_urls.append(company_url)
        company_names.append(company_name)
        location1s.append(location1)
        location2s.append(location2)
        ba_scores.append(ba_score)
        ba_levels.append(ba_level)
        total_ratings.append(total_rating)
        beer_currents.append(beer_current)
        total_reviews.append(total_review)
        print("Finished: {}".format(i))
        i+=1

finally:
    # Always persist what has been collected, even when a request or a parse
    # step raises. Otherwise every row scraped in this run is lost and the
    # resume logic above has nothing to resume from.
    companys=pd.DataFrame({"company_ids":pd.Series(company_ids[0:len(ba_scores)]),"company_urls":pd.Series(company_urls),"company_names":pd.Series(company_names),
                           "location1s":pd.Series(location1s),"location2s":pd.Series(location2s),
                           "ba_scores":pd.Series(ba_scores),"ba_levels":pd.Series(ba_levels),"total_ratings":pd.Series(total_ratings),
                           "beer_currents":pd.Series(beer_currents),"total_reviews":pd.Series(total_reviews)
                          })

    companys.to_csv("company.csv",encoding="utf-8")


13939 companies information already scraped


### 4. Generate dataset summary

In [17]:
# Load the cached style table
styles = pd.read_csv("styles.csv", encoding="utf-8")
names, ids = styles["style"], styles["id"]

# Running totals over every per-style beer file
count = 0       # number of beers
top_rated = 0   # beers with more than 500 ratings
ratings = 0     # sum of all rating counts

for style_id in range(len(names)):
    style_name = "style_" + str(ids[style_id]) + ".csv"
    df = pd.read_csv(style_name, encoding="utf-8")

    # Parse the "ratings" column once with the active locale, then reuse the numbers
    rating_counts = [locale.atoi(str(num)) for num in df["ratings"].tolist()]

    count += df.shape[0]
    ratings += sum(rating_counts)
    top_rated += sum(1 for value in rating_counts if value > 500)

print("Total style: {}".format(len(names)))
print("Total beer: {}".format(count))
print("Total beer with top ratings: {}".format(top_rated))
print("Total ratings: {}".format(ratings))

Total style: 104
Total beer: 219064
Total beer with top ratings: 2680
Total ratings: 7321589


### 5. Get simplified beer rating datasets

In [8]:
# For every beer with more than `cut_off_num` ratings, scrape its stats summary
# and the per-user ratings from its beer page.
#   <folder>/beer_<id>.csv     one row per user rating of that beer
#   <folder>/beer_summary.csv  one row per scraped beer; rewritten after each
#                              beer so an interrupted run can be resumed
start_id=0          # index into the style list at which to start
cut_off_num=1000    # only beers with more ratings than this are scraped

# Read pre-saved list of beer styles
styles=pd.read_csv("styles.csv", encoding="utf-8")
names=styles["style"]
ids=styles["id"]

# Create the output directory, including any missing parent directories.
# NOTE(review): machine-specific absolute path; the user-info cells of this
# notebook read from the same location, so change them together.
folder = "C:\\Users\\yjin9\\Documents\\python-scripts\\data_incubator\\data_scraping_cleaning\\top_rated\\"
os.makedirs(folder, exist_ok=True)

summary_file_path =os.path.join(folder,"beer_summary.csv")
try:
    beer_summary_df=pd.read_csv(summary_file_path,encoding="utf-8")
    beer_ids_2=beer_summary_df["beer_ids"].tolist()
    # Reload ranks from their own column (this previously read "beer_ids",
    # which replaced every stored rank with a beer id on resume).
    ranks=beer_summary_df["ranks"].tolist()
    reviews_nums=beer_summary_df["reviews_nums"].tolist()
    pDevs=beer_summary_df["pDevs"].tolist()
    bro_scores=beer_summary_df["bro_scores"].tolist()
    availabilities=beer_summary_df["availabilities"].tolist()
    ibus=beer_summary_df["ibus"].tolist()

except FileNotFoundError:
    beer_ids_2=[]
    ranks=[]
    reviews_nums=[]
    pDevs=[]
    bro_scores=[]
    availabilities=[]
    ibus=[]

# Set copy of the recorded ids for constant-time "already scraped?" checks.
recorded_beer_ids=set(beer_ids_2)

RATING_COLUMNS=["user_id","ba_score","rDev","look","smell","taste","feel","overall"]


def _parse_rating(comment):
    """Extract one user's rating from a "rating_fullview_container" element.

    Returns a dict keyed by RATING_COLUMNS. The five per-dimension values are
    "N/A" when the rating has no complete look/smell/taste/feel/overall
    breakdown.
    """
    detail_rating=comment.find_all(id="rating_fullview_content_2")[0].contents
    ba_score=detail_rating[0].contents[0]

    # When element 3 has no single string, the deviation is recorded as "0%"
    # and the per-dimension breakdown is read one position earlier.
    if detail_rating[3].string is None:
        rDev="0%"
        by_dim_idx=4
    else:
        rDev=detail_rating[3].contents[0]
        by_dim_idx=5

    try:
        # Five "|"-separated "label: value" entries; keep the value parts.
        by_dim=detail_rating[by_dim_idx].contents[0].split("|")
        look,smell,taste,feel,overall=[by_dim[k].split(":")[-1].strip() for k in range(5)]
    except IndexError:
        look=smell=taste=feel=overall="N/A"

    return {"user_id":comment.get("ba-user"),"ba_score":ba_score,"rDev":rDev,
            "look":look,"smell":smell,"taste":taste,"feel":feel,"overall":overall}


### input:start_id
for style_id in range(start_id,len(ids)):
    print("Start style id {}".format(style_id))
    t00=time()
    style_name = "style_"+str(ids[style_id])+".csv"
    df=pd.read_csv(style_name, encoding="utf-8")

    counter=0
    for beer_id,beer_url,beer_rating in zip(df["beer_ids"],df["beer_links"],df["ratings"]):
        ### Input cut-off rating numbers, e.g. 1000
        if locale.atoi(str(beer_rating))<=cut_off_num:
            continue
        counter+=1

        if beer_id in recorded_beer_ids:
            print("Beer {} already recorded".format(beer_id))
            continue

        t0=time()
        print("Start record beer {}".format(beer_id))
        ## create file name for detail rating
        file_path =os.path.join(folder,"beer_"+str(beer_id)+".csv")

        ## get beer stats summary from beer page
        r = requests.get(beer_url)
        r.raise_for_status()
        soup = BeautifulSoup(r.content,"lxml")

        beer_stats=soup.find_all(id="item_stats")[0].find_all("dd")
        rank=beer_stats[0].contents[0][1:]
        reviews_num=beer_stats[1].contents[0].contents[0]
        pDev=beer_stats[3].contents[0].contents[0].strip()
        bro_score=beer_stats[4].contents[0].contents[-2].contents[0]

        beer_info = soup.find_all(id="info_box")[0].contents
        try:
            availability=beer_info[37].strip()
        except TypeError:
            availability="N/A"

        # The IBU entry is accepted only when the text ends in "IBU".
        ibu=beer_info[-7].strip()
        if ibu[-3:]!="IBU":
            ibu="N/A"

        ## get beer comments - ratings only
        # The last /beer/profile/ link inside the first bold block carries the
        # start offset of the final review page.
        pager=soup.find_all(attrs={"style":"font-weight:bold;"})[0]
        last_page_href=pager.find_all(href=re.compile("/beer/profile/"))[-1].get("href")
        comment_end=int(last_page_href.split("=")[-1])

        rating_rows=[]
        # Review pages advance in steps of 25.
        for comment_page in range(comment_end//25+1):
            comment_url =  beer_url+"?view=beer&sort=&start="+str(comment_page*25)
            r = requests.get(comment_url)
            # Stop here on an HTTP error rather than silently dropping a page of
            # ratings and still marking the beer as recorded.
            r.raise_for_status()
            soup = BeautifulSoup(r.content,"lxml")
            for comment in soup.find_all(id="rating_fullview_container"):
                rating_rows.append(_parse_rating(comment))

        beer_df=pd.DataFrame(rating_rows,columns=RATING_COLUMNS)
        beer_df.to_csv(file_path,encoding="utf-8")

        # Report the beer id (this previously printed the row index).
        print("Finish beer id {}".format(beer_id))
        print("{} sec".format(round(time()-t0)))

        # Register the beer only once its ratings file has been written, then
        # checkpoint the summary so the run can be resumed from here.
        beer_ids_2.append(beer_id)
        recorded_beer_ids.add(beer_id)
        ranks.append(rank)
        reviews_nums.append(reviews_num)
        pDevs.append(pDev)
        bro_scores.append(bro_score)
        availabilities.append(availability)
        ibus.append(ibu)

        beer_summary_df=pd.DataFrame({"beer_ids":pd.Series(beer_ids_2),"ranks":pd.Series(ranks),"reviews_nums":pd.Series(reviews_nums),
                                      "pDevs":pd.Series(pDevs),"bro_scores":pd.Series(bro_scores),
                                      "availabilities":pd.Series(availabilities),"ibus":pd.Series(ibus)})
        beer_summary_df.to_csv(summary_file_path,encoding="utf-8")

    print("All beers in style id {} are done".format(style_id))
    print("{} in total".format(counter))
    # Elapsed time is now minus start (the operands were previously swapped,
    # which gave a negative duration).
    minutes=round((time()-t00)/60)
    print("{} mins".format(minutes))


Start style id 0
Beer 607 already recorded
Beer 15881 already recorded
Beer 35732 already recorded
Beer 794 already recorded
Beer 6322 already recorded
Beer 98495 already recorded
Beer 3711 already recorded
Beer 35397 already recorded
Beer 1792 already recorded
Beer 1655 already recorded
Beer 54522 already recorded
Beer 1173 already recorded
Beer 410 already recorded
Beer 1490 already recorded
Beer 623 already recorded
Beer 2526 already recorded
Beer 691 already recorded
Beer 61877 already recorded
Beer 363 already recorded
Beer 2297 already recorded
Beer 95921 already recorded
Beer 18721 already recorded
Beer 213 already recorded
Beer 104466 already recorded
Beer 199848 already recorded
Beer 1597 already recorded
All beers in style id 0 are done
26 in total
0 mins
Start style id 1
Beer 2671 already recorded
Beer 1891 already recorded
Beer 6533 already recorded
Beer 20931 already recorded
Beer 41300 already recorded
Beer 78551 already recorded
Beer 1009 already recorded
Beer 4810 alrea

Beer 50564 already recorded
Beer 73764 already recorded
Beer 95386 already recorded
Beer 3916 already recorded
Beer 11922 already recorded
Beer 32286 already recorded
Beer 86237 already recorded
Beer 57252 already recorded
Beer 6549 already recorded
Beer 71969 already recorded
Beer 25755 already recorded
Beer 111616 already recorded
Beer 1493 already recorded
Beer 111969 already recorded
Beer 665 already recorded
Beer 53886 already recorded
Beer 15758 already recorded
Beer 65347 already recorded
Beer 205 already recorded
Beer 34688 already recorded
Beer 5385 already recorded
Beer 148052 already recorded
Beer 20781 already recorded
Beer 611 already recorded
Beer 2729 already recorded
Beer 35626 already recorded
Beer 55862 already recorded
Beer 29209 already recorded
Beer 42533 already recorded
Beer 91677 already recorded
Beer 58610 already recorded
Beer 88969 already recorded
Beer 48243 already recorded
Beer 66674 already recorded
Beer 9088 already recorded
Beer 84747 already recorded
B

Beer 1376 already recorded
Beer 169625 already recorded
All beers in style id 17 are done
4 in total
0 mins
Start style id 18
Beer 38394 already recorded
Beer 6260 already recorded
Beer 100 already recorded
Beer 25880 already recorded
Beer 1932 already recorded
Beer 45844 already recorded
Beer 32767 already recorded
Beer 74 already recorded
Beer 11819 already recorded
Beer 52248 already recorded
Beer 1567 already recorded
Beer 97291 already recorded
Beer 62449 already recorded
Beer 47020 already recorded
Beer 73731 already recorded
Beer 7077 already recorded
Beer 72412 already recorded
Beer 643 already recorded
Beer 25608 already recorded
Beer 33405 already recorded
Beer 71325 already recorded
All beers in style id 18 are done
21 in total
0 mins
Start style id 19
Beer 16074 already recorded
Beer 43223 already recorded
Beer 56168 already recorded
Beer 30502 already recorded
Beer 16115 already recorded
All beers in style id 19 are done
5 in total
0 mins
Start style id 20
Beer 51069 alrea

Beer 79898 already recorded
Beer 40674 already recorded
Beer 88889 already recorded
Beer 65809 already recorded
All beers in style id 39 are done
5 in total
0 mins
Start style id 40
All beers in style id 40 are done
0 in total
0 mins
Start style id 41
Beer 100421 already recorded
Beer 41121 already recorded
Beer 18199 already recorded
Beer 1566 already recorded
Beer 46230 already recorded
Beer 1445 already recorded
Beer 50509 already recorded
Beer 49472 already recorded
Beer 705 already recorded
Beer 18093 already recorded
Beer 2678 already recorded
All beers in style id 41 are done
11 in total
0 mins
Start style id 42
Beer 1157 already recorded
All beers in style id 42 are done
1 in total
0 mins
Start style id 43
Beer 639 already recorded
Beer 576 already recorded
Beer 6 already recorded
Beer 47731 already recorded
Beer 22343 already recorded
Beer 9900 already recorded
Beer 50082 already recorded
Beer 1169 already recorded
All beers in style id 43 are done
8 in total
0 mins
Start styl

Beer 402 already recorded
Beer 28039 already recorded
Beer 45765 already recorded
Beer 94714 already recorded
Beer 47440 already recorded
Beer 39258 already recorded
Beer 16400 already recorded
Beer 846 already recorded
Beer 118299 already recorded
Beer 36 already recorded
Beer 79 already recorded
Beer 96677 already recorded
All beers in style id 71 are done
15 in total
0 mins
Start style id 72
Beer 1159 already recorded
Beer 1163 already recorded
Beer 12719 already recorded
Beer 740 already recorded
Beer 34069 already recorded
All beers in style id 72 are done
5 in total
0 mins
Start style id 73
Beer 245 already recorded
All beers in style id 73 are done
1 in total
0 mins
Start style id 74
Beer 65 already recorded
Beer 1331 already recorded
Beer 232 already recorded
Beer 580 already recorded
Beer 689 already recorded
Beer 1276 already recorded
Beer 567 already recorded
Beer 1907 already recorded
Beer 2280 already recorded
Beer 1321 already recorded
Beer 1371 already recorded
Beer 918 

### 6. Scrape simplified user info

In [84]:
# Collect the user id of every rating stored in the per-beer rating files.
# NOTE(review): machine-specific absolute path, shared with the cell that
# writes these files.
folder = "C:\\Users\\yjin9\\Documents\\python-scripts\\data_incubator\\data_scraping_cleaning\\top_rated\\"

summary_file_path =os.path.join(folder,"beer_summary"+".csv")
beer_ids=pd.read_csv(summary_file_path,encoding="utf-8")["beer_ids"].tolist()

# One entry per rating, so a user appears once for every beer they rated.
total_user_ids=[]

for beer_id in beer_ids:
    file_path =os.path.join(folder,"beer_"+str(beer_id)+".csv")
    user_ids=pd.read_csv(file_path,encoding="utf-8")["user_id"].tolist()
    # Skip missing ids and normalise the rest to integer strings. extend()
    # appends in place; rebuilding the list with "+" for every file copied the
    # whole accumulated list each time.
    total_user_ids.extend(str(int(i)) for i in user_ids if not np.isnan(i))

unique_user_ids=set(total_user_ids)
print("{} unique unsers extracted".format(len(unique_user_ids)))


124439 unique unsers extracted


In [102]:
# Count how many ratings each user id contributed, most active users first.
# The id column is duplicated so that grouping by "uid" and counting leaves the
# number of occurrences in "uid_count".
uid_series = pd.Series(total_user_ids)
uid_df = pd.DataFrame({"uid": uid_series, "uid_count": uid_series})

ratings_per_user = uid_df.groupby("uid").count()
uid_count = ratings_per_user.sort_values(by="uid_count", ascending=False)

uid_count.to_csv(os.path.join(folder, "uid_count.csv"), encoding="utf-8")

# Number of users with more than 100 ratings (shown as the cell output)
(uid_count["uid_count"] > 100).sum()

8999

In [138]:
# Scrape the public profile of every user listed in uid_count.csv and store the
# result in uid_info.csv. Users already present in uid_info.csv are skipped, so
# the cell can be re-run to resume.
# Relies on `folder` defined in an earlier cell.
unique_user_ids=pd.read_csv(os.path.join(folder,"uid_count.csv"),encoding="utf-8")["uid"].tolist()
print("{} unique unsers extracted".format(len(unique_user_ids)))

user_info_path=os.path.join(folder,"uid_info.csv")

try:
    user_df=pd.read_csv(user_info_path,encoding="utf-8")
    user_id_2=user_df["user_id"].tolist()
    user_names=user_df["user_name"].tolist()
    join_dates=user_df["join_date"].tolist()
    beer_karmas=user_df["beer_karma"].tolist()
    beer_markeds=user_df["beer_marked"].tolist()
    sexs=user_df["sex"].tolist()
    locations=user_df["location"].tolist()

except FileNotFoundError:
    user_id_2=[]
    user_names=[]
    join_dates=[]
    beer_karmas=[]
    beer_markeds=[]
    sexs=[]
    locations=[]

# Integer ids already saved, for constant-time resume checks. Missing values
# are skipped: files written before the column-alignment fix below may contain
# padded (NaN) user ids.
saved_user_ids={int(uid) for uid in user_id_2 if not pd.isna(uid)}


def _save_user_info():
    """Write every user record collected so far to uid_info.csv and return the frame."""
    frame=pd.DataFrame({"user_id":pd.Series(user_id_2),"user_name":pd.Series(user_names),"join_date":pd.Series(join_dates),
                        "beer_karma":pd.Series(beer_karmas),"beer_marked":pd.Series(beer_markeds),"sex":pd.Series(sexs),
                        "location":pd.Series(locations)})
    frame.to_csv(user_info_path,encoding="utf-8")
    return frame


err_counter=0

for i,raw_user_id in enumerate(unique_user_ids):
    user_id=int(raw_user_id)
    if user_id in saved_user_ids:
        continue

    user_url="https://www.beeradvocate.com/community/members/"+str(user_id)
    r = requests.get(user_url)
    soup = BeautifulSoup(r.content,"lxml")

    try:
        print("start saving {}".format(i))
        user_link=soup.find_all(attrs={"property":"og:url"})[0].get("content")
        user_name=user_link.split("/")[-2].split(".")[0]

        stat=soup.find_all(class_="section infoBlock")[0].find_all("dd")
        join_date=stat[-6].string.strip()
        beer_karma=stat[-5].string
        beer_marked=stat[-4].string

        info=soup.find_all(id="info")[0].find_all(class_="pairsColumns aboutPairs")[0].find_all("dd")
        sex=info[0].string
        location=info[-1].string

    except (IndexError, AttributeError):
        # The profile is missing an expected element (IndexError) or a field
        # has no single string (AttributeError on None). Count it and move on.
        err_counter+=1

    else:
        # Append only after every field parsed. Appending some columns before a
        # later lookup failed used to leave the lists with different lengths,
        # which shifted all following rows when the frame was built.
        user_id_2.append(user_id)
        saved_user_ids.add(user_id)
        user_names.append(user_name)
        join_dates.append(join_date)
        beer_karmas.append(beer_karma)
        beer_markeds.append(beer_marked)
        sexs.append(sex)
        locations.append(location)

    # Periodic checkpoint.
    if i%100==0:
        user_df=_save_user_info()
        print("{} users saved".format(i))
        print("{} errors".format(err_counter))

# Final save, so users scraped after the last checkpoint are not lost.
user_df=_save_user_info()
print("all user information saved")

124439 unique unsers extracted
start saving 12
start saving 14
start saving 18
start saving 20
start saving 21
start saving 22
start saving 33
start saving 35
start saving 51
start saving 52
start saving 54
start saving 56
start saving 61
start saving 65
start saving 71
start saving 83
start saving 87
start saving 88
start saving 94
start saving 96
start saving 105
start saving 106
start saving 117
start saving 120
start saving 125
start saving 130
start saving 138
start saving 148
start saving 151
start saving 163
start saving 166
start saving 168
start saving 172
start saving 174
start saving 191
start saving 205
start saving 212
start saving 218
start saving 219
start saving 228
start saving 243
start saving 246
start saving 247
start saving 260
start saving 262
start saving 265
start saving 272
start saving 279
start saving 290
start saving 291
start saving 295
start saving 297
start saving 300
300 users saved
53 errors
start saving 305
start saving 324
start saving 329
start savin

start saving 975
start saving 976
start saving 977
start saving 978
start saving 979
start saving 980
start saving 981
start saving 982
start saving 983
start saving 984
start saving 985
start saving 986
start saving 987
start saving 988
start saving 989
start saving 990
start saving 991
start saving 992
start saving 993
start saving 994
start saving 995
start saving 996
start saving 997
start saving 998
start saving 999
start saving 1000
1000 users saved
158 errors
start saving 1001
start saving 1002
start saving 1003
start saving 1004
start saving 1005
start saving 1006
start saving 1007
start saving 1008
start saving 1009
start saving 1010
start saving 1011
start saving 1012
start saving 1013
start saving 1014
start saving 1015
start saving 1016
start saving 1017
start saving 1018
start saving 1019
start saving 1020
start saving 1021
start saving 1022
start saving 1023
start saving 1024
start saving 1025
start saving 1026
start saving 1027
start saving 1028
start saving 1029
start s

start saving 1424
start saving 1425
start saving 1426
start saving 1427
start saving 1428
start saving 1429
start saving 1430
start saving 1431
start saving 1432
start saving 1433
start saving 1434
start saving 1435
start saving 1436
start saving 1437
start saving 1438
start saving 1439
start saving 1440
start saving 1441
start saving 1442
start saving 1443
start saving 1444
start saving 1445
start saving 1446
start saving 1447
start saving 1448
start saving 1449
start saving 1450
start saving 1451
start saving 1452
start saving 1453
start saving 1454
start saving 1455
start saving 1456
start saving 1457
start saving 1458
start saving 1459
start saving 1460
start saving 1461
start saving 1462
start saving 1463
start saving 1464
start saving 1465
start saving 1466
start saving 1467
start saving 1468
start saving 1469
start saving 1470
start saving 1471
start saving 1472
start saving 1473
start saving 1474
start saving 1475
start saving 1476
start saving 1477
start saving 1478
start savi

start saving 1873
start saving 1874
start saving 1875
start saving 1876
start saving 1877
start saving 1878
start saving 1879
start saving 1880
start saving 1881
start saving 1882
start saving 1883
start saving 1884
start saving 1885
start saving 1886
start saving 1887
start saving 1888
start saving 1889
start saving 1890
start saving 1891
start saving 1892
start saving 1893
start saving 1894
start saving 1895
start saving 1896
start saving 1897
start saving 1898
start saving 1899
start saving 1900
1900 users saved
267 errors
start saving 1901
start saving 1902
start saving 1903
start saving 1904
start saving 1905
start saving 1906
start saving 1907
start saving 1908
start saving 1909
start saving 1910
start saving 1911
start saving 1912
start saving 1913
start saving 1914
start saving 1915
start saving 1916
start saving 1917
start saving 1918
start saving 1919
start saving 1920
start saving 1921
start saving 1922
start saving 1923
start saving 1924
start saving 1925
start saving 1926


start saving 2321
start saving 2322
start saving 2323
start saving 2324
start saving 2325
start saving 2326
start saving 2327
start saving 2328
start saving 2329
start saving 2330
start saving 2331
start saving 2332
start saving 2333
start saving 2334
start saving 2335
start saving 2336
start saving 2337
start saving 2338
start saving 2339
start saving 2340
start saving 2341
start saving 2342
start saving 2343
start saving 2344
start saving 2345
start saving 2346
start saving 2347
start saving 2348
start saving 2349
start saving 2350
start saving 2351
start saving 2352
start saving 2353
start saving 2354
start saving 2355
start saving 2356
start saving 2357
start saving 2358
start saving 2359
start saving 2360
start saving 2361
start saving 2362
start saving 2363
start saving 2364
start saving 2365
start saving 2366
start saving 2367
start saving 2368
start saving 2369
start saving 2370
start saving 2371
start saving 2372
start saving 2373
start saving 2374
start saving 2375
start savi

start saving 2770
start saving 2771
start saving 2772
start saving 2773
start saving 2774
start saving 2775
start saving 2776
start saving 2777
start saving 2778
start saving 2779
start saving 2780
start saving 2781
start saving 2782
start saving 2783
start saving 2784
start saving 2785
start saving 2786
start saving 2787
start saving 2788
start saving 2789
start saving 2790
start saving 2791
start saving 2792
start saving 2793
start saving 2794
start saving 2795
start saving 2796
start saving 2797
start saving 2798
start saving 2799
start saving 2800
2800 users saved
382 errors
start saving 2801
start saving 2802
start saving 2803
start saving 2804
start saving 2805
start saving 2806
start saving 2807
start saving 2808
start saving 2809
start saving 2810
start saving 2811
start saving 2812
start saving 2813
start saving 2814
start saving 2815
start saving 2816
start saving 2817
start saving 2818
start saving 2819
start saving 2820
start saving 2821
start saving 2822
start saving 2823


start saving 3218
start saving 3219
start saving 3220
start saving 3221
start saving 3222
start saving 3223
start saving 3224
start saving 3225
start saving 3226
start saving 3227
start saving 3228
start saving 3229
start saving 3230
start saving 3231
start saving 3232
start saving 3233
start saving 3234
start saving 3235
start saving 3236
start saving 3237
start saving 3238
start saving 3239
start saving 3240
start saving 3241
start saving 3242
start saving 3243
start saving 3244
start saving 3245
start saving 3246
start saving 3247
start saving 3248
start saving 3249
start saving 3250
start saving 3251
start saving 3252
start saving 3253
start saving 3254
start saving 3255
start saving 3256
start saving 3257
start saving 3258
start saving 3259
start saving 3260
start saving 3261
start saving 3262
start saving 3263
start saving 3264
start saving 3265
start saving 3266
start saving 3267
start saving 3268
start saving 3269
start saving 3270
start saving 3271
start saving 3272
start savi

start saving 3667
start saving 3668
start saving 3669
start saving 3670
start saving 3671
start saving 3672
start saving 3673
start saving 3674
start saving 3675
start saving 3676
start saving 3677
start saving 3678
start saving 3679
start saving 3680
start saving 3681
start saving 3682
start saving 3683
start saving 3684
start saving 3685
start saving 3686
start saving 3687
start saving 3688
start saving 3689
start saving 3690
start saving 3691
start saving 3692
start saving 3693
start saving 3694
start saving 3695
start saving 3696
start saving 3697
start saving 3698
start saving 3699
start saving 3700
3700 users saved
497 errors
start saving 3701
start saving 3702
start saving 3703
start saving 3704
start saving 3705
start saving 3706
start saving 3707
start saving 3708
start saving 3709
start saving 3710
start saving 3711
start saving 3712
start saving 3713
start saving 3714
start saving 3715
start saving 3716
start saving 3717
start saving 3718
start saving 3719
start saving 3720


start saving 4115
start saving 4116
start saving 4117
start saving 4118
start saving 4119
start saving 4120
start saving 4121
start saving 4122
start saving 4123
start saving 4124
start saving 4125
start saving 4126
start saving 4127
start saving 4128
start saving 4129
start saving 4130
start saving 4131
start saving 4132
start saving 4133
start saving 4134
start saving 4135
start saving 4136
start saving 4137
start saving 4138
start saving 4139
start saving 4140
start saving 4141
start saving 4142
start saving 4143
start saving 4144
start saving 4145
start saving 4146
start saving 4147
start saving 4148
start saving 4149
start saving 4150
start saving 4151
start saving 4152
start saving 4153
start saving 4154
start saving 4155
start saving 4156
start saving 4157
start saving 4158
start saving 4159
start saving 4160
start saving 4161
start saving 4162
start saving 4163
start saving 4164
start saving 4165
start saving 4166
start saving 4167
start saving 4168
start saving 4169
start savi

start saving 4564
start saving 4565
start saving 4566
start saving 4567
start saving 4568
start saving 4569
start saving 4570
start saving 4571
start saving 4572
start saving 4573
start saving 4574
start saving 4575
start saving 4576
start saving 4577
start saving 4578
start saving 4579
start saving 4580
start saving 4581
start saving 4582
start saving 4583
start saving 4584
start saving 4585
start saving 4586
start saving 4587
start saving 4588
start saving 4589
start saving 4590
start saving 4591
start saving 4592
start saving 4593
start saving 4594
start saving 4595
start saving 4596
start saving 4597
start saving 4598
start saving 4599
start saving 4600
4600 users saved
575 errors
start saving 4601
start saving 4602
start saving 4603
start saving 4604
start saving 4605
start saving 4606
start saving 4607
start saving 4608
start saving 4609
start saving 4610
start saving 4611
start saving 4612
start saving 4613
start saving 4614
start saving 4615
start saving 4616
start saving 4617


start saving 5012
start saving 5013
start saving 5014
start saving 5015
start saving 5016
start saving 5017
start saving 5018
start saving 5019
start saving 5020
start saving 5021
start saving 5022
start saving 5023
start saving 5024
start saving 5025
start saving 5026
start saving 5027
start saving 5028
start saving 5029
start saving 5030
start saving 5031
start saving 5032
start saving 5033
start saving 5034
start saving 5035
start saving 5036
start saving 5037
start saving 5038
start saving 5039
start saving 5040
start saving 5041
start saving 5042
start saving 5043
start saving 5044
start saving 5045
start saving 5046
start saving 5047
start saving 5048
start saving 5049
start saving 5050
start saving 5051
start saving 5052
start saving 5053
start saving 5054
start saving 5055
start saving 5056
start saving 5057
start saving 5058
start saving 5059
start saving 5060
start saving 5061
start saving 5062
start saving 5063
start saving 5064
start saving 5065
start saving 5066
start savi

KeyboardInterrupt: 

### 7. Scrape full beer rating details

In [16]:
# Load the style table saved by the style-list scrape; only the display name
# and the numeric id of each style are used by the cells below.
styles = pd.read_csv("styles.csv", encoding="utf-8")
names, ids = styles["style"], styles["id"]

# Seed the set of known user profile URLs from a previous run, if one was
# saved; a missing file simply means we start with an empty set.
try:
    user_df = pd.read_csv("user_info.csv", encoding="utf-8")
except FileNotFoundError:
    user_urls = set()
else:
    user_urls = set(user_df["user_url"].tolist())


# Scrape, style by style, every beer's summary stats and all of its ratings.
#
# Inputs : style_<id>.csv (columns beer_ids, beer_links, ratings) in the working directory.
# Outputs: <data_root>/style<id>/beer_<beer_id>.csv       one row per rating of that beer
#          <data_root>/style<id>/style_<id>_summary.csv   one row per beer, rewritten after every beer
#          user_info.csv                                  every user profile URL seen so far
#
# The loop is resumable: on restart each style continues after the last beer
# recorded in its summary file.

# Root folder for the per-style output folders.
# NOTE(review): this used to be a hard-coded absolute path under C:\Users\yjin9\...;
# the working directory is used instead because the style_<id>.csv inputs are
# already read relative to it. Confirm the notebook is started from the old data
# folder if existing partial results should be picked up.
data_root = os.getcwd()

style_id=0

while style_id<len(ids):
    t00=time()
    style_name = "style_"+str(ids[style_id])+".csv"
    style_folder = os.path.join(data_root, "style"+str(ids[style_id]))
    os.makedirs(style_folder, exist_ok=True)
    df=pd.read_csv(style_name, encoding="utf-8")

    beer_ids = df["beer_ids"]
    beer_links=df["beer_links"]
    beer_ratings=df["ratings"]

    # Reload whatever was already scraped for this style so the run can resume.
    summary_file_path =os.path.join(style_folder,"style_"+str(ids[style_id])+"_summary"+".csv")
    try:
        beer_summary_df=pd.read_csv(summary_file_path,encoding="utf-8")
        beer_ids_2=beer_summary_df["beer_ids"].tolist()
        # Fixed: ranks used to be reloaded from the "beer_ids" column, which
        # overwrote every saved rank with the beer id on the next save.
        ranks=beer_summary_df["ranks"].tolist()
        reviews_nums=beer_summary_df["reviews_nums"].tolist()
        pDevs=beer_summary_df["pDevs"].tolist()
        bro_scores=beer_summary_df["bro_scores"].tolist()
        availabilities=beer_summary_df["availabilities"].tolist()
        ibus=beer_summary_df["ibus"].tolist()

    except FileNotFoundError:
        beer_ids_2=[]
        ranks=[]
        reviews_nums=[]
        pDevs=[]
        bro_scores=[]
        availabilities=[]
        ibus=[]

    # Start from the first beer that has no summary row yet
    beer_idx=len(beer_ids_2)
    while beer_idx<len(beer_ids):
        t0=time()
        beer_url = beer_links[beer_idx]
        beer_id = beer_ids[beer_idx]
        file_path =os.path.join(style_folder,"beer_"+str(beer_id)+".csv")

        print("Start beer {}".format(beer_idx))
        ## get beer stats summary from beer page
        r = requests.get(beer_url)
        soup = BeautifulSoup(r.content,"lxml")
        beer_stats=soup.find_all(id="item_stats")[0].find_all("dd")

        # The positional indices below depend on the page layout at scrape time.
        rank=beer_stats[0].contents[0][1:]  # drops the first character (presumably a leading "#" -- TODO confirm)
        reviews_num=beer_stats[1].contents[0].contents[0]
        pDev=beer_stats[3].contents[0].contents[0].strip()
        bro_score=beer_stats[4].contents[0].contents[-2].contents[0]

        beer_info = soup.find_all(id="info_box")[0].contents
        # When the expected slot holds a tag instead of a text node, calling
        # .strip() raises TypeError; record the field as missing in that case.
        try:
            availability=beer_info[37].strip()
        except TypeError:
            availability="N/A"
        # Same guard for the IBU slot, which was previously unprotected.
        try:
            ibu=beer_info[-7].strip()
        except TypeError:
            ibu="N/A"
        if ibu[-3:]!="IBU":
            ibu="N/A"

        beer_ids_2.append(beer_id)
        ranks.append(rank)
        reviews_nums.append(reviews_num)
        pDevs.append(pDev)
        bro_scores.append(bro_score)
        availabilities.append(availability)
        ibus.append(ibu)

        ## get beer comments (texts and ratings)
        # locale.atoi is used so counts with thousands separators parse under the configured locale
        rating_count=locale.atoi(str(beer_ratings[beer_idx]))
        if rating_count>0:

            user_ids=[]
            comments_txt=[]
            ba_scores=[]
            rDevs=[]
            looks=[]
            smells=[]
            tastes=[]
            feels=[]
            overalls=[]

            # Ratings are fetched 25 per page; with more than 25 the start
            # offset of the final page is read from the last pagination link.
            if rating_count>25:
                pager=soup.find_all(attrs={"style":"font-weight:bold;"})[0]
                last_page_link=pager.find_all(href=re.compile("/beer/profile/"))[-1]
                comment_end=int(last_page_link.get("href").split("=")[-1])
            else:
                comment_end = 0

            for comment_page in range(comment_end//25+1):
                comment_url =  beer_url+"?view=beer&sort=&start="+str(comment_page*25)
                r = requests.get(comment_url)
                soup = BeautifulSoup(r.content,"lxml")
                comments=soup.find_all(id="rating_fullview_container")
                for comment in comments:
                    user_id=comment.get("ba-user")
                    user_url=comment.find_all(class_="username")[0].get("href")
                    user_urls.add(user_url)

                    detail_rating=comment.find_all(id="rating_fullview_content_2")[0].contents

                    # Longer containers carry free text; keep only the plain text nodes of that slice
                    if len(detail_rating)>9:
                        comment_text="".join([str(item) for item in detail_rating[8:-6] if type(item)==bs4.element.NavigableString])
                    else:
                        comment_text="N/A"

                    ba_score=detail_rating[0].contents[0]
                    # When slot 3 has no single string, there is no rDev value
                    # and the per-dimension scores sit one slot earlier.
                    if detail_rating[3].string is None:
                        rDev="0%"
                        by_dim_idx=4
                    else:
                        rDev=detail_rating[3].contents[0]
                        by_dim_idx=5
                    try:
                        by_dim=detail_rating[by_dim_idx].contents[0].split("|")
                        look=by_dim[0].split(":")[-1].strip()
                        smell=by_dim[1].split(":")[-1].strip()
                        taste=by_dim[2].split(":")[-1].strip()
                        feel=by_dim[3].split(":")[-1].strip()
                        overall=by_dim[4].split(":")[-1].strip()
                    except IndexError:
                        # Rating without the five "|"-separated sub-scores
                        look="N/A"
                        smell="N/A"
                        taste="N/A"
                        feel="N/A"
                        overall="N/A"

                    user_ids.append(user_id)
                    comments_txt.append(comment_text)
                    ba_scores.append(ba_score)
                    rDevs.append(rDev)
                    looks.append(look)
                    smells.append(smell)
                    tastes.append(taste)
                    feels.append(feel)
                    overalls.append(overall)
                #print("-- Comment page {} done".format(comment_page))

            beer_df=pd.DataFrame({"user_id":pd.Series(user_ids),"comments":pd.Series(comments_txt),"ba_score":pd.Series(ba_scores),
                                  "rDev":pd.Series(rDevs),"look":pd.Series(looks),"smell":pd.Series(smells),"taste":pd.Series(tastes),
                                  "feel":pd.Series(feels),"overall":pd.Series(overalls)})
            beer_df.to_csv(file_path,encoding="utf-8")

            print("Finish beer {}".format(beer_idx))
            print("{} sec".format(round(time()-t0)))

            # Persist the growing set of user URLs after every beer so an interrupt loses little
            user_df=pd.DataFrame({"user_url":pd.Series(list(user_urls))})
            user_df.to_csv("user_info.csv",encoding="utf-8")
        else:
            print("Beer {} has 0 rating".format(beer_idx))

        beer_idx+=1

        ## TO DO beer_summary_df merge with df
        # Checkpoint: rewrite the style summary after every beer; its row count is the resume position
        beer_summary_df=pd.DataFrame({"beer_ids":pd.Series(beer_ids_2),"ranks":pd.Series(ranks),"reviews_nums":pd.Series(reviews_nums),"pDevs":pd.Series(pDevs),
                                      "bro_scores":pd.Series(bro_scores),"availabilities":pd.Series(availabilities),"ibus":pd.Series(ibus)})
        beer_summary_df.to_csv(summary_file_path,encoding="utf-8")

    print("All beers in style {} are done".format(ids[style_id]))
    # Check the accumulated rows directly: beer_summary_df may be stale (or not
    # yet defined) when the inner loop did not run for this style.
    assert len(beer_ids_2)==len(beer_ids)

    # Fixed: the operands were reversed, which produced a negative/zero duration
    minutes=round((time()-t00)/60)
    print("{} mins".format(minutes))
    style_id+=1
    


All beers in style 128 are done
0 mins
All beers in style 19 are done
0 mins
All beers in style 175 are done
0 mins
All beers in style 99 are done
0 mins
All beers in style 73 are done
0 mins
All beers in style 94 are done
0 mins
Start beer 27
Finish beer 27
104 sec
Start beer 28


KeyboardInterrupt: 

In [None]:
# Send a short e-mail notification (used to signal that a long scrape stopped).
#
# Credentials are taken from the environment when available so real values
# never have to be typed into the notebook; the redacted placeholders remain
# as fall-backs, so the cell behaves as before when the variables are unset.
me=os.environ.get("BEER_SCRAPER_EMAIL_FROM","xxxxxxxx")
you=os.environ.get("BEER_SCRAPER_EMAIL_TO","xxxxxxxx")
msg="Interrupt!"  # NOTE(review): raw text without Subject/From headers -- confirm it is delivered as intended
password=os.environ.get("BEER_SCRAPER_EMAIL_PASSWORD","xxxxxxxx")

# The context manager issues QUIT and closes the socket even when login or
# sendmail raises, which the explicit server.quit() at the end did not cover.
with smtplib.SMTP('smtp.gmail.com:587') as server:
    server.ehlo()
    server.starttls()
    server.login(me,password)
    server.sendmail(me, you, msg)

In [201]:
#### User Page

# beers
# beer karma

14998

[Rating explained](https://www.beeradvocate.com/community/threads/beeradvocate-ratings-explained.184726/)

- controversial beer: high pDev

- picky user: negative rDev

[How to review a beer](https://www.beeradvocate.com/community/threads/how-to-review-a-beer.241156/)