# Use BeatifulSoup to scrape red wine data from Vivino using LCBO data

This notebook goes through iterative steps to connect to the Vivino webpage and then scrape salient data associated with all the red wine items based on data scraped from LCBO. The most important information is the name, rating and number of reviews. 

In [1]:
import re
import requests
import time
import random
from bs4 import BeautifulSoup
import pandas as pd
import csv

In [2]:
dfl = pd.read_csv('lcbo_redwine.csv')

In [3]:
dfl['search'] = dfl['name'] + ' ' + dfl['region']

In [4]:
len(list(dfl['search']))

6089

In [32]:
url = 'https://www.vivino.com/search/wines?q={kw}&start={page}'
prices_url = 'https://www.vivino.com/prices'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}

def get_wines(kw):
    with requests.session() as s:
        page = 1
        soup = BeautifulSoup(s.get(url.format(kw=kw, page=page), headers=headers).content, 'html.parser')
        params = [wc['data-vintage'] for wc in soup.select('.default-wine-card')]
        print(str(params[0]))
        
        title = soup.find('div', attrs={'class': 'default-wine-card vintage-price-id-'+str(params[0])})\
        .find('span', attrs={'class': 'bold'}).get_text().strip()
        
        region, country = soup.find('div', attrs={'class': 'default-wine-card vintage-price-id-'+str(params[0])}).\
        find('span', attrs={'class': 'text-block wine-card__region'}).get_text().strip().split('\n·\n')
        
        score = soup.find('div', attrs={'class': 'default-wine-card vintage-price-id-'+str(params[0])}).\
        find('div', attrs={'class': 'text-inline-block light average__number'}).get_text().strip()
        score = float(score.replace(',','.'))
        
        num_reviews = soup.find('div', attrs={'class': 'default-wine-card vintage-price-id-'+str(params[0])}).\
        find('div', attrs={'class': 'text-inline-block average__stars'}).get_text().strip()
        num_reviews = int(re.findall('\d*\ ',num_reviews)[0].strip())
        
        yield title, region, country, score, num_reviews
        
        time.sleep(random.randint(1,3))
            

#wines = ['Solaia 2009 Tuscany, Italy']
wines = list(missed['search'])[0:50]

total = len(wines)
i=1
with open('missed_redwine_1.csv', 'w', encoding='utf-8', newline='') as csvfile:
    bottle_writer = csv.writer(csvfile)
    bottle_writer.writerow(['title','region','country','score','num_reviews']) 
    for wine in wines:
        try:
            bottle = list(*get_wines(wine))
        except:
            bottle = ['missed','missed','missed',0,0]
        bottle_writer.writerow(bottle)
        print('Finished bottle ' + str(i) + ' of ' + str(total))
        i += 1

110487056
Finished bottle 1 of 50
4235548
Finished bottle 2 of 50
1469826
Finished bottle 3 of 50
1520027
Finished bottle 4 of 50
1472069
Finished bottle 5 of 50
2087473
Finished bottle 6 of 50
1773078
Finished bottle 7 of 50
1629830
Finished bottle 8 of 50
1565338
Finished bottle 9 of 50
4943495
Finished bottle 10 of 50
1629830
Finished bottle 11 of 50
1629830
Finished bottle 12 of 50
158768842
Finished bottle 13 of 50
1469826
Finished bottle 14 of 50
1629830
Finished bottle 15 of 50
2187424
Finished bottle 16 of 50
4873858
Finished bottle 17 of 50
1487425
Finished bottle 18 of 50
67002882
Finished bottle 19 of 50
4052882
Finished bottle 20 of 50
3255632
Finished bottle 21 of 50
1469826
Finished bottle 22 of 50
1473613
Finished bottle 23 of 50
1516734
Finished bottle 24 of 50
1558662
Finished bottle 25 of 50
160101819
Finished bottle 26 of 50
9817832
Finished bottle 27 of 50
140383230
Finished bottle 28 of 50
140383230
Finished bottle 29 of 50
2587741
Finished bottle 30 of 50
24581102

In [5]:
dfv = pd.read_csv('vivino_lcbomatch_redwine.csv')

In [6]:
dfv['title'].isna().value_counts()

True     3760
False    2329
Name: title, dtype: int64

In [7]:
missed = dfl[dfv['title'].isna()]

In [8]:
missed

Unnamed: 0,name,region,size-mL,container,price,critic,critic_score,search
50,Château Troplong Mondot 2010,"Bordeaux, France",3000,bottle,1595.00,RP,98.0,"Château Troplong Mondot 2010 Bordeaux, France"
51,Macán,"Rioja, Spain",6000,bottle,1589.00,none,0.0,"Macán Rioja, Spain"
52,Solaia 2005,"Tuscany, Italy",1500,bottle,1584.00,WS,97.0,"Solaia 2005 Tuscany, Italy"
53,Único 2006,"Ribera del Duero, Spain",1500,bottle,1580.00,RP,98.0,"Único 2006 Ribera del Duero, Spain"
54,Pingus 2011,"Ribera del Duero, Spain",750,bottle,1575.00,RP,95.0,"Pingus 2011 Ribera del Duero, Spain"
...,...,...,...,...,...,...,...,...
6055,Cuvee Mythique Halloween Edition Pays Doc*,France,750,bottle,8.15,none,0.0,Cuvee Mythique Halloween Edition Pays Doc* France
6056,Cupcake Black Forest Decadent Red,"California, USA",750,bottle,8.15,none,0.0,"Cupcake Black Forest Decadent Red California, USA"
6057,Citra Sangiovese Terre Di Chieti IGP,"Abruzzo, Italy",750,bottle,8.15,none,0.0,"Citra Sangiovese Terre Di Chieti IGP Abruzzo, ..."
6058,Vinetti de Fiorini Chianti Superiore DOCG,"Tuscany, Italy",750,bottle,8.15,none,0.0,Vinetti de Fiorini Chianti Superiore DOCG Tusc...


In [14]:
wines = list(missed['search'])[13]

In [15]:
wines

'Solaia 2009 Tuscany, Italy'

In [33]:
dfm2 = pd.read_csv('missed_redwine_1.csv')
dfm2


Unnamed: 0,title,region,country,score,num_reviews
0,Château Troplong Mondot Un Amour de Mondot,Saint-Émilion,France,4.3,29
1,Benjamin de Rothschild - Vega Sicilia Macán Cl...,Rioja,Spain,4.3,9621
2,Antinori Tenuta Tignanello 'Solaia',Toscana,Italy,4.6,24845
3,Vega Sicilia Unico,Ribera del Duero,Spain,4.7,33882
4,Dominio de Pingus Flor de Pingus,Ribera del Duero,Spain,4.5,23145
5,Álvaro Palacios L'Ermita Velles Vinyes Priorat,Priorat,Spain,4.6,1730
6,Domaine de La Romanée-Conti La Tâche Grand Cru,La Tâche Grand Cru,France,4.7,7652
7,A. de Luze Château Latour Bordeaux,Bordeaux,France,4.4,18
8,Domaine de La Romanée-Conti Romanée-Saint-Viva...,Romanée-Saint-Vivant Grand Cru,France,4.7,4893
9,Château Canon Montségur Bordeaux,Bordeaux,France,3.3,77


In [23]:
tmp = ('missed','missed','missed','missed','missed')

In [24]:
list(tmp)

['missed', 'missed', 'missed', 'missed', 'missed']

In [25]:
*tmp

SyntaxError: can't use starred expression here (<ipython-input-25-ca40ad2e4bf3>, line 1)

In [26]:
*list(tmp)

SyntaxError: can't use starred expression here (<ipython-input-26-5b1e469ee5e3>, line 1)

In [27]:
print(*tmp)

missed missed missed missed missed


In [28]:
print(*list(tmp))

missed missed missed missed missed
