# Data Collection

In [1]:
#importing libraries
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
%%capture
from tqdm.notebook import tqdm

# Getting Wine Links

The URLs below point to the Vivino explore pages for each of the 6 types of wine: red, white, sparkling, rosé, dessert, and fortified. The average-price filter was not touched and is set by default to 10-40 USD. The 'any rating' button, listed below the wine tabs on the left, was selected for each of these. **Because of the site's frequent updates, this code will not reproduce the same results every time!** Each 'page' (one scroll to the bottom) loads 25 wines. With the target being 1800 wines (300 per type), the 32 scrolls performed for each link below leave room for the duplicate wines that may appear as the page scrolls further down.

In [3]:
#These are the urls I will use to find wines.
urls = ['https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWKBMMVACAJUrFHI%3D',#Red
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsjtWKgBACVMRRz',#White
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFtjtWKgBACVNxR0',#Sparkling
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsTtWKgBACVPRR1',#Rosé
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFtztWKgBACVTxR4',#Dessert
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsjE7VioAwAqS8Upw%3D%3D']#Fortified
wine_links = set() #this ensures there is no repeats in links

WINES_PER_TYPE = 300     #cap on the number of links kept for each wine type
SCROLLS_PER_TYPE = 32    #how many times each explore page is scrolled to the bottom
FOOTER_LINKS = 5         #trailing anchors that go to social media pages, not to wines

driver = webdriver.Chrome()
driver.maximize_window() #only needs to happen once, not once per url

amount = WINES_PER_TYPE  #running cap: total number of links allowed once the current type is done
for link in tqdm(urls):
    driver.get(link)

    #this will continue scrolling the page to get more wines
    for _ in range(SCROLLS_PER_TYPE):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.choice([x/10 for x in range(5,10)]))

    #this will get the hyperlinks to all of the wines
    #(find_elements(By...) replaces find_elements_by_css_selector, which Selenium 4 removed)
    wines = driver.find_elements(By.CSS_SELECTOR, 'a.anchor__anchor--2QZvA')[:-FOOTER_LINKS]

    for wine in wines:
        #stop once this type has filled its share (6 types x 300 = 1800 links overall)
        if len(wine_links) >= amount:
            break
        href = wine.get_attribute('href')
        if href: #skip anchors without a target so None never ends up in the set
            wine_links.add(href)
    amount += WINES_PER_TYPE

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [4]:
#turn the de-duplicated set into a list (same elements, set iteration order)
wine_links = [*wine_links]

In [5]:
#there should be 1800: 6 wine types x 300 links each
#(it can be lower if a type yields fewer than 300 unique links)
len(wine_links)

1800

In [6]:
#the link collection is done, so close the browser session
driver.quit()

In [7]:
#preview only the first few links instead of dumping the entire list into the output
wine_links[:10]

['https://www.vivino.com/el-enemigo-chardonnay/w/1272950?year=2018&price_id=23967867',
 'https://www.vivino.com/minuty-m-rose/w/1497082?year=2020&price_id=24621776',
 'https://www.vivino.com/santi-infinito-rose/w/1556087?year=2018&price_id=21502311',
 'https://www.vivino.com/jansz-premium-rose/w/96124?price_id=2302643',
 'https://www.vivino.com/balbach-riesling/w/5365330?year=2018&price_id=22219883',
 'https://www.vivino.com/fr-domaine-de-terrebrune-terroir-du-trias-delille-vigneron-bandol-rose/w/5034472?year=2015&price_id=23898167',
 'https://www.vivino.com/domaine-de-durban-muscat-de-beaumes-de-venise/w/1108201?year=2013&price_id=9964571',
 'https://www.vivino.com/chateau-de-rayne-vigneau-clos-l-abeilley-sauternes/w/1170936?year=2013&price_id=17939101',
 'https://www.vivino.com/ermitage-du-pic-st-loup-rose/w/2232338?year=2018&price_id=21168659',
 'https://www.vivino.com/dancing-crow-vineyards-rose/w/7198985?year=2019&price_id=21505185',
 'https://www.vivino.com/hampton-water-rose/w/6

In [8]:
#persist the collected links to disk; the context manager closes the file even if dump() raises
with open("wine_links.pickle","wb") as pickle_out:
    pickle.dump(wine_links, pickle_out)

# Getting Content Information

Now that we have the hyperlinks to the wines, the code below will slowly scroll down the pages of each of these links to extract the data shown in the lists below.

In [9]:
#start a new Chrome session for the per-wine scraping (the previous session was quit above)
driver = webdriver.Chrome()

In [10]:
#lists for data -- one independent, initially empty list per scraped field;
#they are filled by the scraping loop that follows
winery, wine_name, wine_type, wine_country = [], [], [], []   #winery name, wine name, wine type, producing country
average_rating, num_of_ratings, wine_price = [], [], []       #average rating, number of ratings and price on Vivino.com
review1, review2, review3 = [], [], []                        #reviews 1, 2 and 3 of the wine
grapes, alcohol_content = [], []                              #grapes used to make the wine, alcohol content of the wine

In [11]:
#Vivino's generated CSS class names for the page sections scraped below
REVIEW_CARD_CLASS = 'communityReviewCard__reviewCard--2ITkC'
WINE_FACTS_CLASS = 'wineFacts__wineFacts--2Ih8B'
REVIEWS_PER_WINE = 3


def _first_match(pattern, text):
    """Return the first capture of `pattern` in `text`, or NaN when nothing matches."""
    found = re.findall(pattern, text)
    return found[0] if found else np.nan


def _polite_pause():
    """Sleep for a randomly chosen 1.0-1.9 seconds."""
    time.sleep(random.choice([x/10 for x in range(10,20)]))


def _read_review_bodies():
    """Return the second text line of the first review cards currently on the page.

    Raises IndexError while a card has not rendered its second line yet, which
    makes the caller scroll further and try again.
    """
    cards = driver.find_elements(By.CLASS_NAME, REVIEW_CARD_CLASS)[:REVIEWS_PER_WINE]
    return [card.text.split('\n')[1] for card in cards]


def _read_wine_facts():
    """Return the text of the wine-facts section; raises while it is not on the page."""
    return driver.find_element(By.CLASS_NAME, WINE_FACTS_CLASS).text


def _scroll_until(read, empty, is_complete=None, step=200, limit=10000):
    """Call `read()` repeatedly, scrolling `step` pixels further down after every miss.

    read:        zero-argument callable that scrapes the current page.
    empty:       value returned when `read()` never succeeded.
    is_complete: optional predicate on the reading; scrolling continues until it
                 holds (default: the first successful reading is accepted).
    Returns the last successful reading, so a partial result survives when the
    page never satisfies `is_complete` before `limit` pixels are reached.
    """
    result = empty
    for offset in range(step, limit, step):
        try:
            result = read()
        except Exception:
            #section not rendered yet (or it changed while being read): scroll and retry.
            #Exception, not a bare except, so Ctrl-C / kernel interrupt still stops the loop.
            pass
        else:
            if is_complete is None or is_complete(result):
                break
        driver.execute_script(f"window.scrollTo(0, {offset});")
    return result


#Every list gets exactly one entry per wine (NaN or '' when a value is missing),
#so all lists stay aligned with wine_links.
for wine in tqdm(wine_links):
    driver.get(wine)

    #this will capture the information for the following lists below:
    containers = driver.find_elements(By.CLASS_NAME, 'container')
    content = containers[0].text if containers else ''

    #the first three lines of the container text are winery, wine name and wine type
    header = (content.split('\n') + ['', '', ''])[:3]
    winery.append(header[0] or np.nan)
    wine_name.append(header[1] or np.nan)
    wine_type.append(header[2] or np.nan)
    wine_country.append(_first_match(r'\n · \n(\w+\s?\w+)\n', content))
    average_rating.append(_first_match(r'(\d+\.\d+)\n\d+\sratings', content))
    num_of_ratings.append(_first_match(r'\d+\.\d+\n(\d+)\sratings', content))
    #the decimal part is optional so single-digit prices match as well
    wine_price.append(_first_match(r'\$(\d+(?:\.\d+)?)\nPrice is', content))

    #this will scroll down the page until three reviews can be read
    bodies = _scroll_until(_read_review_bodies, empty=[],
                           is_complete=lambda found: len(found) == REVIEWS_PER_WINE)
    #missing reviews become '' (not NaN) so they can still be joined as strings later
    bodies = bodies + [''] * (REVIEWS_PER_WINE - len(bodies))
    review1.append(bodies[0])
    review2.append(bodies[1])
    review3.append(bodies[2])
    _polite_pause()

    #this will scroll down the page until the grape and alcohol content section can be read
    facts = _scroll_until(_read_wine_facts, empty='')
    grape = re.findall(r'Grapes\n(.+)\nRegion', facts)
    grapes.append(','.join(grape) if grape else np.nan)
    #accepts whole numbers ("9%") and decimals written with a point or a comma
    alcohol_content.append(_first_match(r'Alcohol content\n(\d+(?:[.,]\d+)?)%\n', facts))
    _polite_pause()
    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1800.0), HTML(value='')))




In [16]:
#combined reviews: the three review texts of each wine joined with commas
reviews = [review1[i] + ',' + review2[i] + ',' + review3[i] for i in range(len(review1))]

In [17]:
#scraping is finished, so close the browser session
driver.quit()

In [18]:
#sanity check: expected to equal the number of wine links
len(reviews)

1800

In [19]:
#sanity check: expected to equal the number of wine links (and len(reviews))
len(winery)

1800

In [20]:
#putting all of this into a dataframe
wine_df = pd.DataFrame({'winery': winery, 'wine_name': wine_name, 
                       'wine_type': wine_type, 'wine_country': wine_country,
                       'average_rating': average_rating, 'num_of_ratings': num_of_ratings,
                       'wine_price': wine_price, 'grapes': grapes, 'alcohol_content': alcohol_content, 'reviews': reviews})

In [21]:
#write the table to disk without the index column
wine_df.to_csv(path_or_buf='wine_info.csv', index=False)