# Data Collection

In [1]:
#importing libraries
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Getting Wine Links

The URLs below lead to the Vivino explore pages for each of the 6 types of wine: red, white, rosé, sparkling, dessert, and fortified. The average price filter was not touched and is set by default to 10-40 USD. The 'any rating' button, listed below the wine tabs on the left, was selected for each of these pages. **Because of the site's frequent updates, this code will not reproduce the same results every time!** Each scroll to the bottom of a page loads 25 more wines. With the target being 1200 wines, each link below is scrolled 10 times to leave room for the duplicate wines that may appear as the page scrolls further down.

In [2]:
# Explore-page URLs that are scrolled to collect links to individual wine pages.
urls = ['https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWKBMMVACAJUrFHI%3D',
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsjtWKgBACVMRRz',
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFtjtWKgBACVNxR0',
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsTtWKgBACVPRR1',
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFtztWKgBACVTxR4',
       'https://www.vivino.com/explore?e=eJzLLbI1VMvNzLM1NFDLTaywNTFQS660DQ1WSwYSLmoFQNn0NNuyxKLM1JLEHLX8JFu1fFu18pLoWFsjE7VioAwAqS8Upw%3D%3D']

TARGET_WINE_COUNT = 1200  # stop collecting once this many unique links are held
SCROLLS_PER_PAGE = 10     # scroll-to-bottom passes performed on each explore page
SCROLL_PAUSES = [x/10 for x in range(5,10)]  # 0.5-0.9 s; one is picked at random per scroll

wine_links = set()  # a set, so a wine that appears on several pages is stored once

driver = webdriver.Chrome()
try:
    # Maximizing once is enough; the window state persists across page loads.
    driver.maximize_window()

    for link in urls:
        # Do not load and scroll further pages once the target has been reached.
        if len(wine_links) >= TARGET_WINE_COUNT:
            break

        driver.get(link)

        # Keep scrolling to the bottom so the page loads more wines.
        for _ in range(SCROLLS_PER_PAGE):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.choice(SCROLL_PAUSES))

        # Anchors carrying the wine-link class; the last 5 go to social media pages.
        anchors = driver.find_elements(By.CSS_SELECTOR, 'a.anchor__anchor--2QZvA')[:-5]

        for anchor in anchors:
            if len(wine_links) >= TARGET_WINE_COUNT:
                break
            href = anchor.get_attribute('href')
            if href:  # get_attribute returns None when the attribute is missing
                wine_links.add(href)
except BaseException:
    # Close the browser on any failure (including a manual interrupt) so the
    # Chrome process is not leaked; on success the driver stays open and is
    # quit by the following cell.
    driver.quit()
    raise


In [3]:
# Close the browser session that was used to collect the wine links.
driver.quit()

In [4]:
# Sanity check: display how many unique wine links were collected.
len(wine_links)

1200

In [5]:
# Convert the set of links to a list; note that a set has no defined order,
# so the order of the resulting list is arbitrary.
wine_links = list(wine_links)

True

# Getting Content Information

Now that we have the hyperlinks to the wines, the code below will slowly scroll down the pages of each of these links to extract the data shown in the lists below.

In [6]:
# Start a new Chrome session for visiting the individual wine pages.
driver = webdriver.Chrome()

In [7]:
# One list per output column; the scraping loop appends one entry per wine page.
(
    winery,            # name of the winery
    wine_name,         # name of the wine
    wine_type,         # type of the wine
    wine_country,      # country that produced the wine
    average_rating,    # average rating of the wine on Vivino.com
    num_of_ratings,    # number of ratings the wine has on Vivino.com
    wine_price,        # price of the wine on Vivino.com
    reviews,           # textual reviews of the wine, joined into one string
    grapes,            # the grapes used to make the wine
    alcohol_content,   # the alcohol content of the wine
) = ([] for _ in range(10))

In [8]:
def _first_match(pattern, text):
    """Return the first capture of ``pattern`` in ``text``, or NaN if nothing matches."""
    found = re.findall(pattern, text)
    return found[0] if found else np.nan


def _line_or_nan(lines, position):
    """Return ``lines[position]``, or NaN when ``lines`` has too few entries."""
    return lines[position] if position < len(lines) else np.nan


def _element_text(driver, class_name):
    """Return the text of the first element with ``class_name``, or '' if there is none."""
    matches = driver.find_elements(By.CLASS_NAME, class_name)
    return matches[0].text if matches else ''


# Pause lengths (0.5-0.9 s) picked at random between scroll steps.
DETAIL_SCROLL_PAUSES = [x/10 for x in range(5,10)]

for wine in wine_links:
    driver.get(wine)

    # Scroll down in 1000px steps (1000..9000), slowly enough for the page
    # content to be captured.
    for offset in range(1000, 10000, 1000):
        driver.execute_script(f"window.scrollTo(0, {offset});")
        time.sleep(random.choice(DETAIL_SCROLL_PAUSES))

    # Header block: its first three lines are read as winery, wine name and wine type.
    content = _element_text(driver, 'container')
    header = content.split('\n') if content else []

    # Page body: source of the reviews, grapes and alcohol content (if available).
    text = _element_text(driver, 'inner-page')

    review_list = re.findall(r'\d\.\d\n(.*)\n.*\)\n', text)
    grape_list = re.findall(r'Grapes\n(.+)\nRegion', text)

    # Every value is computed before anything is appended, so a failure while
    # parsing one page can never leave the column lists with unequal lengths.
    # Fields that are missing from the page are recorded as NaN.
    row_winery = _line_or_nan(header, 0)
    row_name = _line_or_nan(header, 1)
    row_type = _line_or_nan(header, 2)
    row_country = _first_match(r'\n · \n(\w+\s?\w+)\n', content)
    row_rating = _first_match(r'(\d+\.\d+)\n\d+\sratings', content)
    row_num_ratings = _first_match(r'\d+\.\d+\n(\d+)\sratings', content)
    # Optional decimal part, so single-digit prices such as "$9" also match.
    row_price = _first_match(r'\$(\d+(?:\.\d+)?)\nPrice is', content)
    row_reviews = ','.join(review_list)
    row_grapes = ','.join(grape_list) if grape_list else np.nan
    # Optional decimal part, so integer percentages such as "14%" also match;
    # the separator is limited to '.' or ',' instead of any character.
    row_alcohol = _first_match(r'Alcohol content\n(\d+(?:[.,]\d+)?)%\n', text)

    winery.append(row_winery)
    wine_name.append(row_name)
    wine_type.append(row_type)
    wine_country.append(row_country)
    average_rating.append(row_rating)
    num_of_ratings.append(row_num_ratings)
    wine_price.append(row_price)
    reviews.append(row_reviews)
    grapes.append(row_grapes)
    alcohol_content.append(row_alcohol)

In [9]:
# Close the browser session that was used to visit the wine pages.
driver.quit()

In [10]:
# Combine the scraped column lists into a single DataFrame, one column per list.
wine_df = pd.DataFrame(dict(
    winery=winery,
    wine_name=wine_name,
    wine_type=wine_type,
    wine_country=wine_country,
    average_rating=average_rating,
    num_of_ratings=num_of_ratings,
    wine_price=wine_price,
    grapes=grapes,
    alcohol_content=alcohol_content,
    reviews=reviews,
))

In [11]:
# Write the scraped data to disk, leaving out the DataFrame's row index.
wine_df.to_csv('wine_data.csv', index=False)