# Read all the datasets created in all algorithms

In [None]:
#Read these if you want to skip running the algorithms
#pandas is imported here as well, because this cell sits above the notebook's
#import cell and would otherwise fail with a NameError on a fresh kernel
import pandas as pd

#Loading the previously scraped Vinmonopolet dataset from the csv-file
df_vinmonopolet = pd.read_csv("wines_vinmonopolet.csv")

### This notebook contains:

1. Scraping of Vinmonopolet
2. Algorithm to find arbitrage opportunities, using the scraped dataset and the dataset from Vivino

# Scraping Vinmonopolet



The code in this part is created to scrape the website: www.vinmonopolet.no and use this information combined with the Vivino dataset to find arbitrage opportunities.<br>
It is intended to scrape wine in the premium sector, where premium wine is defined in this notebook as a wine with a price of 60£ or more.

### Problems during scraping:

- The primary problem is language, which introduces two issues: translated words and special characters (letters that are not part of the English alphabet).
- The second problem is that Vinmonopolet includes the producer in its whole wine name, so we were not able to separate the producer and wine name.

### Scraping solutions:
#### Language problem
- Created a .csv file with all translations from Norwegian to English, and applied the necessary string formatting in the algorithm. <br>

#### Special characters

- Replacing special characters using the Unicode table. Some of the characters on Vinmonopolet were implemented using Unicode, so we had to translate them to plain text. We did this with the `.replace` function, replacing each Unicode character with its plain-text equivalent.

## Steps of scraping:
#### Pre - Algorithm
1. Opening the webpage in a new window
2. Setup
    - Dictionary to store values
    - Variables to use in the algorithm
    - Default values in case we don't find information on the variables

#### Algorithm
3. Finding the name stored in a ‘div’-tag
4. Getting the year, stored in the name variable
    - Looping over the name
    - If we find a numeric value with length = 4, we set the year value to the numeric value
    - We continue to iterate as some wine names include two years, and it is always the last occurrence that is the correct input
5. Finding the category stored in a ‘div’-tag
6. Finding the country and region
    - Splitting the variable, checking whether we have both country and region or just country. The country is always included.
7. Finding the price stored in a ‘span’-tag
    - Changing the ‘,’ to a ‘.’
    - Replacing Unicode characters to plain text
    - Removing “Kr” (NOK)
    - Converting NOK to GBP
    - Changing the type to float
    - Keeping only two decimals
8. Finding the centilitres in a ‘span’-tag
9. Finding the item ID in a ‘div’-tag
10. Adding all the variables to the dictionary
11. Going to the next page, go to step 3, and repeat. If no next page exists, move to step 12.
12. Storing the dictionary to a dataframe
13. Translating the necessary variables from Norwegian words to English


#### Post - Algorithm
14. Saving the dataframe to a .csv file


In [None]:
#Importing all necessary packages
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import pandas as pd
import time
from difflib import SequenceMatcher


### Pre - algorithm

In [None]:
#Address of the Vinmonopolet search result page where the scraping starts
start_url = "https://www.vinmonopolet.no/search/?q=:relevance:price:750:price:1000:price:5000&searchType=product&currentPage=0"

#Launching Chrome in a new window and loading the start page in it
driver = webdriver.Chrome()
driver.get(start_url)

In [None]:
#Column names of the dataset, in the order they are stored in the dictionary
wine_columns = ("Name", "Year", "Price", "Cl", "Country", "Category", "Region", "ID")

#Dictionary that collects all wine values, one separate empty list per column
wines = {column: [] for column in wine_columns}

#Default values used when no information is found for a wine
year_nf = "Non-Vintage"
region_nf = "not found"

#Counter used for the first/last page checks, and the flag that keeps the while-loop running
tmp = 0
stop = True

## Algorithm
**BE AWARE:** This algorithm takes a long time to run because there are many pages to go through. We highly recommend using the data stored in wines_vinmonopolet.csv instead; it holds the output of this algorithm.

In [None]:
#While-loop, iterating all articles on Vinmonopolet.
#Uses `driver`, `wines`, `stop`, `tmp`, `region_nf` and `year_nf` from the cells above
#and fills the lists in `wines` with one entry per wine.

#NOK per GBP at the time of writing, used to convert the scraped prices to GBP
NOK_PER_GBP = 11.5

while stop:

    #Finding all items on the current page
    wine_list = driver.find_elements(By.XPATH,'//li[contains(@class,"product-item")]')
    tmp += len(wine_list)

    #Fetching every field once per page (instead of once per wine) to avoid
    #repeating the same browser queries for each item
    name_tags = driver.find_elements(By.XPATH, '//div[contains(@class, "product__name")]')
    category_tags = driver.find_elements(By.XPATH, '//div[contains(@class, "product__category-name")]')
    district_tags = driver.find_elements(By.XPATH, '//div[contains(@class, "product__district")]')
    price_tags = driver.find_elements(By.XPATH, '//span[contains(@class, "product__price")]')
    amount_tags = driver.find_elements(By.XPATH, '//span[contains(@class, "product__amount")]')
    code_tags = driver.find_elements(By.XPATH, '//div[contains(@class, "product__code")]')

    #Iterating through the wines
    for i in range(len(wine_list)):

        #Getting the name stored in 'div'-tag
        name = name_tags[i].text

        #Getting the year from the name, starting from the default value.
        #A word only counts as a year if it is numeric with exactly four characters.
        year = year_nf
        for word in name.split(" "):
            if word.isnumeric() and len(word) == 4:
                year = word
                #I do not break the loop here in case we have more years in the name (it does exist);
                #the last occurrence wins, and later non-numeric words no longer reset the year

        #Setting the category stored in a 'div'-tag
        category = category_tags[i].text.capitalize()

        #Getting both country and region, which are stored in one string (in a 'div'-tag), splitting by comma-delimiter
        country_region = district_tags[i].text.split(",")

        #The country is always the first part
        country = country_region[0].strip()

        #Setting the region if we have it, stripping the space that follows the comma
        #so the value can match the (stripped) words in the translation file
        region = ""
        if len(country_region) > 1:
            region = country_region[1].strip()

        #Getting price, and doing necessary formatting.
        price = price_tags[i].text
        price = price.replace(",", ".").replace(u"\xa0", u"").replace(u"\xf4", u"o").replace(u"\xf3",u"o").replace("Kr", "")
        price = float(price.replace(u" ", u"").strip())/NOK_PER_GBP #Converting NOK to GBP
        price = "{:.2f}".format(price) #Stored as text with two decimals

        #Getting the amount of litres in the bottle
        cl = amount_tags[i].text

        #Getting the ordering ID to make it easier for us to access the right wine at a later point (if we were to move on with this idea)
        _id = code_tags[i].text

        #Adding all the variables to the dictionary
        wines["Name"].append(name)
        wines["Cl"].append(cl)
        wines["Country"].append(country)
        wines["Category"].append(category)
        wines["ID"].append(_id)
        wines["Price"].append(price)
        wines["Year"].append(year)

        #Adding region if it is not empty, else we add the default value of "not found"
        if region == "":
            wines["Region"].append(region_nf)
        else:
            wines["Region"].append(region)

    #Checking whether there is a next_arrow
    next_arrows = driver.find_elements(By.XPATH,'//button[contains(@class,"pagination-button")]')
    if len(next_arrows) > 0:
        #If there is only one arrow, it is either the first or last page
        if len(next_arrows)==1:
            #Fewer than 25 wines seen so far is treated as "still on the first page";
            #otherwise the single arrow means we are at the last page and we are done
            if tmp < 25:
                next_arrow = next_arrows[0]
            else:
                break
        else:
            next_arrow = next_arrows[1]

        #Clicking the next_arrow
        if next_arrow.is_enabled():
            next_arrow.click()
            time.sleep(3) # Need to wait for some time to load the webpage
        else:
            stop = False
    else:
        stop = False
        

#Creating a DataFrame of the dataset
df = pd.DataFrame(wines)

#Translation

#Reading the self-made dictionary especially for this assignment
translations = pd.read_csv("translations.csv")

#Dropping any NA's that could be included
translations = translations.dropna()

#Creating a list of all words that need to be translated
translated_words = [word.strip() for word in list(translations.loc[:, "Norwegian"])]

#List of all the translation words, in the same order as translated_words
translation = list(translations.loc[:,"English"])

#Lookup table from Norwegian word to English word. The pairs are inserted in
#reverse order so that, if a Norwegian word occurs more than once, its first
#occurrence in the file is the one that is kept.
translation_lookup = dict(zip(reversed(translated_words), reversed(translation)))

#Translating the columns that need translation for all wines from Vinmonopolet.
#Values without an entry in the lookup table are left unchanged.
for column in ("Category", "Country", "Region"):
    df[column] = df[column].map(lambda value: translation_lookup.get(value, value))


### Post - Algorithm

In [None]:
#Name of the csv-file the finished dataset is written to
output_file = "wines_vinmonopolet.csv"

#Saving the dataframe (including its index) to that file
df.to_csv(output_file)
