# 1. Preparation

## Steps

**1. Preparation: Virtual environment and packages**

**1.1 install web browser driver**   
- install one of the following according to your web browser:   
    selenium driver (firefox): https://github.com/mozilla/geckodriver/releases   
    selenium driver (Chrome): https://sites.google.com/a/chromium.org/chromedriver/downloads
- extract file
- make it executable (Linux)
	`chmod +x geckodriver`
- move it to appropriate location
	`sudo mv geckodriver (appropriate location)`   
	/usr/bin   
	/usr/local/bin
- remove file and empty folders (in download folder)

**1.2 create virtual environment**
- install/update 'pip': `python3 -m pip install --user -U pip`
- install 'virtualenv' package: `python3 -m pip install --user -U virtualenv`
- create folder
- open folder in terminal
- create virtual environment: `virtualenv (environment_name)`
- activate virtual environment (Linux): `source (environment_name)/bin/activate`

**1.3 install needed packages**
- scrapers: `python3 -m pip install -U bs4 selenium requests lxml lightrdf`
- analysis: `python3 -m pip install -U jupyter matplotlib numpy pandas scipy scikit-learn pystan`
- nlp: `python3 -m pip install -U spacy nltk`
- file control: `python3 -m pip install -U send2trash chardet`
- already installed (Python standard library, no `pip install` needed): `json`, `itertools`, `os`, `shutil`, `re`, `glob`

**1.4 save requirements.txt:** `pip freeze > requirements.txt`

---

# 2. Web-scrapping

## Steps

**2.1 Load modules**

In [None]:
from selenium import webdriver as wd
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.common import exceptions
import time

**2.2 Design scraper**
- Define function `handle_stale` to handle `StaleElementReferenceException`
- Define function `handle_noclick` to handle `ElementNotInteractableException` and `NoSuchElementException`
- The main scraper is built as function **`bulk_download`**

In [None]:
############################################
# handle StaleElementReferenceException
############################################
def handle_stale(b, e, cn, i):
    """Select option number ``i`` of a dropdown, retrying once if it is stale.

    Parameters
    ----------
    b : webdriver used to look the dropdown up again.
    e : dropdown element located earlier.
    cn : class name used to find the dropdown again.
    i : index of the option to select.
    """
    try:
        dropdown = Select(e)
        dropdown.select_by_index(i)
    except exceptions.StaleElementReferenceException:
        # the earlier reference is no longer valid: locate the element again
        refreshed = b.find_element_by_class_name(cn)
        Select(refreshed).select_by_index(i)

In [None]:
############################################
# handle ElementNotInteractableException
############################################
def handle_noclick(b, xp, id_):
    """Click the element at xpath ``xp`` and then the element with id ``id_``.

    Missing or non-interactable elements are ignored on purpose, so the
    caller can probe positions that may not exist.

    Parameters
    ----------
    b : webdriver to search in.
    xp : xpath of the first element to click.
    id_ : id of the second element to click.
    """
    try:
        target = b.find_element_by_xpath(xp)
        target.click()
        button = b.find_element_by_id(id_)
        button.click()
    except (exceptions.ElementNotInteractableException,
            exceptions.NoSuchElementException):
        # best effort: nothing to click here
        pass
    

In [None]:
############################################
# bulk download
############################################
def bulk_download(webpage, save_dir, driver='Firefox',
                  file_type="application/csv,text/csv,text/comma-separated-values"):
    """Walk the date picker on ``webpage`` and trigger every available download.

    Parameters
    ----------
    webpage : URL of the page that holds the historical date picker.
    save_dir : folder the browser is configured to save files into.
    driver : 'Firefox' or 'Chrome'; any other value raises ``SystemExit``.
    file_type : comma-separated MIME types Firefox saves without asking
        (only used by the Firefox profile).

    The browser is always closed, even when a step fails half-way.
    """
    # defines profile and browser
    if driver == 'Firefox':
        # profile for autosaving (Firefox)
        profile = wd.FirefoxProfile()
        profile.set_preference("browser.download.folderList", 2)
        profile.set_preference("browser.download.manager.showWhenStarting", False)
        profile.set_preference("browser.download.dir", save_dir)
        profile.set_preference("browser.helperApps.neverAsk.saveToDisk", file_type)

        # start webdriver
        browser = wd.Firefox(profile)

    elif driver == 'Chrome':
        # WORK IN PROGRESS
        # profile for autosaving (Chrome)
        profile = wd.ChromeOptions()
        prefs = {'download.prompt_for_download': False,
                 'safebrowsing.enabled': False,
                 'safebrowsing.disable_download_protection': True,
                 "profile.default_content_settings.popups": 0,
                 "download.default_directory": save_dir,
                 'download.directory_upgrade': True,}
        profile.add_experimental_option('prefs', prefs)

        # start webdriver; the options must be passed by keyword, because
        # the first positional argument is not the options object in the
        # selenium releases that still provide find_element_by_*
        browser = wd.Chrome(options=profile)

    else:
        raise SystemExit

    try:
        # get webpage
        browser.get(webpage)
        time.sleep(1)

        # accept cookies
        browser.find_element_by_class_name("check").click()

        # open dropdown historical
        browser.execute_script("window.scrollTo(0,7500)")
        browser.find_element_by_xpath("/html/body/div[2]/main/div[8]/div[1]").click()

        # select years, months and days
        # element: year
        yEl = browser.find_element_by_class_name("ui-datepicker-year")
        yOp = yEl.find_elements_by_tag_name("option")

        for y in range(len(yOp)):
            year = 2017 + y  # only used in the progress message
            handle_stale(b=browser, e=yEl, cn="ui-datepicker-year", i=y)

            # element: month
            mEl = browser.find_element_by_class_name("ui-datepicker-month")
            mOp = mEl.find_elements_by_tag_name("option")

            for m in range(len(mOp)):
                if y == 0:
                    # first listed year (2017) starts in June: option 0 is June
                    month = m + 6
                else:
                    month = m + 1

                handle_stale(b=browser, e=mEl, cn="ui-datepicker-month", i=m)

                # element: day of the week
                main_xpath = "/html/body/div[2]/main/div[8]/div[2]/div/div/div/table/tbody/"
                calButton = "download_calendarButton"

                # the calendar shows up to six week rows
                for w in range(6):
                    # column 5 of each row is the active day (per the original note)
                    w_day = "tr[" + str(w + 1) + "]/td[5]"
                    w_xpath = main_xpath + w_day
                    handle_noclick(b=browser, xp=w_xpath, id_=calButton)

                    # check
                    print("donwload corresponds to: %s/%s, week: %s" % (year, month, w) )
    finally:
        # release the browser process whatever happened above
        browser.quit()

**2.3 RUN**   
- We are happy to run the function to download all raw files.

In [None]:
# change working path
import sys  # sys.path is used on the next line

sys.path.append('/home/jriveraespejo/Desktop/project_europa/notebooks')
# root folder of the project data; the cells below append sub-folders to it
file_data = '/home/jriveraespejo/Desktop/project_europa/data/'

In [None]:
# bulk download
# download all ECB files into raw folder
web1 = "https://www.ecb.europa.eu/mopo/implement/app/html/index.en.html#cspp"
save1 = file_data + 'raw'
# MIME types the browser should save without prompting
filet = "application/csv,text/csv,text/comma-separated-values"
bulk_download(webpage=web1, save_dir=save1, file_type=filet)

# 2. Web-scrapping (Alternative solution)

## Short Description

This is considerably faster than the `selenium` approach. The ECB publishes each CSPP holdings csv file at a URL that follows a fixed pattern, so the links can be generated directly; each file is then fetched with the `requests` module and saved into the target directory.

In [1]:
import requests
import pandas as pd
from datetime import timedelta, date

**2.1 Acquire all Friday dates of filing**
- Collect every Friday from `2017-06-23` up to (but not including) `2020-03-29`, the period covered by the old file-naming scheme. The ECB publishes its purchase information on Fridays, so every Friday in that period is added to a list.

In [2]:
def daterange(start_date, end_date):
    """Yield each day from ``start_date`` (inclusive) to ``end_date`` (exclusive)."""
    total_days = int((end_date - start_date).days)
    yield from (start_date + timedelta(offset) for offset in range(total_days))

# scan window; daterange stops one day before end_date
start_date = date(2017, 6, 23)
end_date = date(2020, 3, 29)

# keep only the Fridays (weekday() == 4), formatted as YYYYMMDD
date_list = [
    day.strftime("%Y%m%d")
    for day in daterange(start_date, end_date)
    if day.weekday() == 4
]

**2.2 Prepare URL list**
- We first build a list `CSV_URL` holding the links that point directly to each csv document. In the next step we declare the `directory` in which those csv files are saved.

In [3]:
## build the list CSV_URL: one download link per date in date_list
url_source = 'https://www.ecb.europa.eu/mopo/pdf/'
url_category = 'CSPPholdings_'
url_file_format = '.csv'

# link = source + category + date + file format
CSV_URL = [
    "".join((url_source, url_category, stamp, url_file_format))
    for stamp in date_list
]

**2.3 First time scrapping**
- Web scraping part: loop over the links and download each file with `requests.get`.

In [4]:
## save option --> change it to your directory!!
import io  # io.StringIO wraps the downloaded text for pd.read_csv

directory = r'/Users/jingpuchen/Desktop/KU Leuven/Semester4/Modern Data Analytics/project/web_scraping/csv/'

# CSV_URL and date_list were built in lockstep, so they can be walked together
for url, stamp in zip(CSV_URL, date_list):
    resp = requests.get(url)  # get access to csv file
    if not resp.ok:
        # an error page is not a csv file: report it and skip this date
        print('skipped %s (HTTP %s)' % (url, resp.status_code))
        continue
    df = pd.read_csv(io.StringIO(resp.text))  # parse the downloaded text
    # NOTE: rows with missing values are kept as they are
    df.to_csv(directory + stamp + '.csv',
              index=False,
              header=True)  # column names are written as the first row

**2.4 Second time scrapping**
- Why scrape a second time? Since 2020-03-30 the ECB has used a new file-naming format, so links built with the old naming pattern no longer return csv files and the parser fails. Repeating the scraping step with the new pattern is not elegant, but it works. Also note that no file was published on two Fridays: `20201225` and `20210101`.

In [13]:
# scan window for the new naming format; daterange stops one day before end_date_1
start_date_1 = date(2020, 3, 30)
end_date_1 = date(2021, 4, 10)

# Fridays (weekday() == 4) as YYYYMMDD, leaving out the two excluded dates
date_list_1 = [
    stamp
    for stamp in (
        day.strftime("%Y%m%d")
        for day in daterange(start_date_1, end_date_1)
        if day.weekday() == 4
    )
    if stamp not in ('20201225','20210101')
]

In [14]:
## build the list CSV_URL_1: one download link per date in date_list_1
url_source_1 = 'https://www.ecb.europa.eu/mopo/pdf/'
url_category_1 = 'CSPP_PEPP_corporate_bond_holdings_'
url_file_format_1 = '.csv'

# link = source + category + date + file format
CSV_URL_1 = [
    "".join((url_source_1, url_category_1, stamp, url_file_format_1))
    for stamp in date_list_1
]

In [15]:
## save option
import io  # io.StringIO wraps the downloaded text for pd.read_csv

directory = r'/Users/jingpuchen/Desktop/KU Leuven/Semester4/Modern Data Analytics/project/web_scraping/csv/'

# CSV_URL_1 and date_list_1 were built in lockstep, so they can be walked together
for url_1, stamp_1 in zip(CSV_URL_1, date_list_1):
    resp_1 = requests.get(url_1)  # get access to csv file
    if not resp_1.ok:
        # an error page is not a csv file: report it and skip this date
        print('skipped %s (HTTP %s)' % (url_1, resp_1.status_code))
        continue
    df_1 = pd.read_csv(io.StringIO(resp_1.text))  # parse the downloaded text
    df_1.to_csv(directory + stamp_1 + '.csv',
                index=False,
                header=True)  # column names are written as the first row

So we are done with downloading all csv files up to the last release, from `2021-04-09`.

---

# 3. Append 198 `.csv` items including dates

## Steps

**3.1 Load modules**

In [None]:
import os, glob, re, chardet
import pandas as pd
from statistics import mode

**3.2 Design merge function**
- Define function `clean_weird` to normalise issuer names (lower case, accents and punctuation removed, optionally without legal designations)
- Define function `merge_csv` to perform concatenation

In [None]:
############################################
# clean_weird function
############################################
def clean_weird(vector_dirty, extra=False):
    """Normalise a pandas Series of names.

    Steps, in this order (the order matters): strip and lower-case, drop a
    trailing ", city" part, replace accented letters, optionally remove
    company designations at the end of the name, then turn hyphens into
    spaces, collapse repeated whitespace and remove remaining punctuation.

    Parameters
    ----------
    vector_dirty : pandas Series of strings.
    extra : when ``True``, also remove company designations
        (sa, nv, ag, plc, gmbh, ...) found at the end of the name.

    Returns
    -------
    pandas Series with the cleaned names.
    """
    # strip and to lower
    cleaned = vector_dirty.str.strip().str.lower()

    # city names at the end
    cleaned = cleaned.str.replace(r'(,\s\w+(\s\w+)?)$', '', regex=True)

    # remove weird symbols: accented letter -> plain letter
    accent_pairs = (
        (r'á', 'a'), (r'ã', 'a'), (r'ä', 'a'),
        (r'é', 'e'), (r'ë', 'e'), (r'É', 'E'),
        (r'í', 'i'), (r'ó', 'o'), (r'ö', 'o'),
        (r'ü', 'u'), (r'ñ', 'n'),
    )
    for accented, plain in accent_pairs:
        cleaned = cleaned.str.replace(accented, plain, regex=True)

    # remove company designations
    # see:
    # https://www.corporateinformation.com/Company-Extensions-Security-Identifiers.aspx
    # https://www.nathantrust.com/insights/comprehensive-guide-to-a-designated-activity-company-dac
    if extra == True:
        # NOTE(review): the patterns are anchored at the end of the string
        # but not at a word boundary, so names that merely end in these
        # letters may be shortened too - confirm this is acceptable.
        designation_patterns = (
            # combos: as,sl,scs,sa,sac,sau,sas,spa,sanv, etc. (with and without intermediate . or /)
            r'(a\W?s\W?|s\W?((a|e|p|c|l)\W?)?((a|s|u)\W?)?\W?(n\W?v\W?)?(r\W?l\W?)?)$',
            # combos: nv,nvsa,bv,oyj,ltd, etc. (with and without intermediate . or /)
            r'((n|b)\W?v\W{0,2}?(s\W?a\W?)?|o\W?y\W?j\W?|l\W?t\W?d\W?)$',
            # combos: cvba,ag,plc,dac, etc. (with and without intermediate . or /)
            r'(c\W?v\W?b\W?a\W?|a\W?g\W?|p\W?l\W?c\W?|d\W?a\W?c\W?)$',
            # combos: ,(g)mbh, kgaa, etc. (with and without intermediate . or /)
            r'((g\W?)?m\W?b\W?h\W?|k\W?g\W?a\W?a\W?)$',
            # specifics
            r'(\W(sa)\s(\wt)\W(expl)\W(p)\W(g)\W(cl)\W)$',
            r'(\W(soc)\W(an)\W(d)\W(gest)\W(st)\W(d)\W(sec)\W)$',
        )
        for pattern in designation_patterns:
            cleaned = cleaned.str.replace(pattern, '', regex=True)

    # hyphens to spaces, collapse whitespace, drop remaining punctuation
    final_steps = ((r'-', ' '), (r'\s{2,}', ' '), (r'[^\w\s]', ''))
    for pattern, replacement in final_steps:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    return cleaned.str.strip()

In [None]:
############################################
# function merge_csv
############################################
def merge_csv(save_dir, file_dir, file_name):
    """Concatenate every ``*.csv`` file of ``file_dir`` into one cleaned file.

    Each file is reduced to its first five columns, rows without issuer
    name and "dummy" rows are removed, and the digits found in the file
    name are stored in a ``file_date`` column. The merged table gets two
    cleaned name columns (``Name1``, ``Name2``) and is written to
    ``save_dir + '1_' + file_name + '.csv'``.

    Parameters
    ----------
    save_dir : folder for the merged file.
    file_dir : folder holding the downloaded csv files.
    file_name : name of the merged file, without the '1_' prefix and
        without extension.

    Raises
    ------
    FileNotFoundError : if ``file_dir`` holds no csv file.
    ValueError : if a file name contains no digits to use as date.

    Note: the working directory is changed to ``file_dir`` and stays
    changed after the call.
    """
    # location: the file names below are relative to file_dir
    os.chdir(file_dir)

    # list files
    all_files = glob.glob("*.csv")
    if not all_files:
        raise FileNotFoundError('no .csv files found in ' + str(file_dir))

    # regular expression for date
    regex = re.compile(r'\d+')

    # iterating through data
    all_df = []  # to concatenate all data
    encode = []  # to save all encodings

    for file in all_files:
        # check encoding of files: open first 10'000 bytes
        with open(file, 'rb') as rawdata:
            encoding = chardet.detect(rawdata.read(10000))

        encode.append(encoding['encoding'])  # to use in final file

        # load data frame
        df = pd.read_csv(file, sep=',', encoding=encoding['encoding'])

        # eliminating unnecessary columns
        # some files have extra empty columns
        if df.shape[1] > 5:
            df = df.iloc[:, :5].copy()

        # equalizing column names
        df.columns = ['NCB','ISSUER_NAME_PLACEHOLDER','ISSUER_NAME','MATURITY_DATE','COUPON_RATE'] if False else ['NCB','ISIN_CODE','ISSUER_NAME','MATURITY_DATE','COUPON_RATE']

        # eliminating noninformative rows: no issuer name
        df = df.drop(index=df[ df.ISSUER_NAME.isnull() ].index)

        # eliminating noninformative rows: dummy entries
        is_dummy = df.ISSUER_NAME.str.contains('[dD]ummy', na=False)
        df = df.drop(index=df[ is_dummy ].index)

        # adding file date: first run of digits in the file name
        dates = regex.findall(file)
        if not dates:
            raise ValueError('no date found in file name: ' + file)
        df['file_date'] = dates[0]

        all_df.append(df)

    # merging once, after all files were read
    merged_df = pd.concat(all_df, ignore_index=True, sort=True)

    # sorting by date
    merged_df = merged_df.sort_values(by='file_date')

    # creating columns with new names
    merged_df["Name1"] = clean_weird( merged_df['ISSUER_NAME'], extra=False)
    merged_df["Name2"] = clean_weird( merged_df['ISSUER_NAME'], extra=True)

    # saving data
    # use most repeated encoding
    final_encode = mode(encode)
    # the '1_' prefix belongs to the file name, not to the folder
    full_path = os.path.join(save_dir, '1_' + file_name + '.csv')
    merged_df.to_csv(full_path, index=False, encoding=final_encode)

    print('finished')

**3.3 RUN**

In [None]:
# merge csv's
# reads every csv of the raw folder and writes one merged file
# produced files: '1_CSPPholdings_201706_2021.csv'
file2 = file_data + 'raw'  # folder with the downloaded csv files
file3 = file_data + 'processed/'  # folder for the merged file
name1 = "CSPPholdings_201706_2021"  # merged file name, without prefix and extension
merge_csv(file_dir=file2, save_dir=file3, file_name=name1)

---

# 4. Pull from PermID API and join data 

## Steps

**4.1 Create an account in https://permid.org/**

**4.2 Match entity's names and instruments (manual process)**
- Web: https://permid.org/match
- There is one file for companies' names (name file):
    - with legal designation (with_\*)
    - without legal designation (without_\*)

**4.2.1 Load modules**

In [None]:
# libraries
import pandas as pd
import json, re, chardet, string, time, requests

**4.2.2 Define function `match_format`**
- to produce the correct csv, per (name file)

In [None]:
############################################
# match_format function
############################################
def match_format(file_dir, file_name, save_dir, save_name):
    """Write the name files to upload to https://permid.org/match.

    Reads ``file_dir + file_name + '.csv'`` and keeps one row per distinct
    (Name1, Name2) pair. Three files are written into ``save_dir``:

    - ``2_main_<save_name>.csv``: Name1 and Name2 side by side
    - ``2_with_<save_name>.csv``: upload template using Name1
      (names with company designations)
    - ``2_without_<save_name>.csv``: upload template using Name2
      (names without company designations)

    Parameters
    ----------
    file_dir : folder of the input file.
    file_name : input file name without extension.
    save_dir : folder for the three output files.
    save_name : common suffix of the output file names.
    """
    # full path for file
    full_path = file_dir + file_name + '.csv'

    # check encoding of files: open first 10'000 bytes
    with open(full_path, 'rb') as rawdata:
        encoding = chardet.detect(rawdata.read(10000))

    # load data
    df = pd.read_csv(full_path, sep=',', encoding=encoding['encoding'])

    # put info in the right format
    # see https://permid.org/match:
    # 'Organization' and 'Download Template' buttons
    # NCB (country) is dropped too, as it has mismatches
    df = df.drop(['COUPON_RATE','ISIN_CODE','ISSUER_NAME','MATURITY_DATE','NCB','file_date'], axis = 1)

    # add empty info
    for empty_column in ('LocalID','Standard Identifier','Country','Street',
                         'City','PostalCode','State','Website'):
        df[empty_column] = ''

    # remove duplicates and sort
    df = df.drop_duplicates(subset=['Name1','Name2'])
    df = df.sort_values(by=['Name1'])

    # names with and without company designations
    df_main = df[['Name1','Name2']]

    # names with company designations
    # rename() returns a new frame, so the selection of df is never modified in place
    df_with = df[['LocalID','Standard Identifier','Name1','Country','Street','City','PostalCode','State','Website']]
    df_with = df_with.rename(columns={"Name1":"Name"})

    # names without company designations
    df_without = df[['LocalID','Standard Identifier','Name2','Country','Street','City','PostalCode','State','Website']]
    df_without = df_without.rename(columns={"Name2":"Name"})

    # saving data: the '2_' prefix belongs to the file name, not to the folder
    outputs = (('main_', df_main), ('with_', df_with), ('without_', df_without))
    for kind, frame in outputs:
        full_path = save_dir + '2_' + kind + save_name + '.csv'
        frame.to_csv(full_path, index=False, encoding=encoding['encoding'])

    print('finished')

**4.3 Join all matched information**
- Define function `match_info` to join all (name files) produced in the previous step

In [None]:
############################################
# match_info function
############################################
def match_info(file_dir, file_name, save_dir, save_name):
    """Join the PermID match results of both name variants into one file.

    Reads three files:

    - ``file_dir + '2_main_<file_name>.csv'``: Name1/Name2 pairs
    - ``save_dir + '3_with_match_<file_name>.csv'``: match result for Name1
    - ``save_dir + '3_without_match_<file_name>.csv'``: match result for Name2

    and writes ``save_dir + '4_main_match_<save_name>.csv'``. Rows are
    combined by position, so the three files must list the names in the
    same order.

    Parameters
    ----------
    file_dir : folder of the '2_main_' file.
    file_name : common suffix of the three input files.
    save_dir : folder of the '3_' files and of the output file.
    save_name : suffix of the output file name.
    """
    # paths for all files: the number prefix belongs to the file name
    full_path1 = file_dir + '2_main_' + file_name + '.csv'
    full_path2 = save_dir + '3_with_match_' + file_name + '.csv'
    full_path3 = save_dir + '3_without_match_' + file_name + '.csv'

    # check encoding of files: open first 10'000 bytes
    with open(full_path2, 'rb') as rawdata:  # the most common file
        encoding = chardet.detect(rawdata.read(10000))

    # load data
    df_main = pd.read_csv(full_path1, sep=',', encoding=encoding['encoding'])

    df_with = pd.read_csv(full_path2, sep=',', encoding=encoding['encoding'])
    df_with = df_with.rename(columns={"Match OpenPermID":'OpenPermID_1',
                                      'Match OrgName':'OrgName_1',
                                      'Match Score':'Score_1',
                                      'Match Level':'Level_1'})

    df_without = pd.read_csv(full_path3, sep=',', encoding=encoding['encoding'])
    df_without = df_without.rename(columns={"Match OpenPermID":'OpenPermID_2',
                                            'Match OrgName':'OrgName_2',
                                            'Match Score':'Score_2',
                                            'Match Level':'Level_2'})

    # add columns of interest: columns 1 to 4 of each match file
    df_final = pd.DataFrame([])
    df_final['Name_1'] = df_main['Name1']
    df_final = df_final.join( df_with.iloc[:,1:5] )
    df_final['Name_2'] = df_main['Name2']
    df_final = df_final.join( df_without.iloc[:,1:5] )
    # True when both name variants were matched to the same organisation name
    df_final['EqualOrgName'] = ( df_final['OrgName_1'] == df_final['OrgName_2'] )

    # saving data
    full_path = save_dir + '4_main_match_' + save_name + '.csv'
    df_final.to_csv(full_path, index=False, encoding=encoding['encoding'])

    print('finished')

**4.4 Pull information of entities**

**4.4.1 Load modules**

In [None]:
# libraries
import pandas as pd
import numpy as np
import json, re, chardet, string, time, requests, lightrdf

from bs4 import BeautifulSoup as bs
from selenium import webdriver as wd
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.common import exceptions

**4.4.2 Define several handlers so that the scraping runs smoothly**

In [None]:
############################################
# handlers
############################################

## if the xpath is not clickable
def handle_noclick_xp(b, xp_):
    """Click the element at xpath ``xp_``; do nothing if it is missing or not interactable."""
    try:
        target = b.find_element_by_xpath(xp_)
        target.click()
    except (exceptions.ElementNotInteractableException,
            exceptions.NoSuchElementException):
        # best effort: nothing to click
        pass

## if the id is not clickable
def handle_noclick_id(b, id_):
    """Click the element with id ``id_``; do nothing if it is missing or not interactable."""
    try:
        target = b.find_element_by_id(id_)
        target.click()
    except (exceptions.ElementNotInteractableException,
            exceptions.NoSuchElementException):
        # best effort: nothing to click
        pass

## test the existence of the multiple 'divs'
def handle_noclick_extra(b, xp_start, xp_end, divs):
    """Click the first clickable element among several candidate xpaths.

    For every value in ``divs`` the xpath ``xp_start + str(div) + xp_end``
    is tried in order; the search stops after the first successful click.

    Parameters
    ----------
    b : webdriver to search in.
    xp_start : part of the xpath before the div number.
    xp_end : part of the xpath after the div number.
    divs : iterable of div numbers to try.
    """
    for div in divs:
        try:
            xp = xp_start + str(div) + xp_end
            b.find_element_by_xpath( xp ).click()
        except exceptions.ElementNotInteractableException:
            # this candidate cannot be clicked: try the next one
            continue
        except exceptions.NoSuchElementException:
            # this candidate does not exist: try the next one
            continue
        else:
            # clicked successfully: stop searching
            break

## read the error text by css selector
def handle_error_css(b, css_):
    """Return the text of the element matched by css selector ``css_``.

    Returns the string 'No error' when the element is missing or not
    interactable.
    """
    try:
        return b.find_element_by_css_selector(css_).text
    except (exceptions.ElementNotInteractableException,
            exceptions.NoSuchElementException):
        return 'No error'


## reload page with 'Unexpected error occurred'
## NEEDS REVIEWING
def handle_web(b, w, max_tries=None, wait=10):
    """Reload page ``w`` while it shows 'Unexpected error occurred'.

    Parameters
    ----------
    b : webdriver that already loaded ``w``.
    w : URL to reload.
    max_tries : highest value the try counter may reach; the counter
        starts at 1 for the load done by the caller. ``None`` (default)
        keeps reloading until the error disappears.
    wait : seconds to wait after each reload (default 10).
    """
    # NEEDS REVIEWING
    csspath = "div.About-title h1.heading-2.ng-binding"
    error_text = handle_error_css(b=b, css_=csspath)
    print(error_text)

    j = 1
    while error_text == 'Unexpected error occurred':
        if max_tries is not None and j >= max_tries:
            # give up: the caller asked for a bounded number of tries
            break
        j += 1
        b.get(w)
        time.sleep(wait)
        error_text = handle_error_css(b=b, css_=csspath)
        print(error_text + " at " + str(j) + " try" )

**4.4.3 Define function `match_data`** to scrape entity information from PermID

In [None]:
############################################
# match_data function
############################################
def match_data(wm, un, pw, file_dir, file_name, save_dir, save_name,
               rows, round_=1, driver='Firefox',
               ft="application/csv,text/csv,text/comma-separated-values"):
    """Visit the PermID page of each matched entity and store its details.

    Round 1 starts from ``file_dir + '4_main_match_<file_name>.csv'``, adds
    seven storage columns filled with '0' and processes the positions given
    in ``rows``. Round 2 starts from the round 1 result and processes only
    the rows that have a PermID link but still hold '0'. The table is
    written to disk after every row that was filled in.

    Parameters
    ----------
    wm : URL of the main page, used to log in.
    un, pw : user name and password typed into the login form.
    file_dir, file_name : folder and name suffix of the input file.
    save_dir, save_name : folder and name suffix of the output file
        (``5_info_match_round1_...`` or ``5_info_match_round2_...``).
    rows : iterable of row positions to process in round 1 (not used in
        round 2).
    round_ : 1 or 2; any other value raises ``SystemExit``.
    driver : 'Firefox' or 'Chrome'; any other value raises ``SystemExit``.
    ft : MIME types Firefox saves without asking.

    The browser is always closed, even when a step fails half-way.
    """
    # ==========================
    # open file of pages
    # ==========================
    if round_ == 1:
        # first time of retrieving
        full_path = file_dir + '4_main_match_' + file_name + '.csv'

        # check encoding of files: open first 10'000 bytes
        with open(full_path, 'rb') as rawdata:
            encoding = chardet.detect(rawdata.read(10000))

        # load data
        df = pd.read_csv(full_path, sep=',', encoding=encoding['encoding'])

        # creating storage ('0' marks "not retrieved yet")
        for storage_column in ("Primary Industry", "Primary Bussiness",
                               "Primary Economic", "Domiciled", "Incorporated",
                               "TRBC code", "Primary Industry description"):
            df[storage_column] = '0'
        obs = rows

        # to check 'nan' first
        nan_org1 = df.OpenPermID_1.isna()
        nan_org2 = df.OpenPermID_2.isna()
        nan_org = nan_org1 & nan_org2

        # save path
        save_path = save_dir + '5_info_match_round1_' + save_name + '.csv'

    elif round_ == 2:
        # second time of retrieving
        full_path = file_dir + '5_info_match_round1_' + file_name + '.csv'

        # check encoding of files: open first 10'000 bytes
        with open(full_path, 'rb') as rawdata:
            encoding = chardet.detect(rawdata.read(10000))

        # load data
        df = pd.read_csv(full_path, sep=',', encoding=encoding['encoding'])
        print()

        # check for empty values: rows with a link that still hold '0'
        nan_org1 = df.OpenPermID_1.isna()
        nan_org2 = df.OpenPermID_2.isna()
        nan_org = nan_org1 & nan_org2
        obs = df[ (df["Primary Industry"] == '0') & (nan_org!=True) ].index.tolist()
        print('checking for ' + str( len(obs) ) + ' observations')

        # save path
        save_path = save_dir + '5_info_match_round2_' + save_name + '.csv'

    else:
        # no other time is allowed
        raise SystemExit

    # ==========================
    # webscrapping
    # ==========================
    # defines profile and browser
    if driver == 'Firefox':
        # profile for autosaving (Firefox)
        pf = wd.FirefoxProfile()
        pf.set_preference("browser.download.folderList", 2)
        pf.set_preference("browser.download.manager.showWhenStarting", False)
        pf.set_preference("browser.download.dir", save_dir)
        pf.set_preference("browser.helperApps.neverAsk.saveToDisk", ft)
        pf.set_preference("browser.link.open_newwindow", 1)

        # start webdriver
        browser = wd.Firefox(pf)

    elif driver == 'Chrome':
        # WORK IN PROGRESS
        # profile for autosaving (Chrome)
        pf = wd.ChromeOptions()
        prefs = {'download.prompt_for_download': False,
                 'safebrowsing.enabled': False,
                 'safebrowsing.disable_download_protection': True,
                 "profile.default_content_settings.popups": 0,
                 "download.default_directory": save_dir,
                 'download.directory_upgrade': True,}
        pf.add_experimental_option('prefs', prefs)

        # start webdriver; the options must be passed by keyword, because
        # the first positional argument is not the options object in the
        # selenium releases that still provide find_element_by_*
        browser = wd.Chrome(options=pf)

    else:
        raise SystemExit

    try:
        # enter main page
        browser.get(wm)
        time.sleep(10)

        xp = "/html/body/navbar/header[1]/nav/div/ul/li[2]/a"
        handle_noclick_xp(b=browser, xp_=xp)
        time.sleep(10)

        # log in
        username = browser.find_element_by_id("AAA-AS-SI1-SE003")
        password = browser.find_element_by_id("AAA-AS-SI1-SE006")
        username.send_keys(un)
        password.send_keys(pw)
        time.sleep(1)
        browser.find_element_by_id("AAA-AS-SI1-SE014").click()
        time.sleep(10)

        # accept cookies
        xp = "/html/body/div[2]/div/div/div[2]/a"
        browser.find_element_by_xpath(xp).click()
        time.sleep(3)

        # run through all info available
        for i in obs:

            # to check evolution
            print('start ' + str(i) + ' of ' + str(df.shape[0]) + ':',
                  df.OrgName_1[i] )

            # skip no info
            if nan_org[i]:
                continue

            # get the web (depending on the quality of match)
            if not(nan_org1[i]) and nan_org2[i]:
                web = df.OpenPermID_1[i]
            elif nan_org1[i] and not(nan_org2[i]):
                web = df.OpenPermID_2[i]
            elif df.Score_1[i] >= df.Score_2[i]:
                web = df.OpenPermID_1[i]
            else:
                web = df.OpenPermID_2[i]

            # enter web
            browser.get(web)
            time.sleep(15)

            # check for error
            handle_web(b=browser, w=web)

            # get main info
            soup = bs(browser.page_source, 'html.parser')
            full_class = soup.find_all('a', class_='link ng-binding')

            # if there is no info, leave the row untouched
            if len(full_class) == 0:
                continue

            # keep at most five entries (when public, there is more info)
            full_info = [each_class.text for each_class in full_class][:5]
            if len(full_info) == 4:
                full_info.append('')

            # click section of interest
            xp_s = "/html/body/div[3]/section/div/div[2]/div[1]/div["
            xp_e = "]/div[2]/a"
            handle_noclick_extra(b=browser,
                                 xp_start=xp_s,
                                 xp_end=xp_e,
                                 divs=range(5,8))
            time.sleep(15)

            # get primary industry description
            soup = bs(browser.page_source, 'html.parser')
            full_class = soup.find_all('div', class_='col-md-8 ng-binding')

            # entries 2 and 3 are read below, so at least four are needed;
            # otherwise the row keeps its '0' and is retried in round 2
            if len(full_class) <= 3:
                continue

            full_info.append(full_class[2].text)  # TRBC code
            full_info.append(full_class[3].text)  # Industry Description

            # exactly seven values are needed for the seven storage columns
            if len(full_info) != 7:
                print('incomplete info for row ' + str(i) + ', skipped')
                continue

            # save info: columns 11 to 17 are the storage columns
            # (assumes the input file has 11 columns before them - TODO confirm)
            df.iloc[i, list(range(11,18))] = full_info

            # save data after every row, so progress survives a crash
            df.to_csv(save_path, index=False, encoding=encoding['encoding'])

        # close user session
        handle_noclick_id(b=browser, id_='profile-toggle')
        handle_noclick_xp(b=browser, xp_='/html/body/navbar/header[1]/nav/div/div[2]/ul/li[3]/a')
        time.sleep(3)
        print('session closed')

    finally:
        # finish the page: release the browser whatever happened above
        browser.quit()

    print('finished')

**4.5 manual identification of missing companies**

---

# 5. Manual identification of TRBC code

-  Based on EU taxonomy, TRBC code is given to sift green industries

---

# 6. Merge data of companies and EU taxonomy

- Define function `merge_taxonomy` to flag green companies by matching their TRBC code against the taxonomy list, and to merge the result into the holdings data

In [None]:
############################################
# function merge_taxonomy
############################################
def merge_taxonomy(file_match, file_taxonomy, file_main, save_dir):
    """Flag green companies and attach the flag to the holdings data.

    Reads the taxonomy list, the two company files (complete and missing)
    and the merged holdings file. A company is flagged ``green = 1`` when
    its 'TRBC code' appears in the taxonomy column 'TRBC_6'. Two files are
    written into ``save_dir``:

    - ``7_info_match_round3_complete.csv``: all companies with the flag
    - ``7_CSPPholdings_201706_2021.csv``: holdings with 'TRBC code' and
      'green' added (left join on Name1 and Name2)

    Parameters
    ----------
    file_match : folder of '5_info_match_round2_complete.csv' and
        '6_info_match_round2_missing.csv'.
    file_taxonomy : folder of 'sustainable-taxonomy_renewable.csv'.
    file_main : folder of '1_CSPPholdings_201706_2021.csv'.
    save_dir : folder for the two output files.
    """
    #-----------------------
    # taxonomy data files
    #-----------------------
    full_path = file_taxonomy + 'sustainable-taxonomy_renewable.csv'

    # load data; codes go through float so both tables use the same text form
    df_green = pd.read_csv(full_path, sep=',')
    df_green['TRBC_6'] = df_green['TRBC_6'].astype(float).astype(str)

    #-----------------------
    # match data files
    #-----------------------
    full_path1 = file_match + '5_info_match_round2_complete.csv'
    full_path2 = file_match + '6_info_match_round2_missing.csv'

    # load data
    df_comp = pd.read_csv(full_path1, sep=',')
    df_miss = pd.read_csv(full_path2, sep=',')

    # append data (DataFrame.append no longer exists in pandas 2.0)
    df_comp = pd.concat([df_comp, df_miss])
    df_comp['TRBC code'] = df_comp['TRBC code'].astype(float).astype(str)

    # create 'green' economy column: 1 when the code is in the taxonomy list
    df_comp = df_comp.assign( green=df_comp['TRBC code'].isin(df_green.TRBC_6).astype(int) )

    # save dataFrame
    full_path = save_dir + '7_info_match_round3_complete.csv'
    df_comp.to_csv(full_path, index=False)

    #-----------------------
    # main data files
    #-----------------------
    full_path = file_main + '1_CSPPholdings_201706_2021.csv'

    # load data
    df_main = pd.read_csv(full_path, sep=',')

    # merge 'TRBC code' and 'green' columns
    right_df = df_comp[['Name_1','Name_2','TRBC code','green']]
    right_df = right_df.rename(columns={"Name_1":"Name1", "Name_2":"Name2"})
    df_main = df_main.merge(right_df, on=['Name1','Name2'], how='left')

    # save dataFrame
    full_path = save_dir + '7_CSPPholdings_201706_2021.csv'
    df_main.to_csv(full_path, index=False)

    print('finished')

---

# DATA COLLECTION DONE!