# Web scraping for downloading CSV files

Import all necessary modules

In [1]:
import os
import requests
import pandas as pd
import io
import csv
import glob
from datetime import timedelta, date

Acquire all Friday dates from `2017-06-23` until today. If a certain date is Friday (when ECB published purchasing information), then we add it to a list.

In [2]:
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2017, 6, 23)
end_date = date(2020, 3, 29)

date_list = []

for everyday in daterange(start_date, end_date):
    if everyday.weekday() == 4:
        date_list.append(everyday.strftime("%Y%m%d"))

We first define a list `CSV_URL` to store all website links that can directly access to the specific csv documents. Then we would like to declare the `directory` in which we save those csv files.

In [3]:
## make links into a list object CSV_URL
url_source = 'https://www.ecb.europa.eu/mopo/pdf/'
url_category = 'CSPPholdings_'
url_file_format = '.csv'

CSV_URL = []  # date list to iterate through
for i in range(len(date_list)):
    CSV_URL.append(url_source + url_category + date_list[i] + url_file_format)

Web scraping part using a loop and `request` function.

In [4]:
## save option --> change it to your directory!!
directory = r'/Users/jingpuchen/Desktop/KU Leuven/Semester4/Modern Data Analytics/project/web_scraping/csv/'

for i in range(len(CSV_URL)):
    resp = requests.get(CSV_URL[i])  # get access to csv file
    df = pd.read_csv(io.StringIO(resp.text))  # convert to text format
#    df.dropna(axis=0,inplace=True)  # drop NA values ???????
    df.to_csv (directory + date_list[i] + '.csv',
               index = False,
               header= True)  # without variable names --> add it later on

---

Why start web scraping again? Because since 2020-03-30, ECB applied a new naming format, causing the parser fail to read in csv files if following the old naming pattern. Therefore, even though it is not a genius way to scrap the items again, it works for me. Also notice that there is not purchasement on two fridays: `20201225,20210101`

In [13]:
start_date_1 = date(2020, 3, 30)
end_date_1 = date(2021, 4, 10)

date_list_1 = []

for everyday in daterange(start_date_1, end_date_1):
    if everyday.weekday() == 4:
        date_list_1.append(everyday.strftime("%Y%m%d"))

date_list_1 = [d for d in date_list_1 if d not in ('20201225','20210101')]

In [14]:
## make links into a list object CSV_URL
url_source_1 = 'https://www.ecb.europa.eu/mopo/pdf/'
url_category_1 = 'CSPP_PEPP_corporate_bond_holdings_'
url_file_format_1 = '.csv'

CSV_URL_1 = []  # date list to iterate through
for i in range(len(date_list_1)):
    CSV_URL_1.append(url_source_1 + url_category_1 + date_list_1[i] + url_file_format_1)

In [15]:
## save option
directory = r'/Users/jingpuchen/Desktop/KU Leuven/Semester4/Modern Data Analytics/project/web_scraping/csv/'

for i in range(len(CSV_URL_1)):
    resp_1 = requests.get(CSV_URL_1[i])  # get access to csv file
    df_1 = pd.read_csv(io.StringIO(resp_1.text))  # convert to text format
    df_1.to_csv (directory + date_list_1[i] + '.csv',
               index = False,
               header= True)  # without variable names --> add it later on

So we are done with downloading all csv files until the last release from `2020-04-09`.

# Concatenate files

In [16]:
## walk through all files and row combind into merged.csv
all_files = glob.glob(os.path.join(directory, "*.csv"))

all_df = []
for f in all_files:
    df = pd.read_csv(f, sep=',',encoding='latin1')
    df['file'] = f.split('/')[-1]
    all_df.append(df)

merged_df = pd.concat(all_df, ignore_index=True, sort=True)
merged_df.to_csv("merged.csv")

In [17]:
merged_df.head(10)

Unnamed: 0.1,COUPON RATE,COUPON_RATE,COUPON_RATE_,COUPON_RATE_*,ISIN,ISIN_CODE,ISSUER,ISSUER_NAME,ISSUER_NAME_,MATURITY DATE,MATURITY_DATE,MATURITY_DATE_,NCB,Unnamed: 0,Unnamed: 5,Unnamed: 6,file
0,,4.25,,,,BE0002178441,,Delhaize Group S.A.,,,19/10/2018,,BE,,,,20170818.csv
1,,3.125,,,,BE0002189554,,Delhaize Group S.A.,,,27/02/2020,,BE,,,,20170818.csv
2,,1.375,,,,BE0002239086,,Elia System Operator S.A./N.V.,,,27/05/2024,,BE,,,,20170818.csv
3,,1.0,,,,BE0002256254,,RESA SA,,,22/07/2026,,BE,,,,20170818.csv
4,,2.0,,,,BE0002269380,,Cofinimmo S.A./N.V.,,,09/12/2024,,BE,,,,20170818.csv
5,,1.375,,,,BE0002276450,,Elia System Operator S.A./N.V.,,,07/04/2027,,BE,,,,20170818.csv
6,,2.0,,,,BE0002285543,,Eandis CVBA,,,23/06/2025,,BE,,,,20170818.csv
7,,2.75,,,,BE0002420926,,Eandis CVBA,,,30/11/2022,,BE,,,,20170818.csv
8,,3.25,,,,BE0002432079,,Elia System Operator S.A./N.V.,,,04/04/2028,,BE,,,,20170818.csv
9,,3.5,,,,BE0002433085,,Elia System Operator S.A./N.V.,,,04/04/2033,,BE,,,,20170818.csv


df_1 = pd.read_csv(io.StringIO(response.text))
print(df.info())
print(df.tail(10))
print(df.shape)