In [19]:
import urllib.request
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [20]:
# Download the specific page to be able to extract information from it 

# Fetch the ENB volume 12 index page. The context manager closes the HTTP
# response (and its underlying socket) as soon as the body has been read,
# instead of leaving that to the garbage collector.
with urllib.request.urlopen('https://enb.iisd.org/enb/vol12/') as response:
    page = response.read()

# NOTE(review): no parser is named here, so BeautifulSoup picks whichever
# parser is installed and the resulting tree may differ between environments.
# Left unchanged on purpose; confirm which parser the notebook was run with
# before pinning one.
soup = BeautifulSoup(page)

#Global Variable

# Maps English month names to their 1-based month number (read by extract_date).
month = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}

In [21]:
# Function that extracts a date from a given string by returning a tuple of int (day,month,year)

def extract_date(sdate):
    """Extract a ``(day, month, year)`` tuple of ints from a date string.

    The string is tokenised into numbers (4, 2 or 1 digits) and English month
    names. The number of tokens decides which positions are read:

    * 3 tokens, e.g. ``"28 March 1995"``: day, month, year.
    * 4 tokens, e.g. ``"1-11 December 1997"``: first day, (second day),
      month, year.
    * 5 tokens, e.g. ``"28 March - 7 April 1995"``: first day, first month,
      (second day), (second month), year.

    For ranges, the first day (and first month) is the one returned. Month
    names are converted through the module-level ``month`` mapping.

    Args:
        sdate: text containing a date or a date range.

    Returns:
        ``(day, month, year)`` as ints, or ``0`` when the string holds no
        date tokens or a number of tokens that matches none of the layouts
        above (previously the latter case crashed with UnboundLocalError).

    Raises:
        ValueError, KeyError: if the tokens are not in the order the matched
            layout expects (e.g. a month name where a day should be).
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    # Alternation order matters: longer digit runs are tried before shorter ones.
    tokens = re.findall(
        r'\d{4}|\d{2}|January|February|March|April|May|June|July|August|September|October|November|December|\d{1}',
        sdate,
    )

    # Index of the month token for each supported token count.
    month_position = {3: 1, 4: 2, 5: 1}.get(len(tokens))
    if month_position is None:
        # Nothing date-like, or a layout we do not know how to read.
        return 0

    # Day is always the first token and year always the last one.
    return (int(tokens[0]), month[tokens[month_position]], int(tokens[-1]))

In [22]:
# Function that extracts the first all-digit token from a given string and returns it as an int

def extract_number(sname):
    """Return the first whitespace-separated, all-digit token of ``sname`` as an int.

    Tokens that mix digits with other characters (e.g. ``"COP12"``) are
    ignored. ``None`` is returned when no token consists solely of digits.
    """
    numeric_tokens = (int(token) for token in sname.split() if token.isdigit())
    # The generator is lazy, so only the first matching token is converted.
    return next(numeric_tokens, None)

In [23]:
# Function that helps to compose and order the list. Returns a list, ordered by COP number, that contains all the COPs with their attributes (number, date, place)

def compute_list(list_string):
    """Turn raw ``"<name>|<date>|<place>"`` strings into sorted COP records.

    Each string is split on ``'|'``; the first field goes through
    ``extract_number``, the second through ``extract_date`` and the third is
    kept verbatim.

    Returns:
        A new list of ``(number, date, place)`` tuples in ascending order of
        ``number``. The sort is stable, so records with equal numbers keep
        their input order.
    """
    def to_record(raw):
        # One headline string -> (number, date, place).
        fields = raw.split('|')
        return (extract_number(fields[0]), extract_date(fields[1]), fields[2])

    return sorted(map(to_record, list_string), key=lambda record: record[0])


In [24]:
# Function that extracts the list of all the COPs from a webpage and returns a list containing all the COPs with their number, date and place

def extract_list_cops(soup):
    """Find every COP headline string in ``soup`` and return the parsed list.

    The headlines are not all written the same way, so four regular
    expressions are searched. Headlines that carry a CMP suffix have it
    stripped so every string can be handed to ``compute_list`` in the same
    ``'|'``-separated form.

    Args:
        soup: parsed page; only ``soup.find_all(string=<compiled regex>)`` is
            used.

    Returns:
        Whatever ``compute_list`` returns for the collected strings.
    """
    # Patterns are raw strings ("\s" / "\d" in a normal literal are invalid
    # escape sequences). The bare '.' matches any single separator character.

    # Case 1: "COP <d> <sep> "                      // COP 1-9
    single_digit = re.compile(r"COP\s\d\s.\s")
    # Case 2: "COP <dd> <sep> <digit>"               // COP 10,23,24,25
    two_digit = re.compile(r"COP\s[1-2][0-9]\s.\s\d")
    # Case 3: "COP <dd> <sep> CMP <d>.."             // COP 11,12,13,14,15,16,21,22
    with_cmp_spaced = re.compile(r"COP\s[1-2][0-9]\s.\sCMP\s\d..")
    # Case 4: "COP <dd> <sep> CMP<d> "               // COP 17,18,19
    with_cmp_joined = re.compile(r"COP\s[1-2][0-9]\s.\sCMP\d\s")

    # Local name is not `list`, so the builtin stays usable.
    headlines = list(soup.find_all(string=single_digit))
    headlines.extend(soup.find_all(string=two_digit))

    # Strip the CMP suffix so all headlines share the same structure.
    headlines.extend(
        re.sub(r"- CMP\s..", '', text)
        for text in soup.find_all(string=with_cmp_spaced)
    )
    headlines.extend(
        re.sub(r"- CMP.", '', text)
        for text in soup.find_all(string=with_cmp_joined)
    )

    return compute_list(headlines)


In [25]:
# Function that extracts all the issues for each COP and returns a list which holds, for each COP, its issues with their number, date and HTML link

def extract_details_cops(soup):
    """Collect the issues listed under each COP section of the page.

    Every ``<td>`` whose text contains ``"Issue"`` is a candidate start of a
    section. It is accepted when the ``<th>`` found after the preceding
    ``<tr>`` contains ``"</h3>COP"`` and does not contain ``"BIS"``. From
    there the table is walked row by row until a row whose second cell reads
    ``"Summary"`` or ``"Summary and Analysis"``.

    Args:
        soup: parsed page exposing the BeautifulSoup navigation API
            (``find_all``, ``find_next``, ``find_next_sibling`` ...).

    Returns:
        A list with one entry per accepted section; each entry is a list of
        ``(issue_number, date, html_href)`` tuples, where ``date`` is the
        result of ``extract_date`` on the date cell.

    NOTE(review): inside the walk the code still assumes every row has the
    issue / date / PDF / HTML cells and that a "Summary" row terminates each
    section; a page that breaks this raises AttributeError/TypeError.
    """
    detail_COPs = []

    for row in soup.find_all("tr"):

        for col in row.find_all('td'):

            # Tag.string is None when a cell has several children; testing
            # `"Issue" in None` would raise TypeError, so skip such cells.
            if col.string is None or "Issue" not in col.string:
                continue

            # The section header is looked up starting from the previous row.
            a = row.find_previous_sibling('tr')
            if a is None:
                # First row of its table: there is no header row to inspect.
                continue
            b = a.find_next('th')
            header = str(b)

            # Keep only COP sections, and skip the ones flagged "BIS".
            if "</h3>COP" not in header or "BIS" in header:
                continue

            detail = []
            # Separate cursor so the `for` loop variable is not clobbered.
            issue_td = col
            date_td = issue_td.find_next_sibling('td')

            while("Summary" != date_td.string and "Summary and Analysis" != date_td.string):
                ## extract issue number
                issue = int(re.findall(r'\d+', issue_td.string)[0])

                pdf_td = date_td.find_next_sibling('td')
                ## extract html link
                html_td = pdf_td.find_next_sibling('td')
                html = html_td.find('a', href=True)['href']

                ## extract date
                date = extract_date(date_td.string)

                ## prepare for the next issue: first cell of the next row
                issue_td = pdf_td.find_next('tr').find_next('td')
                date_td = issue_td.find_next_sibling('td')

                ## add the issue into the list
                detail.append((issue, date, html))

            detail_COPs.append(detail)

    return detail_COPs

In [26]:
# Show how extract_list_COPS works

# Parse the COP headlines from the downloaded page; each entry is the tuple
# built by compute_list (number, date, place).
list_cops = extract_list_cops(soup)

# Print one COP per line.
for cops_n in list_cops:
    print(cops_n)

(1, (28, 3, 1995), ' Berlin, Germany')
(2, (8, 7, 1996), ' Geneva, Switzerland')
(3, (1, 12, 1997), ' Kyoto, Japan')
(4, (2, 11, 1998), ' Buenos Aires, Argentina')
(5, (25, 10, 1999), ' Bonn, Germany')
(6, (13, 11, 2000), ' The Hague, The Netherlands')
(7, (29, 10, 2001), ' Marrakech, Morocco')
(8, (23, 10, 2002), ' New Delhi, India')
(9, (1, 12, 2003), ' Milan, Italy')
(10, (6, 12, 2004), ' Buenos Aires, Argentina')
(11, (28, 11, 2005), ' Montréal, Canada')
(12, (6, 11, 2006), ' Nairobi, Kenya')
(13, (3, 12, 2007), ' Bali, Indonesia')
(14, (1, 12, 2008), ' Poznań, Poland')
(15, (7, 12, 2009), ' Copenhagen, Denmark')
(16, (29, 11, 2010), ' Cancún, Mexico')
(17, (28, 11, 2011), ' Durban, South Africa')
(18, (26, 11, 2012), ' Doha, Qatar')
(19, (11, 11, 2013), ' Warsaw, Poland')
(21, (29, 11, 2015), ' Paris, France')
(22, (7, 11, 2016), ' Marrakech, Morocco')
(23, (6, 11, 2017), ' Bonn, Germany')
(24, (2, 12, 2018), ' Katowice, Poland')
(25, (2, 12, 2019), ' Madrid, Spain')


In [27]:
# Show how extract_details_COPs works

# Collect the per-COP issue lists from the downloaded page.
detail_cops = extract_details_cops(soup)
# NOTE(review): k is a running counter over detail_cops, not a COP number read
# from the page; the printed label is only right if the sections come back as
# COP 1, 2, 3, ... with none missing -- confirm against the page.
k=1
for detail_n in detail_cops:
    
    print("COP",k)
    
    # Each j is one (issue number, date, html link) tuple.
    for j in detail_n:
        
        print(j)
        
    k+=1


COP 1
(12, (28, 3, 1995), '/vol12/1212000e.html')
(13, (29, 3, 1995), '/vol12/1213000e.html')
(14, (30, 3, 1995), '/vol12/1214000e.html')
(15, (31, 3, 1995), '/vol12/1215000e.html')
(16, (3, 4, 1995), '/vol12/1216000e.html')
(17, (4, 4, 1995), '/vol12/1217000e.html')
(18, (5, 4, 1995), '/vol12/1218000e.html')
(19, (6, 4, 1995), '/vol12/1219000e.html')
(20, (7, 4, 1995), '/vol12/1220000e.html')
COP 2
(28, (8, 7, 1996), '/vol12/1228000e.html')
(29, (9, 7, 1996), '/vol12/1229000e.html')
(30, (10, 7, 1996), '/vol12/1230000e.html')
(31, (11, 7, 1996), '/vol12/1231000e.html')
(32, (12, 7, 1996), '/vol12/1232000e.html')
(33, (15, 7, 1996), '/vol12/1233000e.html')
(34, (16, 7, 1996), '/vol12/1234000e.html')
(35, (17, 7, 1996), '/vol12/1235000e.html')
(36, (18, 7, 1996), '/vol12/1236000e.html')
(37, (19, 7, 1996), '/vol12/1237000e.html')
COP 3
(67, (1, 12, 1997), '/vol12/enb1267e.html')
(68, (2, 12, 1997), '/vol12/enb1268e.html')
(69, (3, 12, 1997), '/vol12/enb1269e.html')
(70, (4, 12, 1997), '