In [2]:
import urllib.request
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [3]:
# Download the specific page to be able to extract information from it 

# Fetch the ENB volume 12 index page. The context manager closes the HTTP
# response as soon as its body has been read instead of leaving it open.
with urllib.request.urlopen('https://enb.iisd.org/enb/vol12/') as response:
    page = response.read()

# NOTE(review): no parser is named here, so the resulting tree may depend on
# which parsers are installed. The traversal code below was presumably
# validated against the default one -- confirm before pinning a parser.
soup = BeautifulSoup(page)

#Global Variable

# English month name -> month number (1-12).
month = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}

In [4]:
# Function that extracts a date from a given string and returns it as a 'year-month-day' string (the input is returned unchanged when it contains no standalone number)

# Month names in calendar order. Both the token regex and the name -> number
# lookup are derived from this single tuple so the two cannot drift apart.
_MONTH_NAMES = ('January', 'February', 'March', 'April', 'May', 'June',
                'July', 'August', 'September', 'October', 'November', 'December')
_MONTH_NUMBER = {name: number for number, name in enumerate(_MONTH_NAMES, start=1)}

# Alternation order matters: 4-digit run, then 2-digit run, then a month name,
# then a single digit (so '1995' is one token, not '19' + '95').
_DATE_TOKEN = re.compile(r'\d{4}|\d{2}|' + '|'.join(_MONTH_NAMES) + r'|\d{1}')

# Supported layouts, keyed by token count -> (day index, month index).
# The year is always the last token.
#   3 tokens: day month year
#   4 tokens: day day month year
#   5 tokens: day month day month year
_DATE_LAYOUTS = {3: (0, 1), 4: (0, 2), 5: (0, 1)}


def extract_date(sdate):
    """Convert a textual date (or date range) into a 'year-month-day' string.

    For a range, the first day and the first month are used together with the
    trailing year, e.g. '28 March - 7 April 1995' -> '1995-3-28'. Numbers are
    not zero-padded.

    Parameters:
        sdate: string to parse.

    Returns:
        The 'year-month-day' string, or ``sdate`` unchanged when it holds no
        whitespace-separated all-digit word (e.g. 'Summary') or when its
        tokens do not fit one of the supported layouts.
    """
    # Not a date at all: no standalone number in the text.
    if not any(word.isdigit() for word in sdate.split()):
        return sdate

    tokens = _DATE_TOKEN.findall(sdate)

    layout = _DATE_LAYOUTS.get(len(tokens))
    if layout is None:
        # Unknown shape: hand the text back rather than failing on an
        # unassigned result.
        return sdate

    day_index, month_index = layout
    day, year = tokens[day_index], tokens[-1]
    month_number = _MONTH_NUMBER.get(tokens[month_index])

    # Every token is either a month name or a run of digits; make sure each
    # one sits in the slot the layout expects.
    if month_number is None or not (day.isdigit() and year.isdigit()):
        return sdate

    # int() drops leading zeros, e.g. '07' -> 7.
    return str(int(year)) + "-" + str(month_number) + "-" + str(int(day))

In [5]:
# Function that extracts the first whitespace-separated all-digit token from a given string and returns it as an int (None if there is no such token)

def extract_number(sname):
    """Return the first all-digit word of ``sname`` as an int.

    The string is split on whitespace; a word counts only when every
    character in it is a digit, so 'COP25' does not match. Returns None
    when no word qualifies.
    """
    digit_words = (word for word in sname.split() if word.isdigit())
    first_match = next(digit_words, None)
    if first_match is None:
        return None
    return int(first_match)

In [6]:
# Function that helps to compose and order the list. Returns a list, ordered by COP number, that contains all the COPs with their attributes

def compute_list(list_string):
    """Turn 'name|date|place' strings into tuples sorted by COP number.

    Each input string is split on '|'. The first field goes through
    extract_number, the second through extract_date, and the third is kept
    verbatim (surrounding whitespace included).

    Returns a new list of (number, date, place) tuples in ascending order of
    number.
    """
    split_entries = (entry.split('|') for entry in list_string)
    cops = [
        (extract_number(fields[0]), extract_date(fields[1]), fields[2])
        for fields in split_entries
    ]
    return sorted(cops, key=lambda cop: cop[0])


In [7]:
# Function that extracts the list of all the COPs from a webpage and returns a list containing all the COPs with their number, date and place

def extract_list_cops(soup):
    """Collect every COP heading string from the page and parse it.

    The headings are not written uniformly, so four regular expressions are
    tried and their matches concatenated. In every pattern the unescaped '.'
    stands for whatever single character separates the fields.

    Parameters:
        soup: parsed page; only ``soup.find_all(string=...)`` is used.

    Returns:
        The result of compute_list on the collected strings: a list of
        (number, date, place) tuples ordered by COP number.
    """
    # Case 1: single-digit COP number (author's note: COP 1-9).
    cop_strings = list(soup.find_all(string=re.compile(r"COP\s\d\s.\s")))

    # Case 2: two-digit COP number directly followed by a digit
    # (author's note: COP 10, 23, 24, 25).
    cop_strings += soup.find_all(string=re.compile(r"COP\s[1-2][0-9]\s.\s\d"))

    # Case 3: two-digit COP number followed by 'CMP <n>'
    # (author's note: COP 11-16, 21, 22).
    spaced_cmp = soup.find_all(
        string=re.compile(r"COP\s[1-2][0-9]\s.\sCMP\s\d+\s"))

    # Case 4: two-digit COP number followed by 'CMP<n>' with no space
    # (author's note: COP 17, 18, 19).
    joined_cmp = soup.find_all(
        string=re.compile(r"COP\s[1-2][0-9]\s.\sCMP\d+\s"))

    # Strip the CMP part so every string has the same structure as cases 1-2.
    # '- CMP' + one whitespace + two more characters is removed here ...
    cop_strings += [re.sub(r"- CMP\s..", '', text) for text in spaced_cmp]
    # ... and '- CMP' + one more character here.
    cop_strings += [re.sub(r"- CMP.", '', text) for text in joined_cmp]

    return compute_list(cop_strings)


In [30]:
# Function that extracts all the issues for each COP and returns a list in which, for each COP, we have the corresponding issues with their number, date, HTML link and type

def _read_issue_row(issue_td, date_td):
    """Parse one issue row; returns (issue number, date, html link, pdf cell)."""
    # First run of digits in the issue cell's text.
    issue = int(re.findall(r'\d+', issue_td.string)[0])

    # The cell after the date cell, then the cell after that, which holds
    # the link to the HTML version.
    pdf_td = date_td.find_next_sibling('td')
    html_td = pdf_td.find_next_sibling('td')
    html = 'https://enb.iisd.org' + html_td.find('a', href=True)['href']

    date = extract_date(date_td.string)
    return issue, date, html, pdf_td


def extract_details_cops(soup):
    """List the issues published for each COP section of the page.

    A section starts at a cell whose text contains 'Issue', provided the
    first <th> found after the previous sibling row contains '</h3>COP' and
    does not contain 'BIS'. Rows are then read one after another until the
    row that follows contains '<a name=' or the page ends.

    Parameters:
        soup: parsed page.

    Returns:
        A list with one entry per detected section. Each entry is a list of
        (issue number, date, html link, type) tuples, where type is:
          - 'Summary' for the last row of a section;
          - 'First' when the issue number is more than 1 above the previous
            one in the section (the count starts from 0);
          - 'Issue' otherwise.
        The list is empty when no section is found.
    """
    detail_COPs = []

    for row in soup.find_all("tr"):
        for col in row.find_all('td'):
            # Cells that hold nested markup have no single string; skip them.
            if col.string is None or "Issue" not in col.string:
                continue

            previous_row = row.find_previous_sibling('tr')
            if previous_row is None:
                continue

            # Only COP sections are kept; those marked 'BIS' are excluded.
            header = str(previous_row.find_next('th'))
            if "</h3>COP" not in header or "BIS" in header:
                continue

            detail = []
            # Number of the previously read issue, used to classify the next.
            issue_start = 0
            date_td = col.find_next_sibling('td')

            # Read rows until the row after the current one opens a new
            # section (it contains '<a name=').
            while "<a name=" not in str(date_td.find_next('tr')):
                issue, date, html, pdf_td = _read_issue_row(col, date_td)
                issue_type = 'First' if issue - issue_start > 1 else 'Issue'

                next_row = pdf_td.find_next('tr')
                if next_row is None:
                    # End of the page: this row closes the last section.
                    detail.append((issue, date, html, 'Summary'))
                    detail_COPs.append(detail)
                    return detail_COPs

                # Move to the first two cells of the next row.
                col = next_row.find_next('td')
                date_td = col.find_next_sibling('td')
                issue_start = issue

                detail.append((issue, date, html, issue_type))

            # The loop stopped on the last row of the section: the summary.
            issue, date, html, _ = _read_issue_row(col, date_td)
            detail.append((issue, date, html, 'Summary'))

            detail_COPs.append(detail)

    # Reached when the page does not end inside a COP section.
    return detail_COPs
                    

  

In [35]:
def combine_information_cops(soup):
    """Join each COP's date and place with the issues found for it.

    The n-th COP returned by extract_list_cops is paired, by position, with
    the n-th issue list returned by extract_details_cops.

    Returns a flat list with one tuple per issue:
    ('COP', COP date, COP place, issue number, issue date, html link, type).
    """
    cops = extract_list_cops(soup)
    issues_per_cop = extract_details_cops(soup)

    records = []
    for position, (_number, date, place) in enumerate(cops):
        records.extend(
            ('COP', date, place, issue[0], issue[1], issue[2], issue[3])
            for issue in issues_per_cop[position]
        )
    return records

In [36]:
# Build the flat list of per-issue records for every COP found on the page.
cop = combine_information_cops(soup)

# Print one record per line.
for x in cop:
    print(x)


[(12, '1995-3-28', 'https://enb.iisd.org/vol12/1212000e.html', 'First'), (13, '1995-3-29', 'https://enb.iisd.org/vol12/1213000e.html', 'Issue'), (14, '1995-3-30', 'https://enb.iisd.org/vol12/1214000e.html', 'Issue'), (15, '1995-3-31', 'https://enb.iisd.org/vol12/1215000e.html', 'Issue'), (16, '1995-4-3', 'https://enb.iisd.org/vol12/1216000e.html', 'Issue'), (17, '1995-4-4', 'https://enb.iisd.org/vol12/1217000e.html', 'Issue'), (18, '1995-4-5', 'https://enb.iisd.org/vol12/1218000e.html', 'Issue'), (19, '1995-4-6', 'https://enb.iisd.org/vol12/1219000e.html', 'Issue'), (20, '1995-4-7', 'https://enb.iisd.org/vol12/1220000e.html', 'Issue'), (21, 'Summary', 'https://enb.iisd.org/vol12/1221000e.html', 'Summary')]
('COP', '1995-3-28', ' Berlin, Germany', 12, '1995-3-28', 'https://enb.iisd.org/vol12/1212000e.html', 'First')
('COP', '1995-3-28', ' Berlin, Germany', 13, '1995-3-29', 'https://enb.iisd.org/vol12/1213000e.html', 'Issue')
('COP', '1995-3-28', ' Berlin, Germany', 14, '1995-3-30', 'htt