# a1_compute-list-meetings
Computes, for each meeting type, the list of its meetings and of their corresponding issues, with the date, place, issue number and HTML page of each.

In [1]:
import urllib.request
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
import csv

In [2]:
# Lookup table from English month name to month number (January -> 1 ... December -> 12)
month = {
    name: number
    for number, name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

In [3]:
def extract_date(sdate):
    """Convert a free-form date string into a "year-month-day" string.

    Numbers and English month names are picked out of ``sdate``; the layout
    is then decided from how many of them were found:

    * 3 tokens: "<day> <Month> <year>" or "<Month> <day> <year>"
    * 4 tokens: the month is the third token, e.g. "1 - 10 December 2019"
    * 5 tokens: the month is the second token,
      e.g. "28 November - 9 December 2011"

    In every case the first day found and the last token (the year) are used.
    Day and month are not zero-padded.

    Returns ``sdate`` unchanged when it contains no standalone number (as
    decided by ``extract_number``) or when its layout is not one of the
    above; such inputs previously raised UnboundLocalError/KeyError.
    """
    if extract_number(sdate) is None:
        return sdate

    tokens = re.findall(
        r'\d{4}|\d{2}|January|February|March|April|May|June|July|August'
        r'|September|October|November|December|\d{1}',
        sdate,
    )
    count = len(tokens)

    try:
        if count == 5:
            day, month_name, year = int(tokens[0]), tokens[1], int(tokens[-1])
        elif count == 4:
            day, month_name, year = int(tokens[0]), tokens[2], int(tokens[-1])
        elif count == 3:
            # The month name may come before or after the day.
            if tokens[0] in month:
                day, month_name = tokens[1], tokens[0]
            else:
                day, month_name = tokens[0], tokens[1]
            year = tokens[2]
        else:
            # Unknown layout: hand the text back rather than crash.
            return sdate
        month_number = month[month_name]
    except (KeyError, ValueError):
        # A token was not where the layout expects it (e.g. a month name
        # in a day/year position): treat as an unknown layout.
        return sdate

    return "-".join(str(part) for part in (year, month_number, day))

In [4]:
def extract_number(sname):
    """Return the first whitespace-separated token of ``sname`` that is a number.

    Returns the token converted to ``int``, or ``None`` when no token
    consists solely of decimal digits.
    """
    for token in sname.split():
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which made int() raise ValueError.
        if token.isdecimal():
            return int(token)
    return None
        

In [5]:
def compute_list(list_string):
    """Turn '|'-separated meeting strings into a sorted list of tuples.

    Each string is split on '|'; the result tuple holds the number found in
    the first field (via ``extract_number``), the date built from the second
    field (via ``extract_date``) and the third field unchanged.

    NOTE(review): the list is ordered by the *string form* of the meeting
    number (so "10" sorts before "2", and a missing number sorts as "None"),
    not by date - confirm which ordering is intended.
    """
    meetings = [
        (extract_number(fields[0]), extract_date(fields[1]), fields[2])
        for fields in (entry.split('|') for entry in list_string)
    ]
    return sorted(meetings, key=lambda meeting: str(meeting[0]))


In [6]:
def extract_details_meetings(soup, meeting_type):
    """Collect, for one meeting type, the issues listed for each of its meetings.

    Walks every ``td`` of every ``tr`` in ``soup``. A cell whose text contains
    "Issue" is treated as the first issue row of a meeting; the meeting is
    kept only when the ``th`` found after the previous sibling row renders
    with ``"</h3>" + meeting_type`` in it. From there the rows are followed
    one by one until the next row contains ``<a name=``.

    Parameters:
        soup: parsed page, queried through ``find_all`` (a BeautifulSoup
            object, judging by the file's imports).
        meeting_type: text expected right after ``</h3>`` in the header cell.

    Returns:
        A list with one entry per meeting; each entry is a list of tuples
        ``(issue, date, html, issue_type)`` where ``issue`` is an int,
        ``date`` is the result of ``extract_date``, ``html`` is the link
        prefixed with 'https://enb.iisd.org' and ``issue_type`` is one of
        'First', 'Issue' or 'Summary'.
    """
    detail_meetings = []
    # NOTE(review): meeting_num is incremented below but never read in this function.
    meeting_num = 1
    for row in soup.find_all("tr"):
        for col in row.find_all('td'):

            # Detect a new meeting: a cell whose text mentions "Issue".
            # NOTE(review): col.string can be None for cells with nested markup,
            # which would make this test raise TypeError - confirm the page layout.
            if("Issue" in col.string):

                # Header lookup: previous sibling row, then the next <th> after it.
                a = row.find_previous_sibling('tr')
                b= a.find_next('th')
                detail = []
                # Number of the previously handled issue (0 before the first one);
                # used below to label an issue as 'First'.
                issue_start = 0
                # Only handle meetings of the requested type.
                # (A former extra condition, `and not("BIS" in str(b))`, is disabled.)
                if("</h3>"+meeting_type in str(b)):

                    date_td = col.find_next_sibling('td')

                    # Follow the rows until the next one contains "<a name="
                    # (presumably the anchor opening the next section - TODO confirm).
                    while( "<a name=" not in str(date_td.find_next('tr'))):

                        # extract issue number (first run of digits in the cell text)
                        issue = int(re.findall('\d+',col.string)[0])

                        # define the issue type: a jump of more than 1 from the
                        # previous issue number is labelled 'First'
                        if(issue - issue_start>1):
                            issue_type = 'First'
                        else:
                            issue_type = 'Issue'

                        # the cell after the date cell (named pdf_td by the author)
                        pdf_td = date_td.find_next_sibling('td')

                        # extract html link from the cell after pdf_td
                        html_td = pdf_td.find_next_sibling('td')
                        html = 'https://enb.iisd.org'+html_td.find('a',href=True)['href']

                        # extract date
                        s = date_td.string
                        date = extract_date(s)

                        # Check if at the end of the webpage: if so leave the loop
                        # WITHOUT appending here (the entry is appended as 'Summary'
                        # after the loop); otherwise move on to the next row.
                        if(pdf_td.find_next('tr') == None):
                            break
                        else:
                            # Rebinds `col`; the enclosing for loop still continues
                            # from its own iterator afterwards.
                            col = pdf_td.find_next('tr').find_next('td')
                            date_td = col.find_next_sibling('td')
                            issue_start = issue

                        # add the issue into the list
                        detail.append((issue,date,html,issue_type))

                    # Handle case when we are at the end of the COP and we have the summary:
                    # the current row is the last one before the next "<a name=" row.
                    if( "<a name=" in str(date_td.find_next('tr'))):
                        # extract issue number
                        issue = int(re.findall('\d+',col.string)[0])

                        pdf_td = date_td.find_next_sibling('td')
                        # extract html link
                        html_td = pdf_td.find_next_sibling('td')
                        html = 'https://enb.iisd.org'+html_td.find('a',href=True)['href']

                        # extract date
                        s = date_td.string
                        date = extract_date(s)
                        detail.append((issue,date,html,'Summary'))

                    # No row at all after this one: record the last values as
                    # 'Summary', store the meeting and stop scanning the page.
                    if(pdf_td.find_next('tr') == None):
                            detail.append((issue,date,html,'Summary'))
                            detail_meetings.append(detail)
                            return detail_meetings


                    detail_meetings.append(detail)
                    meeting_num = meeting_num +1
    return detail_meetings
                    

  

# COP 

In [7]:
def extract_list_cops(soup):
    """Extract all the COP meetings listed on a webpage.

    The COP entries are not all written the same way, so each layout is
    matched with its own pattern. Entries carrying a CMP or BIS qualifier
    have it stripped so that every string has the same '|'-separated
    structure before being handed to ``compute_list``.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        The result of ``compute_list`` on the collected strings (number,
        date and place of each COP).
    """
    # Case 1 : 1, 2, 3, 4, 5, 6, 7, 8, 9
    meetings = soup.find_all(string=re.compile(r"COP\s\d\s.\s"))

    # Case 2 : 10, 23, 24, 25
    meetings += soup.find_all(string=re.compile(r"COP\s[1-2][0-9]\s.\s\d"))

    # Case 3 : 11, 12, 13, 14, 15, 16, 20, 21, 22 ("CMP <n>" with a space)
    with_cmp_spaced = soup.find_all(
        string=re.compile(r"COP\s[1-2][0-9]\s.\sCMP\s\d+\s"))

    # Case 4 : 17, 18, 19 ("CMP<n>" without a space)
    with_cmp_joined = soup.find_all(
        string=re.compile(r"COP\s[1-2][0-9]\s.\sCMP\d+\s"))

    # Case 5 : BIS - only the first matching entry is kept. Slicing (instead
    # of indexing) avoids an IndexError when the page has no BIS entry.
    bis = soup.find_all(string=re.compile(r"COP\s\d+\sBIS"))[:1]

    # Clean the lists so that all entries have the same structure
    meetings += [re.sub(r"- CMP\s..", '', text) for text in with_cmp_spaced]
    meetings += [re.sub(r"- CMP.", '', text) for text in with_cmp_joined]
    meetings += [re.sub(r" BIS.2", '', text) for text in bis]

    return compute_list(meetings)

# INC 

In [8]:
def extract_list_incs(soup):
    """Extract all the INC meetings listed on a webpage.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        The result of ``compute_list`` on the matching strings (number, date
        and place of each INC meeting).
    """
    # Case 1 : 11
    # Raw string: "\s" / "\d" in a normal string literal are invalid escapes.
    meetings = soup.find_all(string=re.compile(r"INC\s\d+\s"))

    return compute_list(meetings)

# SB 

In [9]:
def extract_list_sbs(soup):
    """Extract all the SB meetings listed on a webpage.

    The SB entries appear in several layouts; each is matched with its own
    pattern and the extra qualifiers (AG, AGBM, AWGs, hyphen) are removed so
    that every string has the same structure before ``compute_list``.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        The result of ``compute_list`` on the collected strings (number,
        date and place of each SB meeting).
    """
    # Case 1 : 1, 3, 7, 8, 10, 12, 13, 18, 20 ,22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 48-2, 50
    plain = soup.find_all(string=re.compile(r"SB\s\d+\s.\s\d+"))

    # Case 2 : 4, 6
    with_ag = soup.find_all(string=re.compile(r"SB\s\d+\s-\sAG..\s\d...\d+"))

    # Case 3 : 5
    with_agbm = soup.find_all(
        string=re.compile(r"SB\s\d+\s-\sAG..\s\d............\d"))

    # Case 4 : 50
    hyphenated = soup.find_all(string=re.compile(r"SB-..."))

    # Case 5 : 32
    with_awgs = soup.find_all(string=re.compile(r"SB\s\d+\s- AWG..."))

    # Clean the lists so that all entries have the same structure
    with_ag = [re.sub(r"- AG\d+ \d ", '', text) for text in with_ag]
    with_agbm = [re.sub(r"- AGBM \d . AG....", '', text) for text in with_agbm]
    hyphenated = [re.sub(r"-", ' ', text) for text in hyphenated]
    with_awgs = [re.sub(r" . AWGs", '', text) for text in with_awgs]

    meetings = plain + with_ag + with_agbm + hyphenated + with_awgs

    return compute_list(meetings)

# IPCC

In [10]:
def extract_list_ipccs(soup):
    """Extract all the IPCC meetings listed on a webpage.

    Every hyphen of a matching string is replaced by a space (so "IPCC-52"
    becomes "IPCC 52") before the strings are handed to ``compute_list``.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        The result of ``compute_list`` on the cleaned strings (number, date
        and place of each IPCC meeting).
    """
    # Case 1 : 17, 18, 22, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52
    matches = soup.find_all(string=re.compile(r"IPCC-\d+ . "))

    meetings = [re.sub(r"-", ' ', text) for text in matches]

    return compute_list(meetings)


# AGBM 

In [11]:
def extract_list_agbms(soup):
    """Extract all the AGBM meetings listed on a webpage.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        The result of ``compute_list`` on the matching strings (number, date
        and place of each AGBM meeting).
    """
    # Case 1 : 1, 2, 3, 6, 7
    # Raw string: "\d" in a normal string literal is an invalid escape.
    meetings = soup.find_all(string=re.compile(r"AGBM \d+ . \d+"))

    return compute_list(meetings)

# UNFCCC WS

In [12]:
def extract_list_unfcccs(soup):
    """Extract all the UNFCCC WS meetings listed on a webpage.

    Each matching string is split on '|'. The "UNFCCC WS" prefix is removed
    from the first field, the second field is replaced by the result of
    ``extract_date`` and any further fields are kept as they are.

    Returns a list with one such list of fields per meeting.
    """
    def _to_fields(text):
        # Split one page entry and normalise its first two fields.
        fields = text.split("|")
        fields[0] = re.sub("UNFCCC WS", '', fields[0])
        fields[1] = extract_date(fields[1])
        return fields

    matches = soup.find_all(string=re.compile("UNFCCC WS.*?"))
    return [_to_fields(text) for text in matches]

# ADP 


In [13]:
def extract_list_adps(soup):
    """Extract all the ADP meetings listed on a webpage.

    Each matching string is split on '|'. "ADP" is removed from the first
    field, the second field is replaced by the result of ``extract_date``
    and any further fields are kept as they are.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # Case 1 : 2, 2-10, 2-11, 2-4, 2-6, 2-8
    # Raw string: "\d" in a normal string literal is an invalid escape.
    entries = soup.find_all(string=re.compile(r"ADP \d......"))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("ADP", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)
    return separated

# AWG

## AWGS CCWG

In [14]:
def extract_list_awgs_t1(soup):
    """Extract all the AWGs CCWG meetings listed on a webpage.

    Each matching string is split on '|'. "AWGs CCWG" is removed from the
    first field, the second field is replaced by the result of
    ``extract_date`` and any further fields are kept as they are.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # Case 1 : 1, 7, 9, 11, 12, 14, 16, 17i
    # Raw string: "\d" in a normal string literal is an invalid escape.
    entries = soup.find_all(string=re.compile(r"AWGs CCWG\d..."))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("AWGs CCWG", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)
    return separated

## AWGS RCCWG

In [15]:
def extract_list_awgs_t2(soup):
    """Extract all the AWGs RCCWG meetings listed on a webpage.

    Each matching string is split on '|'. "AWGs RCCWG" is removed from the
    first field, the second field is replaced by the result of
    ``extract_date`` and any further fields are kept as they are.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # Case 1 : 7
    # Raw string: "\d" in a normal string literal is an invalid escape.
    entries = soup.find_all(string=re.compile(r"AWGs RCCWG\d ..."))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("AWGs RCCWG", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)
    return separated

## AWGLCA

In [16]:
def extract_list_awgs_t3(soup):
    """Extract all the AWGLCA meetings listed on a webpage.

    Each matching string is split on '|'. "AWGLCA" is removed from the first
    field, the second field is replaced by the result of ``extract_date``
    and any further fields are kept as they are.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # Case 1 : 1, 2, 5
    # Raw string: "\d" in a normal string literal is an invalid escape.
    entries = soup.find_all(string=re.compile(r"AWGLCA \d ..."))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("AWGLCA", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)
    return separated

## AWG

In [17]:
def extract_list_awgs_t4(soup):
    """Extract all the AWG meetings (written "AWG-<n>") listed on a webpage.

    Each matching string is split on '|'. "AWG-" is removed from the first
    field, the second field is replaced by the result of ``extract_date``
    and any further fields are kept as they are.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # Case 1 : 4
    # Raw string: "\d" in a normal string literal is an invalid escape.
    entries = soup.find_all(string=re.compile(r"AWG-\d..."))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("AWG-", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)

    return separated

## Tech-Work

In [None]:
def extract_list_tech_works(soup):
    """Extract all the Tech-Work meetings listed on a webpage.

    Each matching string is split on '|'. "Tech-Work" is removed from the
    first field, the second field is replaced by the result of
    ``extract_date`` and any further fields are kept as they are.

    This function was previously also named ``extract_list_awgs_t4``, which
    silently replaced the AWG extractor of the same name; it now has its own
    name.

    Parameters:
        soup: parsed page, queried through ``find_all(string=...)``.

    Returns:
        A list with one list of fields per meeting.
    """
    # The '|' must be escaped: unescaped it is the regex alternation operator,
    # and "Tech-Work |" (i.e. "Tech-Work " OR the empty pattern) matches every
    # string on the page, including those without a '|' separator.
    entries = soup.find_all(string=re.compile(r"Tech-Work \|"))

    separated = []
    for entry in entries:
        fields = entry.split("|")
        fields[0] = re.sub("Tech-Work", '', fields[0])
        fields[1] = extract_date(fields[1])
        separated.append(fields)

    return separated