# c1_extract_paragraphes_issues
For each issue listed in list_meetings.csv, extract all the paragraphs from the issue's HTML page and write them to a file named paragraphes<issue number>.txt.

In [1]:
import urllib
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
import csv
import numpy as np
import requests

In [2]:
def write_paragraphe(number, paragraphes):
    """Write the distinct entries of ``paragraphes`` to ``paragraphes<number>.txt``.

    Args:
        number: Issue identifier; ``str(number)`` is embedded in the file name.
        paragraphes: Iterable of hashable items (normally strings). Each
            distinct item is written once, as ``str(item)``, on its own line.

    The file is created relative to the current working directory and is
    overwritten if it already exists.
    """
    # dict.fromkeys removes duplicates like set() would, but keeps first-seen
    # order so the generated file is identical from one run to the next.
    unique_paragraphes = dict.fromkeys(paragraphes)
    # The context manager guarantees the handle is closed even if a write fails.
    with open("paragraphes" + str(number) + ".txt", "w", encoding="utf-8") as out_file:
        out_file.writelines(str(line) + "\n" for line in unique_paragraphes)

In [3]:
def remove_unwanted_p(paragraphes):
    """Strip HTML markup from each paragraph and drop all-uppercase entries.

    Args:
        paragraphes: Iterable of paragraph fragments; each one is converted
            with ``str()`` before processing.

    Returns:
        A list of cleaned strings. Every ``<strong>...</strong>`` span is
        replaced by a single space, all remaining tags are removed, and
        entries whose remaining text is entirely uppercase (titles) are
        discarded.
    """
    # Applied in this order. The specific tags are removed before the generic
    # "<.*?>" pass so that a stray "<" in the text cannot swallow them.
    substitutions = (
        (r'<p align="justify">', ''),
        (r'<p align="CENTER">', ''),
        # Non-greedy: a greedy ".+" would also delete the ordinary text lying
        # between two separate <strong> spans of the same paragraph.
        (r'<strong>.+?</strong>', ' '),
        (r'<p align="justify" class="ENB-Body">', ''),
        (r'<font face="Verdana" size="2">', ''),
        (r'<font.*?>', ''),
        (r'<a.*?>', ''),
        (r'<.*?>', ''),
    )

    cleaned = []
    for paragraph in paragraphes:
        text = str(paragraph)
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        # Titles and headings are written fully in uppercase; skip them.
        if not text.isupper():
            cleaned.append(text)
    return cleaned

In [4]:
def clean_page_to_parse(page_string):
    """Normalise a raw HTML page so that its paragraphs can be collected.

    The page is decoded to text, the leading designer comments (and anything
    else between the first ``<!--`` and the ``<html`` tag) are removed, the
    ``<HTML>`` tags are lower-cased, and ``<b>``/``</b>`` are rewritten as
    paragraph boundaries.

    Args:
        page_string: Page content as ``bytes`` (e.g. the result of
            ``urlopen(...).read()``) or as ``str``.

    Returns:
        The cleaned page encoded as UTF-8 ``bytes``.
    """
    if isinstance(page_string, (bytes, bytearray)):
        # str(bytes) would produce the "b'...'" repr, with newlines and
        # non-ASCII characters turned into backslash escapes. Decode instead.
        raw = bytes(page_string)
        try:
            text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Fallback for pages that are not valid UTF-8.
            text = raw.decode('cp1252', errors='replace')
    else:
        text = str(page_string)

    designer_comments = (
        '<!-- WWW Designer Jeff Anderson janderson@iisd.ca --!>',
        '<!-- WWW design Jeff Anderson janderson@iisd.ca ---!>',
    )
    for comment in designer_comments:
        text = text.replace(comment, '')

    # DOTALL lets the match span line breaks between the comment and <html.
    text = re.sub(r'<!--.*?<html', '<html', text, flags=re.DOTALL)

    text = text.replace('<HTML>', '<html>')
    text = text.replace('</HTML>', '</html>')
    # Bold runs are treated as paragraph boundaries.
    text = text.replace('<b>', '</p><p>')
    text = text.replace('</b>', '</p>')

    return text.encode('utf-8')


In [5]:
def remove_special_char(p):
    """Flatten whitespace in ``p`` and split it on ``<p>`` markers.

    Carriage returns, newlines and tabs are replaced by a space, both as real
    control characters and as the literal two-character sequences ``\\r``,
    ``\\n``, ``\\t`` and ``\\x``. Runs of two or more whitespace characters
    are then collapsed to a single space.

    Args:
        p: The text of one paragraph (a ``str``).

    Returns:
        The list of pieces obtained by splitting the cleaned text on ``'<p>'``.
    """
    # One pass covers the control characters and their escaped spellings.
    # The two-character matches cannot overlap, so this gives the same result
    # as replacing each sequence in turn.
    text = re.sub(r'[\r\n\t]|\\[xrnt]', ' ', p)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    text = re.sub(r'\s\s+', ' ', text)
    return text.split('<p>')

In [6]:
def remove_footer(paragraphes):
    """Return the paragraphs that precede the first footer marker.

    The first entry containing one of the marker phrases, and everything
    after it, is cut off. If no entry contains a marker, the whole sequence
    is returned.
    """
    footer_markers = ('IN THE CORRIDORS', 'THINGS TO LOOK', 'This issue of', 'BRIEF ANALYSIS OF')
    # Index of the first paragraph holding a marker; defaults to "keep all".
    cut = next(
        (
            index
            for index, paragraph in enumerate(paragraphes)
            if any(marker in paragraph for marker in footer_markers)
        ),
        len(paragraphes),
    )
    return paragraphes[:cut]

In [22]:
def extract_p_tags(html_link):
    """Download ``html_link`` and return its cleaned paragraph texts.

    Args:
        html_link: URL of the page to fetch.

    Returns:
        A list of strings: the content of the page's ``<p>`` tags after
        whitespace normalisation, footer removal and markup stripping.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(html_link) as response:
        raw_page = response.read()
    soup = BeautifulSoup(clean_page_to_parse(raw_page), 'html.parser')

    # Prefer top-level <p> tags; fall back to every <p> in the document.
    p_tags = soup.find_all('p', recursive=False)
    if not p_tags:
        p_tags = soup.find_all('p')

    # A single tag may yield several fragments (split on '<p>').
    fragments = []
    for tag in p_tags:
        fragments += remove_special_char(str(tag))

    # Cut the footer first, then strip markup and uppercase-only titles.
    return remove_unwanted_p(remove_footer(fragments))

In [23]:
def extract_p_tags_45(html_link):
    """Collect the paragraphs of every page linked from ``html_link``.

    Used for issues numbered below 45, where ``html_link`` is a page of
    links rather than the article itself.

    Args:
        html_link: URL of the page listing the links to follow.

    Returns:
        The concatenated lists returned by ``extract_p_tags`` for each linked
        page whose ``href`` contains a digit, except ``1200000e.html``.
    """
    with urlopen(html_link) as response:
        index_page = response.read()
    # Name the parser explicitly, as extract_p_tags does, so the result does
    # not depend on which optional parsers happen to be installed.
    index_soup = BeautifulSoup(index_page, 'html.parser')
    issue_anchors = index_soup.find_all('a', href=re.compile(r'\d+'))

    list_tp = []
    for anchor in issue_anchors:
        # Skip the link back to the main page (all issues).
        if anchor['href'] != '1200000e.html':
            list_tp += extract_p_tags('https://enb.iisd.org/vol12/' + anchor['href'])

    return list_tp


In [24]:
#Extract from "csv_file" all the html link to be able to extract all the <p> tags
def extract_from_csv_list_issues(csv_file):
    """Read ``csv_file`` and return its rows without the header line.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings, excluding the first row.
        An empty or header-only file yields an empty list.
    """
    # newline='' is what the csv module expects for files passed to a reader;
    # the context manager closes the file once the rows are loaded.
    with open(csv_file, newline='') as csv_handle:
        rows = list(csv.reader(csv_handle))
    return rows[1:]

In [25]:
def extract_paragraphes_from_issue(number):
    """Extract, save and return the paragraphs of issue ``number``.

    The issue is looked up in ``list_meetings.csv``: column 4 holds the issue
    number and column 6 the link to its page. The extracted paragraphs are
    written with ``write_paragraphe``.

    Args:
        number: Issue number (int) to process.

    Returns:
        A list of the distinct paragraphs extracted (order not guaranteed).
        The list is empty when the page does not answer with HTTP 200.

    Raises:
        ValueError: If ``number`` is not present in ``list_meetings.csv``.
    """
    list_meetings = extract_from_csv_list_issues('list_meetings.csv')
    line = next((row for row in list_meetings if int(row[4]) == number), None)
    if line is None:
        raise ValueError('Issue ' + str(number) + ' not found in list_meetings.csv')
    issue_link = line[6]

    if number < 45:
        # Issues below 45: the link is a page of links to the actual pages.
        list_pt = extract_p_tags_45(issue_link)
    else:
        # Issues from 45 on: the link is the page itself. Check the matched
        # row's own link, not a row picked by positional index.
        response = requests.get(issue_link)
        if response.status_code == 200:
            list_pt = extract_p_tags(issue_link)
        else:
            print('Issue ', number, ' not available, HTTP status ', response.status_code)
            list_pt = []

    write_paragraphe(number, list_pt)
    print('Issue ', number)
    return list(set(list_pt))

In [26]:
# Run the extraction for issue 305: writes paragraphes305.txt and keeps the
# returned list of distinct paragraphs in p.
p = extract_paragraphes_from_issue(305)


[<p>\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#FFFFFF" face="Verdana, Arial" size="-1">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\tPDF Format</font></p>, <p>\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#FFFFFF" face="Verdana, Arial" size="-1">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t Spanish Version</font></p>, <p>\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#FFFFFF" face="Verdana, Arial" size="-1">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\tFrench Version</font></p>, <p>\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#FFFFFF" face="Verdana, Arial" size="-1">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\tJapanese Version</font></p>, <p>\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#4E009B" face="Verdana, Arial" size="-1">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<span style="text-decoration: none">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<a href="/climate/sb24/">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<font color="#002D00">\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t<span style="text-decoration: none">\r\n\t\t\t\t\t\t\t\t\t\t\t\t\tIISD RS<br/>\r\