# b1_extract_all_paragraphes
From list_meetings.csv, extract all the paragraphs from the HTML page of each issue and write them to the file paragraphes.txt.

In [1]:
import urllib
from urllib.request import urlopen
import re
from bs4 import BeautifulSoup
from dateutil.parser import parse
import csv
import requests

In [None]:
def write_paragraphe(paragraphes, number=""):
    """Write the unique entries of "paragraphes" to paragraphes<number>.txt.

    Args:
        paragraphes: iterable of hashable items; each one is converted with
            str() and written on its own line.
        number: suffix inserted between "paragraphes" and ".txt". Defaults to
            an empty string, which yields "paragraphes.txt".

    Duplicates are dropped and the first-seen order is kept, so the output
    file is deterministic from one run to the next.
    """
    # dict.fromkeys de-duplicates like set() but preserves insertion order.
    unique_paragraphes = list(dict.fromkeys(paragraphes))
    file_name = "paragraphes" + str(number) + ".txt"
    # The context manager closes the file even if a write fails.
    with open(file_name, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{line}\n" for line in unique_paragraphes)

In [2]:
def check_headlines(paragraphes):
    """ Return the entries of "paragraphes" that are not ENB-Headline3 headlines.

    An entry is treated as a headline when it contains the text
    class="ENB-Headline3 ; every other entry is kept, in its original order.
    """
    headline_marker = 'class="ENB-Headline3'
    return [text for text in paragraphes if headline_marker not in text]

In [3]:
def remove_unwanted_p(paragraphes):
    """ Strip markup and drop headline / all-uppercase fragments.

    The input strings are joined with newlines and re-split on bare <p> and
    </p> tags. Fragments flagged by check_headlines are removed, the
    <p align=...> openers, <strong>...</strong> spans and any remaining tags
    are stripped, and fragments whose text is entirely uppercase are dropped.

    Args:
        paragraphes: list of strings holding the HTML of <p> elements.

    Returns:
        A list of cleaned text fragments (empty fragments are kept).
    """
    fragments = re.split(r'<p>|</p>', '\n'.join(paragraphes))
    fragments = check_headlines(fragments)

    cleaned = []
    for fragment in fragments:
        fragment = fragment.replace('<p align="justify">', '')
        fragment = fragment.replace('<p align="CENTER">', '')
        # Non-greedy: remove each <strong> span on its own. A greedy ".+"
        # would also swallow the body text lying between two separate spans.
        fragment = re.sub(r'<strong>.+?</strong>', ' ', fragment)
        fragment = re.sub(r'<.*?>', '', fragment)
        # str.isupper() is False for strings without cased characters, so
        # empty or purely numeric fragments are kept.
        if not fragment.isupper():
            cleaned.append(fragment)
    return cleaned

In [4]:
def clean_page_to_parse(page_string):
    """ Normalise a raw HTML page before it is handed to the parser.

    Removes the designer comment, lowercases the <HTML> / </HTML> tags and
    turns <b>...</b> into paragraph boundaries so that bold text ends up in
    its own <p> element.

    Args:
        page_string: the page as bytes (e.g. the result of a response
            read()) or as str.

    Returns:
        The cleaned page as bytes. A str input is encoded as UTF-8.
    """
    # Work on the raw bytes. Calling str() on a bytes object yields its
    # repr ("b'...'" with escaped newlines, quotes and non-ASCII bytes),
    # which corrupts the text the parser sees.
    if isinstance(page_string, str):
        page = page_string.encode('utf-8')
    else:
        page = bytes(page_string)

    # Applied in order; every pattern is a literal, so no regex is needed.
    replacements = (
        (b'<!-- WWW Designer Jeff Anderson janderson@iisd.ca --!>', b''),
        (b'<HTML>', b'<html>'),
        (b'</HTML>', b'</html>'),
        (b'<b>', b'</p><p>'),
        (b'</b>', b'</p>'),
    )
    for old, new in replacements:
        page = page.replace(old, new)

    return page

In [5]:
def remove_special_char(p):
    """ Replace line-break characters and escape residue with single spaces.

    Real carriage returns and newlines, as well as the literal two-character
    sequences backslash-x, backslash-r and backslash-n, are each replaced by
    a space; runs of two or more whitespace characters are then collapsed
    into one space.

    Args:
        p: the string to clean.

    Returns:
        The cleaned string.
    """
    # NOTE(review): the literal backslash sequences presumably come from
    # pages that were stringified through the repr of a bytes object.
    for token in ("\r", "\\x", "\n", "\\r", "\\n"):
        p = p.replace(token, " ")
    # Raw string: "\s" is not a valid escape in a normal string literal.
    return re.sub(r'\s\s+', ' ', p)

In [6]:
def remove_footer(paragraphes):
    """ Cut the list at the first entry that contains a footer marker.

    Returns the entries that precede the first one containing any of the
    marker phrases below; when no entry matches, a copy of the whole list
    is returned.
    """
    footer_markers = (
        'THINGS TO LOOK',
        'This issue of',
        'IN THE CORRIDORS',
        'BRIEF ANALYSIS OF',
    )
    for position, text in enumerate(paragraphes):
        if any(marker in text for marker in footer_markers):
            return paragraphes[:position]

    return paragraphes[:len(paragraphes)]

In [7]:
 def extract_p_tags(html_link):
    """ Extract the <p> tag from a specific html link. """
    #Parse the page 
    page = urlopen(html_link).read()
    page = clean_page_to_parse(page)
    soup = BeautifulSoup(page,'html.parser')
    list_tp = soup.find_all('p',recursive=False)
    if(len(list_tp) == 0):
        x = "."
        list_tp = list(soup.find_all('p',recursive="False"))


    #Extract all the text and remove undesired paragraphes
    j= '.-.'
    list_tp = [remove_special_char(str(p)) for p in list_tp]
    list_tp = remove_footer(list_tp)
    # Remove titles and sentences all in uppercase
    list_tp = remove_unwanted_p(list_tp)
    return list_tp

In [8]:
def extract_p_tags_45(html_link):
    """ Extract <p> tags from the pages linked inside "html_link" (Issue# < 45).

    The page at "html_link" is treated as an index: every <a> element whose
    href contains a digit is followed, relative to
    https://enb.iisd.org/vol12/, and its paragraphs are extracted with
    extract_p_tags.

    Args:
        html_link: URL of the index page.

    Returns:
        The concatenated list of paragraph strings of all linked pages.

    Raises:
        Whatever urlopen raises when a page cannot be fetched.
    """
    with urllib.request.urlopen(html_link) as response:
        index_page = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and matches extract_p_tags).
    soup_link = BeautifulSoup(index_page, 'html.parser')
    anchors = soup_link.find_all('a', href=re.compile(r'\d+'))

    list_tp = []
    for anchor in anchors:
        href = anchor['href']
        # doesn't use the link of the main page (all issues)
        if href != '1200000e.html':
            list_tp += extract_p_tags('https://enb.iisd.org/vol12/' + href)

    return list_tp

In [9]:
def extract_from_csv_list_issues(csv_file):
    """ Read "csv_file" and return its data rows, without the header row.

    Args:
        csv_file: path of the CSV file to read.

    Returns:
        A list of rows (each a list of strings), excluding the first row.
        An empty or header-only file yields an empty list.
    """
    # newline='' lets the csv module handle line endings itself; the
    # context manager closes the file once the rows are read.
    with open(csv_file, newline='') as f:
        rows = list(csv.reader(f))
    return rows[1:]

In [10]:
def write_list_pargraphes(list_paragraphes):
    """ Write every entry of "list_paragraphes" to paragraphes.txt, one per line.

    Args:
        list_paragraphes: iterable of items; each one is converted with
            str() before being written.

    The file is created in (or overwritten in) the current working
    directory and is written as UTF-8.
    """
    # The context manager closes the file even if a write fails.
    with open("paragraphes.txt", "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{line}\n" for line in list_paragraphes)

In [11]:
def extract_from_csv_p_tags(csv_file):
    """ Extract the paragraphs of every issue listed in "csv_file".

    For each data row, column 4 is read as the issue number and column 6 as
    the link of the issue page. Issues below 45 are index pages handled by
    extract_p_tags_45; all other issues are fetched directly with
    extract_p_tags. The collected paragraphs are written with
    write_list_pargraphes, and a percentage progress indicator is printed.

    Args:
        csv_file: path of the CSV file listing the issues.
    """
    # Local import keeps this function self-contained.
    from urllib.error import HTTPError

    list_meetings = extract_from_csv_list_issues(csv_file)
    total = len(list_meetings)
    list_pt = []
    for i, row in enumerate(list_meetings):
        issue_number = int(row[4])
        html_link = row[6]

        # Extract for Issue# < 45 (index page linking to the real pages)
        if issue_number < 45:
            list_pt += extract_p_tags_45(html_link)

        # Extract for every other issue (Issue# >= 45)
        else:
            # Fetch the page once and skip it when the server answers with
            # an HTTP error, instead of downloading it a first time only to
            # inspect the status code.
            try:
                list_pt += extract_p_tags(html_link)
            except HTTPError:
                pass  # deliberate best effort: unavailable pages are skipped

        print(f'{(i+1)/total*100:.2f}%', end='\r')
    write_list_pargraphes(list_pt)

In [12]:
# Run the full extraction for the issues listed in list_meetings.csv; the
# result is written to paragraphes.txt in the current working directory.
extract_from_csv_p_tags('list_meetings.csv')

0.14%0.27%0.41%0.54%0.68%0.82%0.95%1.09%1.23%1.36%1.50%1.63%1.77%1.91%2.04%2.18%2.32%2.45%2.59%2.72%2.86%3.00%3.13%3.27%3.41%3.54%3.68%3.81%3.95%4.09%4.22%4.36%4.50%4.63%4.77%4.90%5.04%5.18%5.31%5.45%5.59%5.72%5.86%5.99%6.13%6.27%6.40%6.54%6.68%6.81%6.95%7.08%7.22%7.36%7.49%7.63%7.77%7.90%8.04%8.17%8.31%8.45%8.58%8.72%8.86%8.99%9.13%9.26%9.40%9.54%9.67%9.81%9.95%10.08%10.22%10.35%10.49%10.63%10.76%10.90%11.04%11.17%11.31%11.44%11.58%11.72%11.85%11.99%12.13%12.26%12.40%12.53%12.67%12.81%12.94%13.08%13.22%13.35%13.49%13.62%13.76%13.90%14.03%14.17%14.31%14.44%14.58%14.71%14.85%14.99%15.12%15.26%15.40%15.53%15.67%15.80%15.94%16.08%16.21%16.35%16.49%16.62%16.76%16.89%17.03%17.17%17.30%17.44%17.57%17.71%17.85%17.98%18.12%18.26%18.39%18.53%18.66%18.80%18.94%19.07%19.21%19.35%19.48%19.62%19.75%19.89%20.03%20.16%20.30%20.44%20.57%20.71%20.84%20.98%21.12%21.25%21.39%21.53%21.66%21.80%21.93%22.07%22.21%22.34%22.48%22.62%22.75%22.89%23.02%23.16%23.30%23.43%23.57%23.71%23.84%23.98%24.11%24.25%24.39