In [1]:
import camelot
import matplotlib
%matplotlib inline

import pandas as pd
import numpy as np

import requests
import bs4

import re
import io

import wget
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import os.path

In [3]:
'''Creating the list of bulletins from the site'''
list_html = requests.get('https://www.paho.org/hq/index.php?option=com_content&view=article&id=730:2009-measlesrubella-weekly-bulletin&Itemid=39426&lang=en', verify=False).text
list_soup = bs4.BeautifulSoup(list_html,'lxml')



In [5]:
'''Three match conditions for the list of links on the site'''
match_2 = re.compile('Measles Rubella Weekly Bulletin \(([0-9]+-[0-9]+)\) -\s+(\d+) (\w+) (\d+)$')
match = re.compile('Measles.Rubella Weekly Bulletin \(([0-9]+)\) -\s+(\d+) (\w+) (\d+)$')
match_space = re.compile('Measles.Rubella Weekly Bulletin \(([0-9]+)\) -\s+(\d+) (\w+) (\d+) $')

In [7]:
'''Create a list of dictionaries representing bulletins, in order of recent to older,
each with information containing url, issue, day, month, year'''
report_list = []

for url in list_soup.find_all('a'):
    url_match = match_space.match(url.text) or match_2.match(url.text) or match.match(url.text)
    
    if url_match is not None:
        report_list.append(
            {
                'link': url.text,
                'url': url.get('href'),
                'issue': url_match.group(1),
                'day': url_match.group(2),
                'month': url_match.group(3),
                'year': url_match.group(4),               
            }
        )

In [8]:
'''Download the report if it does not exist in directory; directory with pdfs should be named paho_pdfs'''
for report in report_list:
    if int(report['year']) >= 2011:
        url = 'https://www.paho.org'+report['url']
        filename = f"Issue({report['issue']})-{report['month']} {report['day']}, {report['year']}.pdf"
        filepath = 'paho_pdfs/' + filename
        if not os.path.exists(filepath):
            wget.download(url, filepath)

In [2]:
'''Set column names used for read_pdf, different formattings and setups'''
cols = ['Subregion and Country', 'Susp. Cases 2019', 'Measles Confirmed 2019 Clin.', 'Measles Confirmed 2019 Lab.',
        'Measles Confirmed 2019 Total', 'Year/Week Last Conf. Measles case', 'Rubella Confirmed 2019 Clin.',
        'Rubella Confirmed 2019 Lab.', 'Rubella Confirmed 2019 Total', 'Year/Week Last Conf. Rubella Case',
        'Diagnosis of Discarded Cases 2019 Dengue', 'Diagnosis of Discarded Cases 2019 Others',
        'Congenital Rubella Syndrome Susp.', 'Congenital Rubella Syndrome Conf.', 'Congenital Rubella Syndrome CRI*',
        'Year/Week Last Conf. CRS Case']

cols_w_epi = ['Subregion and Country', 'Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
        'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 EPI', 'Measles Confirmed 2019 Total',
        'Year/Week Last Conf. Measles case', 'Rubella Confirmed 2019 Clin.', 'Rubella Confirmed 2019 Lab.',
        'Rubella Confirmed 2019 Total', 'Year/Week Last Conf. Rubella Case',
        'Diagnosis of Discarded Cases 2019 Dengue', 'Diagnosis of Discarded Cases 2019 Others',
        'Congenital Rubella Syndrome Susp.', 'Congenital Rubella Syndrome Conf.', 'Congenital Rubella Syndrome CRI*',
        'Year/Week Last Conf. CRS Case']

cols_extra = ['Subregion and Country', 'Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
        'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 Total', 'Year/Week Last Conf. Measles case',
        'Rubella Confirmed 2019 Clin.', 'Rubella Confirmed 2019 Lab.', 'Rubella Confirmed 2019 Total',
        'Year/Week Last Conf. Rubella Case', 'Diagnosis of Discarded Cases 2019 Dengue', 'Remove',
        'Diagnosis of Discarded Cases 2019 Others', 'Congenital Rubella Syndrome Susp.',
        'Congenital Rubella Syndrome Conf.', 'Congenital Rubella Syndrome CRI*', 'Year/Week Last Conf. CRS Case']

new_cols = ['Subregion', 'Country', 'Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
        'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 Total', 'Year/Week Last Conf. Measles case',
        'Rubella Confirmed 2019 Clin.', 'Rubella Confirmed 2019 Lab.', 'Rubella Confirmed 2019 Total',
        'Year/Week Last Conf. Rubella Case', 'Diagnosis of Discarded Cases 2019 Dengue',
        'Diagnosis of Discarded Cases 2019 Others', 'Congenital Rubella Syndrome Susp.',
        'Congenital Rubella Syndrome Conf.', 'Congenital Rubella Syndrome CRI*', 'Year/Week Last Conf. CRS Case']

cols_no_crs = ['Subregion and Country', 'Susp. Cases 2019', 'Measles Confirmed 2019 Clin.', 'Measles Confirmed 2019 Lab.',
        'Measles Confirmed 2019 Total', 'Year/Week Last Conf. Measles case', 'Rubella Confirmed 2019 Clin.',
        'Rubella Confirmed 2019 Lab.', 'Rubella Confirmed 2019 Total', 'Year/Week Last Conf. Rubella Case',
        'Diagnosis of Discarded Cases 2019 Dengue', 'Diagnosis of Discarded Cases 2019 Others',
        'Congenital Rubella Syndrome Susp.', 'Congenital Rubella Syndrome Conf.', 'Congenital Rubella Syndrome CRI*']

In [9]:
'''Extract the data we want from the PDFs
This is the part that has all the tinkering: changing camelot.read_pdf values
Commented below are things that need to be fixed if this script is used for other countries'''
def read_pdf(file):
    try:
        filepath = 'paho_pdfs/' + file
        
        tables = camelot.read_pdf(filepath, flavor='stream', edge_tol=500, row_tol=9, strip_text='\n', table_areas=['10, 360, 600, 90'])
        tables_low = camelot.read_pdf(filepath, flavor='stream', edge_tol=500, row_tol=9, strip_text='\n', table_areas=['10, 330, 600, 40'])
        tables_wide = camelot.read_pdf(filepath, flavor='stream', edge_tol=500, row_tol=7, strip_text='\n', table_areas=['30, 340, 620, 50'])
        
        #print(camelot.plot(tables_wide[0], kind='contour'))
        
        #TODO sometimes the table is in between tables and tables_low; it has 18 columns so it makes tables_low
        #the final_table, but then it cuts off the first few rows of the table
        #is ok for venezuela, but if expanding in the future will need to fix this
        
        #TODO sometimes it recognizes two lines as one and merges them so the values don't work anymore;
        #don't know how to fix this becuase has to do with how the table is being read not where it's being read
        
        table = tables[0].df
        table_low = tables_low[0].df
        table_wide = tables_wide[0].df
        final_table = None
        
        epi_flag = False
        
        
        if ('VEN' not in table_low.loc[:,0]) and ('VEN' not in table.loc[:,0]):
            final_table = table_wide
        else:
            if table.shape[1] == 17:
                final_table = table
            elif table.shape[1] == 16:
                final_table = table
            else:
                final_table = table_low
 
        #print(final_table.head(8))
        
        sep_flag = False
        
        try:
            data_start = final_table[final_table.loc[:, 0] == 'VEN'].index[0]
            data_end = final_table[final_table.loc[:, 0] == 'TOTAL'].index[0]
        except:
            sep_flag = True
            final_table.columns = new_cols
            data_start = final_table[final_table.loc[:, 1] == 'VEN'].index[0]
            data_end = final_table[final_table.loc[:, 1] == 'URY'].index[0]
        #^ the lines above will need to be changed when figure out how to not cut off top few rows
#         data_start = final_table[final_table.loc[:, 'Subregion and Country'] == 'AND BOL'].index[0]
        
        
        data = final_table.loc[data_start:data_end,:]
        
        if not (final_table.loc[11,11].isdigit() or final_table.loc[11,10] == '…'):
            data.columns = cols_extra
        elif final_table.shape[1] == 17:
            epi_flag = True
            data.columns = cols_w_epi
        elif final_table.shape[1] == 16:
            data.columns = cols
        elif final_table.shape[1] == 15:
            data.columns = cols_no_crs
        
        
        if not sep_flag:
            if epi_flag:
                ven_df = data.loc[(data['Subregion and Country'] == 'VEN'), ['Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
                                                            'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 EPI',
                                                            'Measles Confirmed 2019 Total']]

            else:
                 ven_df = data.loc[(data['Subregion and Country'] == 'VEN'), ['Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
                                                            'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 Total']]           
        elif sep_flag:
            ven_df = data.loc[(data['Country'] == 'VEN'), ['Susp. Cases 2019', 'Measles Confirmed 2019 Clin.',
                                                            'Measles Confirmed 2019 Lab.', 'Measles Confirmed 2019 Total']]          
        
        return list(ven_df.iloc[0, :].values)

    except Exception as e:
        print(e)
        return e

In [10]:
'''Extracting the data and adding it to the list of dictionaries initialized before with total confirmed measles cases'''
base_url = 'https://www.paho.org'

for report in report_list:
    if int(report['year']) >= 2011:
        filename = f"Issue({report['issue']})-{report['month']} {report['day']}, {report['year']}.pdf"
        #report['filename'] = filename

        #report['parse_result'] = read_pdf(base_url + report['url'], filename)

        values_list = read_pdf(filename)
        
        if isinstance(values_list, list):
            #print("length: " + str(len(values_list)))
            #print("list: " + str(values_list))
            print(filename)

            report['error'] = None
            #report['suspected_cases'] = values_list[0]
            #report['measles_confirmed_clin'] = values_list[1]
            #report['measles_confirmed_lab'] = values_list[2]
            total_index = None
            if len(values_list) == 5:
                #report['measles_confirmed_epi'] = values_list[3]
                total_index = 4
            else:
                #report['measles_confirmed_epi'] = None
                total_index = 3
            
            if values_list[total_index] == '…' or values_list[total_index] == '...':
                report['measles_confirmed_total'] = '...'
            else:
                total_val = values_list[total_index]
                nums = [int(i) for i in total_val if i.isdigit()] 
                func = lambda nums: int(''.join(str(i) for i in nums))
                total_num = func(nums)
                report['measles_confirmed_total'] = total_num

        else:
            report['error'] = values_list
            report['suspected_cases'] = None
            report['measles_confirmed_clin'] = None
            report['measles_confirmed_lab'] = None
            report['measles_confirmed_epi'] = None
            report['measles_confirmed_total'] = None
    else:
        break

Issue(02)-Jannuary 11, 2020.pdf
Issue(01)-Jannuary 4, 2020.pdf
Issue(52)-December 28, 2019.pdf
Issue(50-51)-December 14, 2019.pdf
Issue(49)-December 7, 2019.pdf
Issue(48)-November 30, 2019.pdf
Issue(47)-November 23, 2019.pdf
Issue(46)-November 16, 2019.pdf
Issue(45)-November 9, 2019.pdf
Issue(44)-November 2, 2019.pdf
Issue(43)-October 26, 2019.pdf
Issue(42)-October 19, 2019.pdf
Issue(41)-October 12, 2019.pdf
Issue(40)-October 5, 2019.pdf
Issue(39)-September 28, 2019.pdf
Issue(38)-September 21, 2019.pdf
Issue(37)-September 14, 2019.pdf
Issue(36)-September 7, 2019.pdf
Issue(35)-August 31, 2019.pdf
Issue(34)-August 24, 2019.pdf
Issue(33)-August 17, 2019.pdf
Issue(32)-August 10, 2019.pdf
Issue(31)-August 3, 2019.pdf
Issue(30)-July 27, 2019.pdf
Issue(29)-July 20, 2019.pdf
Issue(28)-July 13, 2019.pdf
Issue(27)-July 6, 2019.pdf
Issue(26)-June 29, 2019.pdf
Issue(25)-June 22, 2019.pdf
Issue(24)-June 15, 2019.pdf
Issue(23)-June 8, 2019.pdf
Issue(22)-June 1, 2019.pdf
Issue(21)-May 25, 2019.pdf
Is

Issue(40)-October 4, 2014.pdf
Issue(39)-September 27, 2014.pdf
Issue(38)-September 20, 2014.pdf
Issue(37)-September 13, 2014.pdf
Issue(36)-September 6, 2014.pdf
Issue(35)-August 30, 2014.pdf
Issue(34)-August 23, 2014.pdf
Issue(33)-August 16, 2014.pdf
Issue(32)-August 9, 2014.pdf
Issue(31)-August 2, 2014.pdf
Issue(30)-July 26, 2014.pdf
Issue(29)-July 19, 2014.pdf
Issue(28)-July 12, 2014.pdf
Issue(27)-July 5, 2014.pdf
Issue(26)-June 28, 2014.pdf
Issue(25)-June 21, 2014.pdf
Issue(24)-June 14, 2014.pdf
Issue(23)-June 7, 2014.pdf
cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>
Issue(21)-May 24, 2014.pdf
Issue(20)-May 17, 2014.pdf
Issue(19)-May 10, 2014.pdf
Issue(18)-May 3, 2014.pdf
Issue(17)-April 26, 2014.pdf
Issue(16)-April 19, 2014.pdf
Issue(15)-April 12, 2014.pdf
Issue(14)-April 5, 2014.pdf
Issue(13)-March 29, 2014.pdf
Issue(12)-March 22, 2014.pdf
Issue(11)-March 15, 2014.pdf
Issue(10)-March 8, 2014.pdf
Issue(09)-March 1, 201

In [11]:
'''Creates a list of dictionaries for each month containing the number of confirmed measles cases in that month (cumulative)'''
report_monthly = [
        {
            'monthname': f"{report_list[0]['month']} {report_list[0]['year']}",
            'month': report_list[0]['month'],
            'year': report_list[0]['year'],
            'cumulative_total': 0
        }
                ]
a = 0

for report in report_list:
    if int(report['year']) >= 2011:
        monthname = f"{report['month']} {report['year']}"
        
        if report_monthly[a]['monthname'] == monthname:
            if type(report['measles_confirmed_total']) == int:
                report_monthly[a]['cumulative_total'] = max(report_monthly[a]['cumulative_total'], report['measles_confirmed_total'])
            else:
                report_monthly[a]['cumulative_total'] = report_monthly[a]['cumulative_total']
        else:
            a += 1
            report_monthly.append({})
            report_monthly[a]['monthname'] = monthname
            report_monthly[a]['month'] = report['month']
            report_monthly[a]['year'] = report['year']
            if type(report['measles_confirmed_total']) == int:
                report_monthly[a]['cumulative_total'] = report['measles_confirmed_total']
            else:
                report_monthly[a]['cumulative_total'] = 0
    
    else:
        break

In [12]:
'''Uses the list of dictionaries from above to calculate caseload per month'''
caseload_month = []
for i in range(len(report_monthly)):
    if report_monthly[i]['month'] == 'January' or report_monthly[i]['month'] == 'Jannuary':
        caseload_month.append(
                {
                    'month': report_monthly[i]['month'],
                    'year': report_monthly[i]['year'],
                    'caseload': report_monthly[i]['cumulative_total'],              
                }
            )
    
    else:
        caseload_month.append(
                {
                    'month': report_monthly[i]['month'],
                    'year': report_monthly[i]['year'],
                    'caseload': report_monthly[i]['cumulative_total']-report_monthly[i+1]['cumulative_total'],              
                }
            )

In [13]:
'''Creates data frames from the data for both weekly cumulative and monthly caseload'''
results_df = pd.DataFrame(report_list)
monthly_df = pd.DataFrame(caseload_month)

In [14]:
'''Export the weekly cumulative data'''
columns = ['issue', 'day', 'month', 'year', 'error', 'measles_confirmed_total']
results_df.loc[:, columns].to_csv('weekly_cumulative.csv', index=False)

In [15]:
'''Export the monthly caseload data'''
monthly_df.to_csv('monthly_caseload.csv', index=False)