### PDF Table Scraping 

**For research into the relationship between various socio-economic factors and countries' democracy scores over time**

**By: Zach Palmer**

In [2]:
# import in packages
import pandas as pd
import numpy as np
import camelot as cl

import matplotlib.pyplot as plt
%matplotlib inline

### 2007 Democracy Index

In [2]:
# Extract the tables on pages 3-5 of the 2007 report with camelot's stream
# flavor, limited to the region listed in table_areas (camelot's documented
# format is "x1,y1,x2,y2": left-top corner, then right-bottom corner).
tables = cl.read_pdf(
    'pdfs/democracy-index_2007.pdf',
    pages='3,4,5',
    flavor='stream',
    table_areas=['0,650,500,0'],
)

In [4]:
# Stack every scraped 2007 table into one dataframe with a single concat call.
# Re-concatenating inside a loop recopies the accumulated rows on each pass;
# building the list first copies them once. Tables are accessed by position so
# only len() and [] are required of the camelot result.
merged_df = pd.concat(
    [tables[i].df for i in range(len(tables))],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2007 dataframe to eyeball the scrape.
merged_df

### 2008 Democracy Index

**Need to fix countries whose names take up multiple rows**

In [7]:
# 2008 report: page 4, pages 5-7 and page 8 are read in separate calls because
# each uses a different table_areas crop ("x1,y1,x2,y2" per camelot's docs).
first_table_08 = cl.read_pdf(
    'pdfs/democracy-index_2008.pdf',
    pages='4',
    flavor='stream',
    table_areas=['0,560,550,0'],
)
full_tables_08 = cl.read_pdf(
    'pdfs/democracy-index_2008.pdf',
    pages='5,6,7',
    flavor='stream',
    table_areas=['0,700,550,0'],
)
last_table_08 = cl.read_pdf(
    'pdfs/democracy-index_2008.pdf',
    pages='8',
    flavor='stream',
    table_areas=['0,700,550,420'],
)

### 2010 Democracy Index

In [9]:
# 2010 report: pages 4-8 share one crop region; page 9 is read on its own
# with a different table_areas crop.
full_tables_10 = cl.read_pdf(
    'pdfs/democracy-index_2010.pdf',
    pages='4,5,6,7,8',
    flavor='stream',
    table_areas=['0,560,550,50'],
)
last_table_10 = cl.read_pdf(
    'pdfs/democracy-index_2010.pdf',
    pages='9',
    flavor='stream',
    table_areas=['0,580,550,510'],
)

In [10]:
# Stack the 2010 tables (pages 4-8, then the first table from page 9) with a
# single concat call. Re-concatenating inside a loop recopies the accumulated
# rows on each pass; building the list first copies them once.
merged_df_10 = pd.concat(
    [
        *(full_tables_10[i].df for i in range(len(full_tables_10))),
        last_table_10[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_10.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2010 dataframe to eyeball the scrape.
merged_df_10

### 2011 Democracy Index

In [31]:
# 2011 report: page 4, pages 5-8 and page 9 are read in separate calls because
# each uses a different table_areas crop.
first_table_11 = cl.read_pdf(
    'pdfs/democracy-index_2011.pdf',
    pages='4',
    flavor='stream',
    table_areas=['0,230,550,50'],
)
full_tables_11 = cl.read_pdf(
    'pdfs/democracy-index_2011.pdf',
    pages='5,6,7,8',
    flavor='stream',
    table_areas=['0,570,550,50'],
)
last_table_11 = cl.read_pdf(
    'pdfs/democracy-index_2011.pdf',
    pages='9',
    flavor='stream',
    table_areas=['0,570,550,200'],
)

In [36]:
# Stack the 2011 tables in page order (page 4, pages 5-8, page 9) with a single
# concat call. Re-concatenating inside a loop recopies the accumulated rows on
# each pass; building the list first copies them once.
merged_df_11 = pd.concat(
    [
        first_table_11[0].df,
        *(full_tables_11[i].df for i in range(len(full_tables_11))),
        last_table_11[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_11.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2011 dataframe to eyeball the scrape.
merged_df_11

### 2012 Democracy Index

In [50]:
# 2012 report: page 4, pages 5-8 and page 9 are read in separate calls because
# each uses a different table_areas crop.
first_table_12 = cl.read_pdf(
    'pdfs/democracy-index_2012.pdf',
    pages='4',
    flavor='stream',
    table_areas=['0,360,550,50'],
)
full_tables_12 = cl.read_pdf(
    'pdfs/democracy-index_2012.pdf',
    pages='5,6,7,8',
    flavor='stream',
    table_areas=['0,560,550,50'],
)
last_table_12 = cl.read_pdf(
    'pdfs/democracy-index_2012.pdf',
    pages='9',
    flavor='stream',
    table_areas=['0,560,550,260'],
)

In [57]:
# Stack the 2012 tables in page order (page 4, pages 5-8, page 9) with a single
# concat call. Re-concatenating inside a loop recopies the accumulated rows on
# each pass; building the list first copies them once.
merged_df_12 = pd.concat(
    [
        first_table_12[0].df,
        *(full_tables_12[i].df for i in range(len(full_tables_12))),
        last_table_12[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_12.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2012 dataframe to eyeball the scrape.
merged_df_12

### 2013 Democracy Index

In [67]:
# 2013 report: page 4, pages 5-8 and page 9 are read in separate calls because
# each uses a different table_areas crop.
first_table_13 = cl.read_pdf(
    'pdfs/democracy-index_2013.pdf',
    pages='4',
    flavor='stream',
    table_areas=['0,420,550,50'],
)
full_tables_13 = cl.read_pdf(
    'pdfs/democracy-index_2013.pdf',
    pages='5,6,7,8',
    flavor='stream',
    table_areas=['0,580,550,50'],
)
last_table_13 = cl.read_pdf(
    'pdfs/democracy-index_2013.pdf',
    pages='9',
    flavor='stream',
    table_areas=['0,580,550,400'],
)

In [77]:
# Stack the 2013 tables in page order (page 4, pages 5-8, page 9) with a single
# concat call. Re-concatenating inside a loop recopies the accumulated rows on
# each pass; building the list first copies them once.
merged_df_13 = pd.concat(
    [
        first_table_13[0].df,
        *(full_tables_13[i].df for i in range(len(full_tables_13))),
        last_table_13[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_13.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2013 dataframe to eyeball the scrape.
merged_df_13

### 2014 Democracy Index

In [92]:
# 2014 report: page 4 is read on its own with a different table_areas crop;
# pages 5-9 share one crop region.
first_table_14 = cl.read_pdf(
    'pdfs/democracy-index_2014.pdf',
    pages='4',
    flavor='stream',
    table_areas=['0,140,550,50'],
)
full_tables_14 = cl.read_pdf(
    'pdfs/democracy-index_2014.pdf',
    pages='5,6,7,8,9',
    flavor='stream',
    table_areas=['0,570,550,70'],
)

In [99]:
# Stack the 2014 tables in page order (page 4, then pages 5-9) with a single
# concat call. Re-concatenating inside a loop recopies the accumulated rows on
# each pass; building the list first copies them once.
merged_df_14 = pd.concat(
    [
        first_table_14[0].df,
        *(full_tables_14[i].df for i in range(len(full_tables_14))),
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_14.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2014 dataframe to eyeball the scrape.
merged_df_14

### 2015 Democracy Index

In [120]:
# 2015 report: pages 5-8 share one crop region; page 9 is read on its own
# with a different table_areas crop.
full_tables_15 = cl.read_pdf(
    'pdfs/democracy-index_2015.pdf',
    pages='5,6,7,8',
    flavor='stream',
    table_areas=['0,580,550,70'],
)
last_table_15 = cl.read_pdf(
    'pdfs/democracy-index_2015.pdf',
    pages='9',
    flavor='stream',
    table_areas=['0,580,550,150'],
)

In [127]:
# Stack the 2015 tables (pages 5-8, then the first table from page 9) with a
# single concat call. Re-concatenating inside a loop recopies the accumulated
# rows on each pass; building the list first copies them once.
merged_df_15 = pd.concat(
    [
        *(full_tables_15[i].df for i in range(len(full_tables_15))),
        last_table_15[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_15.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2015 dataframe to eyeball the scrape.
merged_df_15

### 2016 Democracy Index

In [None]:
# 2016 report: pages 9-12 share one crop region; page 13 is read on its own
# with a different table_areas crop.
full_tables_16 = cl.read_pdf(
    'pdfs/democracy-index_2016.pdf',
    pages='9,10,11,12',
    flavor='stream',
    table_areas=['0,580,550,70'],
)
last_table_16 = cl.read_pdf(
    'pdfs/democracy-index_2016.pdf',
    pages='13',
    flavor='stream',
    table_areas=['0,580,550,190'],
)

In [141]:
# Stack the 2016 tables (pages 9-12, then the first table from page 13) with a
# single concat call. Re-concatenating inside a loop recopies the accumulated
# rows on each pass; building the list first copies them once.
merged_df_16 = pd.concat(
    [
        *(full_tables_16[i].df for i in range(len(full_tables_16))),
        last_table_16[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_16.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [None]:
# Display the merged 2016 dataframe to eyeball the scrape.
merged_df_16

### 2017 Democracy Index

In [None]:
# 2017 report: pages 7-10 share one crop region; page 11 is read on its own
# with a different table_areas crop.
full_tables_17 = cl.read_pdf(
    'pdfs/democracy-index_2017.pdf',
    pages='7,8,9,10',
    flavor='stream',
    table_areas=['0,620,550,70'],
)
last_table_17 = cl.read_pdf(
    'pdfs/democracy-index_2017.pdf',
    pages='11',
    flavor='stream',
    table_areas=['0,610,550,350'],
)

In [15]:
# Stack the 2017 tables (pages 7-10, then the first table from page 11) with a
# single concat call. Re-concatenating inside a loop recopies the accumulated
# rows on each pass; building the list first copies them once.
merged_df_17 = pd.concat(
    [
        *(full_tables_17[i].df for i in range(len(full_tables_17))),
        last_table_17[0].df,
    ],
    ignore_index=True,
)

# Name the columns. pd.concat returned a new frame, so this assignment does not
# rename the dataframes held by the camelot tables. pandas raises a
# length-mismatch ValueError if the scrape did not yield exactly eight columns.
merged_df_17.columns = [
    'Country', 'Rank', 'Overall Score',
    'Electoral Process and Pluralism',
    'Functioning of Government',
    'Political Participation',
    'Political Culture',
    'Civil Liberties',
]

In [16]:
# Display the merged 2017 dataframe to eyeball the scrape (the saved output
# shows section-label rows such as "Full democracies" mixed in with the data).
merged_df_17

Unnamed: 0,Country,Rank,Overall Score,Electoral Process and Pluralism,Functioning of Government,Political Participation,Political Culture,Civil Liberties
0,,,,Full democracies,,,,
1,Norway,1,9.87,10.00,9.64,10.00,10.00,9.71
2,Iceland,2,9.58,10.00,9.29,8.89,10.00,9.71
3,Sweden,3,9.39,9.58,9.64,8.33,10.00,9.41
4,New Zealand,4,9.26,10.00,9.29,8.89,8.13,10.00
...,...,...,...,...,...,...,...,...
166,Democratic Republic of Congo,163,1.61,0.50,0.71,2.22,3.75,0.88
167,Central African Republic,164,1.52,2.25,0.00,1.11,1.88,2.35
168,Chad,165,1.50,0.00,0.00,1.11,3.75,2.65
169,Syria,166,1.43,0.00,0.00,2.78,4.38,0.00


### 2018 Democracy Index

In [None]:
# Visual debugging: draw camelot's "text" plot for the first table scraped from
# the final 2017 page (last_table_17[0]); presumably used to tune table_areas.
# NOTE(review): this cell sits under the 2018 heading but inspects the 2017
# scrape — confirm whether it should move or target a 2018 table.
debug = cl.plot(last_table_17[0], kind="text")
debug

### 2023 Democracy Index

**Can get entire table of scores over time from this PDF**