[Python/pdfquery: Scraping the FIFA World Player of the Year votes](http://www.markhneedham.com/blog/2015/01/22/pythonpdfquery-scraping-the-fifa-world-player-of-the-year-votes-pdf-into-shape/) (here: compare with `pdftotext` and text processing)

In [1]:
import os
import io
import re
import csv
import urllib
import subprocess

# Remote source PDF and the local file names derived from it; all three
# local paths are relative, so they land in the current working directory.
URL = 'https://raw.githubusercontent.com/mneedham/fifa/master/fboaward_menplayer2014_neutral.pdf'
PDF = URL.rpartition('/')[2]  # last path component of URL
TXT = '%s.txt' % os.path.splitext(PDF)[0]
CSV = '%s.csv' % os.path.splitext(PDF)[0]

# Download only when no local copy exists yet, so re-running the cell is cheap.
# NOTE(review): urllib.urlretrieve is the Python 2 location of this function
# (Python 3 has it under urllib.request) -- this notebook targets Python 2.
if not os.path.exists(PDF):
    urllib.urlretrieve(URL, PDF)

# Convert the PDF to text once; check_call raises if pdftotext exits non-zero.
if not os.path.exists(TXT):  # requires one of poppler-utils, miktex-poppler-bin, xpdf
    subprocess.check_call(['pdftotext', '-layout', PDF, TXT])

def load_pages(filename=TXT, encoding='utf-8', delimiter='\f'):
    """Read ``filename`` as text and return it cut into pieces at ``delimiter``.

    The file is decoded with ``encoding``; the default delimiter is the
    form-feed character.  The result is whatever ``str.split`` produces, so
    empty pieces (e.g. after a trailing delimiter) are kept.
    """
    with io.open(filename, encoding=encoding) as source:
        return source.read().split(delimiter)

# Load the extracted text split at the page delimiter, then show how many
# pieces the split produced.
pages = load_pages()

len(pages)

18

In [2]:
def iterrows(pages):
    """Yield the table rows found in ``pages``, starting with the header row.

    Each non-empty page is expected to look like this:

    * a title, separated from the rest by two blank lines (``'\\n\\n\\n'``);
    * one header line whose columns are separated by runs of 2+ spaces;
    * the table body, one row per line, columns separated the same way;
    * a footer that starts on a line indented by 30 spaces and, once
      stripped, reads ``'<page> / <total>'``.

    The header (a list of column names) is yielded once, taken from the first
    page; every later page must repeat the same title and header.  Each data
    row is yielded as a list with exactly as many cells as the header.

    Raises:
        AssertionError: if a page's title/header differs from the first
            page's, if a row's cell count differs from the header's, or if an
            indented line is not the expected ``Grenadines`` continuation.
        AttributeError: if a page footer does not match ``'<page> / <total>'``
            (``re.match`` returns ``None``).
    """
    first = None
    for p in pages:
        if not p:
            # Skip empty pieces, e.g. the one left after a trailing delimiter.
            continue
        title, _, rest = p.partition('\n\n\n')
        header, _, rest = rest.partition('\n')
        header = re.split(r' {2,}', header)
        if first is None:
            yield header
            first = title, header
        else:
            # The parentheses matter: ``assert title, header == first`` only
            # checks that ``title`` is truthy and uses the comparison as the
            # assertion message, so mismatching pages would go unnoticed.
            assert (title, header) == first, (
                'page title/header %r differs from first page %r'
                % ((title, header), first))
        table, _, footer = rest.partition('\n                              ')
        table, footer = (s.strip() for s in (table, footer))
        # Parsed only to validate the footer format; the numbers themselves
        # are not used any further.
        page, total = map(int, re.match(r'(\d+) / (\d+)$', footer).groups())
        for line in table.splitlines():
            if not line:
                continue
            if line.startswith(' '):
                # The only indented line tolerated is the wrapped second half
                # of "St. Vincent and the Grenadines"; it is merged back into
                # its row below instead of being emitted on its own.
                assert line.endswith(' Grenadines')
                continue
            row = re.split(r' {2,}', line)
            assert len(row) == len(header)
            if row[1] == 'St. Vincent and the':
                row[1] += ' Grenadines'
            yield row

# Materialize every row (the header row comes first) and preview the first two.
rows = list(iterrows(pages))
rows[:2]

[[u'Vote',
  u'Country',
  u'Name',
  u'First (5 points)',
  u'Second (3 points)',
  u'Third (1 point)'],
 [u'Captain',
  u'Afghanistan',
  u'Amiri Islam',
  u'Messi Lionel',
  u'Cristiano Ronaldo',
  u'Ibrahimovic Zlatan']]

In [3]:
# Spot-check a single parsed row by its position in the list.
rows[163]

[u'Captain',
 u'Sweden',
 u'Ibrahimovic Zlatan',
 u'Messi Lionel',
 u'Neuer Manuel',
 u'Cristiano Ronaldo']

In [4]:
def write_csv(rows, filename=CSV, encoding='utf-8', dialect=csv.excel):
    """Write ``rows`` to ``filename`` as CSV and return the number of data rows.

    The first item of ``rows`` is written as the header row and is not
    counted.  Every cell is encoded with ``encoding`` before it is passed to
    the csv writer, and the file is opened in binary mode.

    Returns 0 when ``rows`` contains only the header row (this used to fail
    with ``UnboundLocalError`` because the counter was never assigned).

    Raises:
        StopIteration: if ``rows`` is empty, i.e. there is no header row.

    NOTE(review): encoded cells written to a binary file is the Python 2 way
    of using the csv module -- confirm before running this on Python 3.
    """
    rows = iter(rows)
    # Pre-set so the return value is defined even if the loop body never runs.
    n_data = 0
    with io.open(filename, 'wb') as fd:
        writer = csv.writer(fd, dialect=dialect)
        writer.writerow([cell.encode(encoding) for cell in next(rows)])
        for n_data, row in enumerate(rows, 1):
            writer.writerow([cell.encode(encoding) for cell in row])
    return n_data

# Stream the rows from the parser straight into the CSV file; the value shown
# is write_csv's return value, i.e. the number of data rows written.
write_csv(iterrows(pages))

544

In [5]:
import pandas as pd

# Read the CSV back using the same encoding and dialect that write_csv
# uses by default.
df = pd.read_csv(CSV, encoding='utf-8', dialect=csv.excel)

df.head()

Unnamed: 0,Vote,Country,Name,First (5 points),Second (3 points),Third (1 point)
0,Captain,Afghanistan,Amiri Islam,Messi Lionel,Cristiano Ronaldo,Ibrahimovic Zlatan
1,Captain,Albania,Cana Lorik,Cristiano Ronaldo,Robben Arjen,Mueller Thomas
2,Captain,Algeria,Bougherra Madjid,Cristiano Ronaldo,Robben Arjen,Benzema Karim
3,Captain,American Samoa,Luvu Rafe Talalelei,Neymar,Robben Arjen,Cristiano Ronaldo
4,Captain,Andorra,Sonejee Masand Oscar,Cristiano Ronaldo,Mueller Thomas,Kroos Toni


In [6]:
# Count the rows for each distinct value in the 'Vote' column.
df['Vote'].value_counts()

Captain    182
Coach      181
Media      181
Name: Vote, dtype: int64

In [7]:
# Raise the row display limit to 250 rows for the per-country table below.
pd.set_option('display.max_rows', 250)

# Count rows per (Country, Vote) pair and pivot the Vote values into columns;
# combinations that do not occur are filled with 0.
# NOTE(review): the `downcast` argument of fillna is, as far as I know,
# deprecated in recent pandas releases -- confirm before upgrading pandas.
df.groupby('Country')['Vote'].value_counts().unstack().fillna(0, downcast='infer')

Vote,Captain,Coach,Media
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1,1,0
Albania,1,1,1
Algeria,1,1,1
American Samoa,1,1,0
Andorra,1,1,1
Angola,1,1,1
Anguilla,1,1,0
Antigua and Barbuda,1,1,1
Argentina,1,1,1
Armenia,1,1,1
