In [58]:
from tika import parser # pdf scraping

import os
import pandas as pd
import re

In [4]:
pdf1 = parser.from_file('data/3011RL.pdf')
pdf2 = parser.from_file('data/3010RL4.pdf')

In [56]:
def get_year(competition):
    """
    Get year the competition was held
    :param competition: competition data scraped from the pdf [dict]
    :return: competition year [int]
    """

    competition_text = competition['content']
    upper_boundary = 'Official Results\n\n'
    upper_trim = competition_text[competition_text.find(upper_boundary)+len(upper_boundary) :]
    lower_boundary = '\n'
    lower_trim = upper_trim[: upper_trim.find(lower_boundary)]
    year = int(re.search('20[0,1][0-9]', lower_trim).group())

    return year


def get_jury(competition):
    """
    Get a list of juries at the competition. It is returned as list of tuples containing judge name and his nationality.
    :param competition: competition data scraped from the pdf [dict]
    :return: list of tuples (jury_name, nationality) [list]
    """

    jury = []
    competition_text = competition['content']
    upper_boundary = '\n\nA '
    # Line below finds where to the jury list begins and '-2' is to remain the A character for iteration
    upper_trim = competition_text[competition_text.find(upper_boundary)+len(upper_boundary)-2 :]

    # I have to iterate with A-E characters, as in some cases there are unexpected blank lines between particular judges
    for letter in ['A', 'B', 'C', 'D', 'E']:
        upper_trim = upper_trim[upper_trim.find(letter+' '):]
        cut_line = upper_trim.split('\n')[0]
        nationality = cut_line[-3:]
        judge_name = re.sub(',', '', cut_line[2: -4])
        jury.append((judge_name, nationality))

    assert len(jury) == 5, f'Invalid number of judges! Got {len(jury)}'

    return jury


def get_competition_scores(competition):

    # Let's accept either dictionary or string as an input
    if type(competition) == dict:
        competition_text = competition['content']
    else:
        competition_text = competition

    # Remove dots after place number, if they appear
    competition_text = re.sub('[.] ', ' ', competition_text)

    # Search for text fragments which concern places and place there an easy to find split line
    splitline = '\nverystrangetextthatforsurewillnotoccurinthistext\n'
    for place in range(1, 51):
        competition_text = re.sub(f'\n\n {place} ', splitline, competition_text)

    # Remove blank lines, as they will ruin everything
    competition_text = re.sub('\n\n', '\n', competition_text)

    scores = []
    places_available = True

    # Find first split line and cut text above
    competition_text = competition_text[competition_text.find(splitline):]

    while places_available:

        # Find second splitline
        second_splitline_index = competition_text.find(splitline, competition_text.find(splitline)+1)

        # If found - split the text into place data and the rest and get place data, append to the scores list
        if second_splitline_index != -1:
            place_data = competition_text[:second_splitline_index].splitlines()
            competition_text = competition_text[second_splitline_index:]
            jumper_name = re.sub(', ', ' ', place_data[3])
            jumper_country = place_data[5].split()[0]
            # Append first round notes
            jumper_notes = [place_data[6].split()[3:8]]
            # Check first character of first element of next line - if it is a digit, that means it also contains notes
            if len(place_data) > 7:
                if place_data[7].split()[0][0].isdigit():
                    jumper_notes.append(place_data[7].split()[3:8])

            scores.append([jumper_name, jumper_country, jumper_notes])

        # If not found - assume this competitor took part only in first series, get his score and return function value
        else:
            place_data = competition_text.splitlines()
            jumper_name = re.sub(', ', ' ', place_data[3])
            jumper_country = place_data[5].split()[0]
            jumper_notes = [place_data[6].split()[3:8]]

            scores.append([jumper_name, jumper_country, jumper_notes])
            places_available = False

    return scores

In [85]:
all_scores = []
data_path = 'data/'

for file_name in os.listdir(data_path):
    file_path = data_path + file_name
    print(f'Scanning {file_path}')
    pdf = parser.from_file(file_path)
    all_scores.append(get_competition_scores(pdf))
    print(f'Length: {len(all_scores[-1])}')

Scanning data/3010RL4.pdf
Length: 50
Scanning data/3011RL.pdf
Length: 50
Scanning data/3011RL4.pdf
Length: 31
Scanning data/3012RL.pdf
Length: 50
Scanning data/3013RL.pdf
Length: 50
Scanning data/3024RL4.pdf
Length: 49
Scanning data/3025RL4.pdf
Length: 50
Scanning data/3033RL4.pdf
Length: 48
Scanning data/3051RL4.pdf
Length: 50
Scanning data/3052RL4.pdf
Length: 50
Scanning data/3061RL.pdf
Length: 50
Scanning data/3062RL.pdf
Length: 50
Scanning data/3066RL4.pdf
Length: 49
Scanning data/3067RL4.pdf
Length: 50
Scanning data/3068RL4.pdf
Length: 50
Scanning data/3069RL4.pdf
Length: 50
Scanning data/3087RL.pdf
Length: 50
Scanning data/3088RL.pdf
Length: 50
Scanning data/3088RL4.pdf
Length: 50
Scanning data/3089RL4.pdf
Length: 50
Scanning data/3090RL4.pdf
Length: 50
Scanning data/3097RL4.pdf
Length: 50
Scanning data/3113RL4.pdf
Length: 50
Scanning data/3114RL4.pdf
Length: 50
Scanning data/3115RL4.pdf
Length: 51
Scanning data/3116RL.pdf
Length: 48
Scanning data/3117RL.pdf
Length: 50
Scanning d

In [87]:
scores_df_columns = ['jumper_name', 'jumper_country', 'note_A', 'note_B', 'note_C', 'note_D', 'note_E']
scores_df = pd.DataFrame(columns=scores_df_columns)
print(scores_df.head)

<bound method NDFrame.head of Empty DataFrame
Columns: [jumper_name, jumper_country, note_A, note_B, note_C, note_D, note_E]
Index: []>
