In [2]:
from tika import parser # pdf scraping

import os
import pandas as pd
import re

In [4]:
pdf1 = parser.from_file('data/3011RL.pdf')
pdf2 = parser.from_file('data/3010RL4.pdf')

In [119]:
def get_year(competition):
    """
    Get year the competition was held
    :param competition: competition data scraped from the pdf [dict]
    :return: competition year [int]
    """

    competition_text = competition['content']
    upper_boundary = 'Official Results\n\n'
    upper_trim = competition_text[competition_text.find(upper_boundary)+len(upper_boundary) :]
    lower_boundary = '\n'
    lower_trim = upper_trim[: upper_trim.find(lower_boundary)]
    year = int(re.search('20[0,1][0-9]', lower_trim).group())

    return year


def get_jury(competition):
    """
    Get a list of juries at the competition. It is returned as list of tuples containing judge name and his nationality.
    :param competition: competition data scraped from the pdf [dict]
    :return: list of lists (jury_name, nationality) [list]
    """

    jury = []
    competition_text = competition['content']
    upper_boundary = '\n\nA '
    # Line below finds where to the jury list begins and '-2' is to remain the A character for iteration
    upper_trim = competition_text[competition_text.find(upper_boundary)+len(upper_boundary)-2 :]

    # I have to iterate with A-E characters, as in some cases there are unexpected blank lines between particular judges
    for letter in ['A', 'B', 'C', 'D', 'E']:
        upper_trim = upper_trim[upper_trim.find(letter+' '):]
        cut_line = upper_trim.split('\n')[0]
        nationality = cut_line[-3:]
        judge_name = re.sub(',', '', cut_line[2: -4])
        jury.append([judge_name, nationality])

    assert len(jury) == 5, f'Invalid number of judges! Got {len(jury)}'

    return jury


def get_competition_scores(competition):

    # Let's accept either dictionary or string as an input
    if type(competition) == dict:
        competition_text = competition['content']
    else:
        competition_text = competition

    # Remove dots after place number, if they appear
    competition_text = re.sub('[.] ', ' ', competition_text)

    # Search for text fragments which concern places and place there an easy to find split line
    splitline = '\nverystrangetextthatforsurewillnotoccurinthistext\n'
    for place in range(1, 51):
        competition_text = re.sub(f'\n\n {place} ', splitline, competition_text)

    ###########
    # TODO: change for detecting regex:
    # - define regex expression for score fragment
    # - replace '\n\n' expressions with '\n' which are not part of regex
    # - split text with regex for list and iterate for scores
    ###########

    # Remove blank lines, as they will ruin everything
    competition_text = re.sub('\n\n', '\n', competition_text)

    scores = []
    places_available = True

    # Find first split line and cut text above
    competition_text = competition_text[competition_text.find(splitline):]

    while places_available:

        # Find second splitline
        second_splitline_index = competition_text.find(splitline, competition_text.find(splitline)+1)

        # If found - split the text into place data and the rest and get place data, append to the scores list
        if second_splitline_index != -1:
            place_data = competition_text[:second_splitline_index].splitlines()
            competition_text = competition_text[second_splitline_index:]
            # Get jumper name with no comas and country
            jumper_name = re.sub(', ', ' ', place_data[3])
            jumper_country = place_data[5].split()[0]
            # Get first round notes
            jumper_notes = place_data[6].split()[3:8]
            # Append first round notes
            scores.append([jumper_name, jumper_country]+jumper_notes)
            # Check first character of first element of next line - if it is a digit, that means it also contains notes,
            # which means jumper took part in final round. Append score as new list element
            if len(place_data) > 7:
                if place_data[7].split()[0][0].isdigit():
                    jumper_notes = place_data[7].split()[3:8]
                    scores.append([jumper_name, jumper_country]+jumper_notes)

        # If not found - assume this is the last competitor and he took part only in first round. Get his score and return function value
        else:
            place_data = competition_text.splitlines()
            jumper_name = re.sub(', ', ' ', place_data[3])
            jumper_country = place_data[5].split()[0]
            jumper_notes = [place_data[6].split()[3:8]]

            scores.append([jumper_name, jumper_country, jumper_notes])
            places_available = False

    return scores

In [121]:
# Create DataFrame to store competition data - year and judges names
competition_data_columns = ['competition_id', 'year', 'referee_A', 'referee_B', 'referee_C', 'referee_D', 'referee_E']
competition_data_df = pd.DataFrame(columns=competition_data_columns)

# Create DataFrame to store all notes
scores_df_columns = ['competition_id', 'jumper_name', 'jumper_country', 'note_A', 'note_B', 'note_C', 'note_D', 'note_E']
scores_df = pd.DataFrame(columns=scores_df_columns)

# Create DataFrame to store jury countries
jury_data_columns = ['judge_name', 'judge_nationality']
jury_data_df = pd.DataFrame(columns=jury_data_columns)

# Fill DataFrame with competitions scores
data_path = 'data/'
competition_scores_columns = ['jumper_name', 'jumper_country', 'note_A', 'note_B', 'note_C', 'note_D', 'note_E']

for file_name in os.listdir(data_path):

    # Competition scores
    file_path = data_path + file_name
    print(f'Scanning {file_path}')
    pdf = parser.from_file(file_path)
    competition_scores_list = get_competition_scores(pdf)
    competition_scores_df = pd.DataFrame(competition_scores_list, columns=competition_scores_columns)
    competition_scores_df['competition_id'] = file_name
    scores_df = pd.concat([scores_df, competition_scores_df])
    print(f'Jumps performed: {len(competition_scores_list)}')

    # Competition data
    jury_names = [x[0] for x in get_jury(pdf)]
    competition_data_df = pd.concat([
        competition_data_df,
        pd.DataFrame([[file_name, get_year(pdf)] + jury_names], columns=competition_data_columns)
    ])

    # Jury data
    for jury in get_jury(pdf):
        if jury[0] not in jury_data_df['judge_name'].to_numpy():
            jury_data_df = pd.concat([
                jury_data_df,
                pd.DataFrame([jury], columns=jury_data_columns)
            ])

Scanning data/3010RL4.pdf
Jumps performed: 80
Scanning data/3011RL.pdf
Jumps performed: 80
Scanning data/3011RL4.pdf
Jumps performed: 61
Scanning data/3012RL.pdf
Jumps performed: 80
Scanning data/3013RL.pdf
Jumps performed: 80
Scanning data/3024RL4.pdf
Jumps performed: 79
Scanning data/3025RL4.pdf
Jumps performed: 80
Scanning data/3033RL4.pdf
Jumps performed: 78
Scanning data/3051RL4.pdf
Jumps performed: 80
Scanning data/3052RL4.pdf
Jumps performed: 80
Scanning data/3061RL.pdf
Jumps performed: 80
Scanning data/3062RL.pdf
Jumps performed: 80
Scanning data/3066RL4.pdf
Jumps performed: 79
Scanning data/3067RL4.pdf
Jumps performed: 80
Scanning data/3068RL4.pdf
Jumps performed: 80
Scanning data/3069RL4.pdf
Jumps performed: 80
Scanning data/3087RL.pdf
Jumps performed: 79
Scanning data/3088RL.pdf
Jumps performed: 80
Scanning data/3088RL4.pdf
Jumps performed: 80
Scanning data/3089RL4.pdf
Jumps performed: 80
Scanning data/3090RL4.pdf
Jumps performed: 80
Scanning data/3097RL4.pdf
Jumps performed

In [125]:
jury_data_df[jury_data_df['judge_nationality']=='POL'].sort_values(by='judge_name')

Unnamed: 0,judge_name,judge_nationality
0,Dlugopolski Kazimierz,POL
0,Gunka Ryszard,POL
0,Nadarkiewicz Edward,POL
0,Przybyta Edward,POL
0,Siderek Marek,POL
0,Tucznio Marek,POL
