In [1]:
from tika import parser # pdf scraping

import pandas as pd
import re

In [2]:
# Scrape both sample result sheets with Tika, in the same order as before;
# each parsed PDF is a dict whose 'content' entry holds the extracted text.
pdf1, pdf2 = (
    parser.from_file(pdf_path)
    for pdf_path in ('data/3011RL.pdf', 'data/3010RL4.pdf')
)

2023-02-09 18:15:50,787 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [99]:
def get_year(competition):
    """
    Get year the competition was held.

    The year is read from the first line that follows the 'Official Results'
    heading, e.g. 'Kuopio FRI 23 NOV 2001 Start Time:  18:15'.

    :param competition: competition data scraped from the pdf; its 'content' entry holds the text [dict]
    :return: competition year [int]
    :raises ValueError: if the heading is missing or no four-digit year follows it
    """

    competition_text = competition['content']
    upper_boundary = 'Official Results\n\n'
    boundary_index = competition_text.find(upper_boundary)
    if boundary_index == -1:
        # find() returns -1 here; slicing with it would silently read an unrelated line
        raise ValueError("'Official Results' heading not found in the competition text")

    # Keep only the first line after the heading. Splitting (instead of slicing up to
    # find('\n')) keeps the whole line even when no newline follows it.
    date_line = competition_text[boundary_index + len(upper_boundary):].split('\n', 1)[0]

    # A standalone four-digit year starting with 19 or 20. Digits only: a comma inside
    # a character class is a literal comma, so it must not appear there.
    year_match = re.search(r'(?<!\d)(?:19|20)\d{2}(?!\d)', date_line)
    if year_match is None:
        raise ValueError(f'No competition year found in line: {date_line!r}')

    return int(year_match.group())


def get_jury(competition):
    """
    Get a list of judges at the competition. It is returned as list of tuples containing judge name and his nationality.

    Judge lines look like 'A Goetze, Klaus-Dieter GER': a letter A-E, the name and a
    three-character nationality code at the end of the line.

    :param competition: competition data scraped from the pdf [dict]
    :return: list of five tuples (judge_name, nationality), commas removed from the name [list]
    :raises ValueError: if the jury list or any of the judges A-E cannot be found
    """

    competition_text = competition['content']
    upper_boundary = '\n\nA '
    boundary_index = competition_text.find(upper_boundary)
    if boundary_index == -1:
        raise ValueError('Jury list not found in the competition text')

    # Skip only the two newlines, so the remaining text starts with judge A's line.
    # A single shared iterator makes every search continue below the previous judge.
    remaining_lines = iter(competition_text[boundary_index + 2:].split('\n'))

    jury = []
    # Look the judges up letter by letter, as in some cases there are unexpected
    # blank lines between particular judges
    for letter in 'ABCDE':
        line_start = letter + ' '
        # Only accept the marker at the beginning of a line; searching anywhere in the
        # text would also hit names such as 'DOE John' when looking for 'E '
        judge_line = next((line for line in remaining_lines if line.startswith(line_start)), None)
        if judge_line is None:
            raise ValueError(f'Invalid number of judges! Got {len(jury)}')

        judge_line = judge_line.rstrip()  # trailing whitespace would shift the nationality slice
        nationality = judge_line[-3:]
        judge_name = judge_line[2:-4].replace(',', '')  # drop the letter prefix and ' NAT' suffix
        jury.append((judge_name, nationality))

    return jury


def get_score(competition, place=1):

    '''
    Get score of the particular place. This function returns only first competitor of the particular place. To find ex aequo competitors use get_exaequo() function.

    A competitor record is expected to look like:
        ' 1 50'                      place and bib
        'Malysz, Adam'               name
        'KS "Wisla" Wisla'           club
        ''                           blank line
        'POL 254.0'                  country first
        '88.9 123.5 66.3 19.5 ...'   one line per round, judges' notes are tokens 4-8

    :param competition: dictionary loaded from pdf (text under 'content') or the text itself [dict or str]
    :param place: competitor's place [int]
    :return: list containing competitor's name [str], competitor's country [str] and list of lists of notes [str]; return -1 if there was no this place in this competition
    '''

    # Let's accept either dictionary (including dict subclasses) or string as an input
    if isinstance(competition, dict):
        competition_text = competition['content']
    else:
        competition_text = competition

    # Find string index, where part of this place begins
    place_index = competition_text.find(f'\n\n {place} ')

    # If place not found return -1
    if place_index == -1:
        return -1

    # Cut text above and below needed part
    upper_trim = competition_text[place_index + 3:]  # '+3' cuts the '\n\n ' part
    # Find second '\n\n' occurrence to separate the particular result
    cut_point = upper_trim.find('\n\n', upper_trim.find('\n\n') + 1)
    if cut_point == -1:
        # Record runs to the end of the text; slicing with -1 would drop its last character
        cut_point = len(upper_trim)

    # Get values
    competitor_data = upper_trim[:cut_point].splitlines()
    competitor_name = re.sub(',', '', competitor_data[1])  # return name with no commas
    competitor_country = competitor_data[4].split(' ')[0]

    # Get notes from one or two rounds if qualified to final
    competitor_notes = [notes.split(' ')[3:8] for notes in competitor_data[5:]]

    return [competitor_name, competitor_country, competitor_notes]


def get_exaequo(competition, place):

    '''
    Get list of ex aequo competitors, not found by get_score() function
    :param competition: dictionary loaded from pdf (text under 'content') or the text itself [dict or str]
    :param place: competitor's place [int]
    :return: list of lists containing competitor's name [str], competitor's country [str] and list of lists of notes [str]; return -1 if no further competitor shares this place
    '''

    # Let's accept either dictionary (including dict subclasses) or string as an input
    if isinstance(competition, dict):
        competition_text = competition['content']
    else:
        competition_text = competition

    # Split text to a list with following structure:
    # [0] element is everything above the first competitor of this place
    # [1] is the first competitor of this place (the one get_score() returns)
    # [2] and following - the ex aequo competitors
    exaequo_records = competition_text.split(f'\n\n {place} ')[2:]

    exaequo_scores = []

    for record in exaequo_records:
        # A record ends at its second '\n\n' (the first one separates club and country).
        # Trim every record there, not only the last one, so that text placed between
        # tied competitors is not parsed as rounds. When there is no second '\n\n'
        # the record is already complete and is kept whole.
        cut_point = record.find('\n\n', record.find('\n\n') + 1)
        if cut_point != -1:
            record = record[:cut_point]

        competitor_data = record.splitlines()
        competitor_name = re.sub(',', '', competitor_data[1])  # return name with no commas
        competitor_country = competitor_data[4].split(' ')[0]

        # Get notes from one or two rounds if qualified to final
        competitor_notes = [notes.split(' ')[3:8] for notes in competitor_data[5:]]

        exaequo_scores.append([competitor_name, competitor_country, competitor_notes])

    if not exaequo_scores:
        return -1

    return exaequo_scores


def get_competition_scores(competition):
    """
    Collect the scores of all competitors by walking the places upwards from 1.
    :param competition: dictionary loaded from pdf, passed on to get_score() and get_exaequo()
    :return: list of the entries returned by get_score() and get_exaequo(), in the order they were found [list]
    """
    collected = []
    current_place = 1

    while True:
        single_result = get_score(competition, place=current_place)

        # Somebody holds this place: store the result and move on to the next place
        if single_result != -1:
            collected.append(single_result)
            current_place += 1
            continue

        # Nobody holds this place: step one place back and look for ex aequo competitors
        current_place -= 1
        tied_results = get_exaequo(competition, place=current_place)

        # No ex aequo competitors either, so there are no competitors left
        if tied_results == -1:
            break

        # Store the tied competitors and jump over the places they occupy
        collected.extend(tied_results)
        current_place += len(tied_results) + 1

    return collected

# TODO
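Some PDFs put a dot after the competitor's place (e.g. ` 1. 42`) and some do not (e.g. ` 1 50`). `get_score()` and `get_exaequo()` currently search only for the dot-less form, so both functions still need to handle either format.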

In [101]:
# Print every competitor entry parsed from the second PDF, one entry per line
pdf2_scores = get_competition_scores(pdf2)
for competitor_entry in pdf2_scores:
    print(competitor_entry)

In [69]:
# Show the repr of each competitor tied at place 5, with a blank line after each one
for tied_competitor in get_exaequo(pdf1, place=5):
    print(repr(tied_competitor), end='\n\n')

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nResult_WK2_Indi_Official.PDF\n\n\nFIS INTERNET Service: www.fis-ski.com\n\nFIS Data Service data processing realized by\nPrint Date: FRI 23 NOV 2001 / Print Time: 20:05\n\n"ruhrgas" FIS World Cup Ski-Jumping\n1st World Cup Competition\n\nSki-Jumping Individual K120\nOfficial Results\n\nKuopio FRI 23 NOV 2001 Start Time:  18:15\n\nFinish Time:  19:44\n\nJury / Competition Management Judges Hill Data\nRace Director Hofer, Walter FIS\nTechnical Delegate Bailly, Pierre FRA\nChief of Competition Korhonen, Keijo FIN\nAssistant TD Zarucchi, Fredi SUI\nAssistant RD Tepes, Miran FIS\n\nA Goetze, Klaus-Dieter GER\nB Suikkanen, Esa FIN\nC Mori, Hidemitsu JPN\nD Bosin, Bruno ITA\nE Nordin, Hans SWE\n\nK-Point: 120 m\nMeter Value: 1.8 Points/m\nJury Distance: 131 m\n\nRank Bib\nName\nClub\n\nNat\nSpeed\n[km/h]\n\nDistance\n[m]\n\nDistance\nPoints\n\nJudges Marks\nA B C D E\n\nJudges\nPoints\n\nRound\nTotal\n\nRound\nRank\n\n

In [46]:
# Spot-check a single place in the first PDF
seventh_place = get_score(pdf1, place=7)
print(seventh_place)

['Hocke Stefan', 'GER', [['18.5', '18.5', '18.5', '18.0', '18.0'], ['18.5', '18.5', '18.5', '19.0', '18.0']]]


In [6]:
# Print the jury list of each scraped PDF, first pdf1 and then pdf2
for scraped_pdf in (pdf1, pdf2):
    print(get_jury(scraped_pdf))

[('Goetze Klaus-Dieter', 'GER'), ('Suikkanen Esa', 'FIN'), ('Mori Hidemitsu', 'JPN'), ('Bosin Bruno', 'ITA'), ('Nordin Hans', 'SWE')]
[('Normann Sigbjoern', 'NOR'), ('Gunka Ryszard', 'POL'), ('Glawischnig Heinz', 'AUT'), ('Patzina Wolfgang', 'GER'), ('Bester Janez', 'SLO')]


In [19]:
# Dump the raw text extracted from the first PDF to inspect its layout
pdf1_text = pdf1['content']
print(pdf1_text)






































Result_WK2_Indi_Official.PDF


FIS INTERNET Service: www.fis-ski.com

FIS Data Service data processing realized by
Print Date: FRI 23 NOV 2001 / Print Time: 20:05

"ruhrgas" FIS World Cup Ski-Jumping
1st World Cup Competition

Ski-Jumping Individual K120
Official Results

Kuopio FRI 23 NOV 2001 Start Time:  18:15

Finish Time:  19:44

Jury / Competition Management Judges Hill Data
Race Director Hofer, Walter FIS
Technical Delegate Bailly, Pierre FRA
Chief of Competition Korhonen, Keijo FIN
Assistant TD Zarucchi, Fredi SUI
Assistant RD Tepes, Miran FIS

A Goetze, Klaus-Dieter GER
B Suikkanen, Esa FIN
C Mori, Hidemitsu JPN
D Bosin, Bruno ITA
E Nordin, Hans SWE

K-Point: 120 m
Meter Value: 1.8 Points/m
Jury Distance: 131 m

Rank Bib
Name
Club

Nat
Speed
[km/h]

Distance
[m]

Distance
Points

Judges Marks
A B C D E

Judges
Points

Round
Total

Round
Rank

Total

 1 50
Malysz, Adam
KS "Wisla" Wisla

POL 254.0
88.9 123.5 66.3 19.5 19.5 19.0 19.0 19.0 57.5

In [23]:
# Dump the raw text extracted from the second PDF to inspect its layout
pdf2_text = pdf2['content']
print(pdf2_text)













































Result_WK2_Indi_Official.PDF


FIS INTERNET Service: www.fis-ski.com
FIS Data Service data processing realized by

Print Date: SAT 22 MAR 2003 / Print Time: 11:55

"ruhrgas" FIS World Cup Ski-Jumping
27th World Cup Competition SKIFLYING

Planica

p r e s e n t e d     b y Ski-Flying Individual K185

Official Results

SAT 22 MAR 2003

Start Time:  10:01
Finish Time:  11:54

Jury / Competition Management Judges Hill Data
Race Director Hofer Walter FIS

Technical Delegate Palsrud Bertil NOR
Chief of Competition Gorisek Janez SLO
Assistant TD Salvi Franck FRA

Assistant RD Tepes Miran FIS

A Normann Sigbjoern NOR

B Gunka Ryszard POL
C Glawischnig Heinz AUT
D Patzina Wolfgang GER

E Bester Janez SLO

K-Point: 185 m

Meter Value: 1.2 Points/m
Jury Distance: 215 m

Rank Bib
Name
Club

Nat
Speed
[km/h]

Distance
[m]

Distance
Points

Judges Marks
A B C D E

Judges
Points

Round
Total

Round
Rank

Total

 1. 42
Hautamaeki Matti
Puijon Hiihtoseura

FI

In [24]:
# Check what kind of object parser.from_file() returned for the first PDF
type(pdf1)

dict