In [1]:
import json
import re

In [2]:
# Read the raw glossary. The pairing below assumes the file alternates a term
# line with the line holding its definition.
with open("raw/glossary.txt") as glossary_file:
    glossary_str = glossary_file.read()

glossary_lines = glossary_str.strip().split("\n")

# Pair each even-indexed line (term) with the odd-indexed line that follows it
# (definition). zip stops at the shorter slice, so a trailing unpaired line is
# dropped.
glossary = {
    term.strip(): definition.strip()
    for term, definition in zip(glossary_lines[0::2], glossary_lines[1::2])
}

with open("glossary.json", "w") as glossary_json:
    json.dump(glossary, glossary_json, indent=4)

# Last expression of the cell: display the parsed glossary.
glossary

{'Advantage': 'When the referee allows play to continue when an offence has occurred if this benefits the non-offending team.',
 'Administrating authority': 'The governing body for the game being played, e.g. The International Floorball Federation or a National Association.',
 'Bench penalty': 'A penalty that affects the number of players on the rink.',
 'Board': 'A low wall made in sections with rounded corners that encloses the rink. Usually made of plastic.',
 'Brutal': 'An act which is savage or vicious. Considered worse than being violent.',
 'Careless': 'An act which lacks consideration or precaution. Considered milder than being reckless.',
 'Centre line': 'A marked line on the floor which divides the rink into two halves of equal size.',
 'Centre spot': 'A marked spot on the floor used as a face-off dot at the beginning of a match or a period and after a goal is scored. Also where a penalty shot shall be taken from.',
 'Delayed penalty': 'When a penalty is awarded but play cont

In [3]:
# Load the raw rulebook text in one go; it is parsed by parse_rules below.
with open("raw/rules.txt") as rules_file:
    rules_str = rules_file.read()
    
re_hand_sign = re.compile(r"\d\d\d") # regex to match hand sign references within rule text
# NOTE(review): the pattern matches ANY three consecutive digits, so other
# three-digit numbers in the text are collected as references too — confirm
# this is intended.


def _sign_references(headline_text, rule_text):
    """Return the sorted, de-duplicated three-digit codes found in either text."""
    found = re_hand_sign.findall(headline_text) + re_hand_sign.findall(rule_text)
    # Sorted so the result (and the JSON written from it) is reproducible:
    # iterating a set of strings has no stable order between interpreter runs.
    return sorted({int(code) for code in found})


def _store_headline(headlines, headline, rule):
    """Attach the pending rule (if any) to `headline` and file it under its code."""
    if headline is None:
        return
    if rule is not None:
        headline["rules"].append(rule)
    headlines[headline["code"]] = headline


def parse_rules(rules_str):
    """Parse raw rulebook text into a dict of headlines keyed by headline code.

    Each non-blank line is classified by its first space-separated word:

    * an integer <= 10        -> chapter title, ignored;
    * an integer >= 100       -> starts a new headline (the rest of the line
                                 is the headline text);
    * an integer in 11..99    -> not supported, raises NotImplementedError;
    * a word ending in ")"    -> starts a new rule, e.g. "3) text";
    * anything else           -> an interpretation of the current rule.

    Lines are stripped of surrounding whitespace before classification, and
    blank / whitespace-only lines are skipped.

    Parameters
    ----------
    rules_str : str
        The full rulebook text, lines separated by "\\n".

    Returns
    -------
    dict
        Maps headline code (int) to
        {"code": int, "headline": str, "rules": [rule, ...]}, where each rule
        is {"num": int, "rule": str, "interpretations": [str, ...],
        "sign_references": [int, ...]}. `sign_references` is sorted
        ascending and combines codes found in the headline text and in the
        rule text. Empty input yields an empty dict.

    Raises
    ------
    NotImplementedError
        If a line starts with an integer between 11 and 99.
    ValueError
        If a rule appears before any headline, an interpretation appears
        before any rule, or a rule marker such as "x)" is not numeric.
    """
    headlines = dict() # headline code -> headline dict (with its list of rules)

    current_headline = None
    current_rule = None

    for raw_line in rules_str.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        first_word = line.split(" ")[0].strip()

        # Keep the try narrow: only the int() conversion decides whether the
        # line is a numbered chapter/headline or rule/interpretation text.
        try:
            num = int(first_word)
        except ValueError:
            num = None

        if num is not None:
            if num <= 10: # new chapter: carries no data of its own
                continue
            if num < 100:
                raise NotImplementedError(
                    f"Unsupported leading number {num} in line: {line!r}"
                )

            # New headline: save the previous one together with its last rule.
            _store_headline(headlines, current_headline, current_rule)
            current_rule = None

            # partition() tolerates a headline line that consists of the code only.
            headline_text = line.partition(" ")[2].strip()
            current_headline = {
                "code": num,
                "headline": headline_text,
                "rules": []
            }
            continue

        if first_word.endswith(")"): # new rule
            if current_headline is None:
                raise ValueError(f"Rule found before any headline: {line!r}")
            if current_rule is not None:
                current_headline["rules"].append(current_rule)

            rule_num = int(first_word[:-1])
            rule_text = line.split(")", 1)[1].strip()

            current_rule = {
                "num": rule_num,
                "rule": rule_text,
                "interpretations": [],
                # Hand sign reference may be present in either rule text or headline
                "sign_references": _sign_references(
                    current_headline["headline"], rule_text
                )
            }
        else: # new interpretation
            if current_rule is None:
                raise ValueError(f"Interpretation found before any rule: {line!r}")
            current_rule["interpretations"].append(line)

    # Save final headline (no-op when the input contained none).
    _store_headline(headlines, current_headline, current_rule)

    return headlines

# Parse the rulebook text and persist the structured result as JSON.
rules = parse_rules(rules_str)

with open("rules.json", "w") as rules_json:
    json.dump(rules, rules_json, indent=4)

# pprint is imported here and also used by the signs cell further down.
from pprint import pprint
# pprint(rules)  # uncomment to inspect the parsed structure

In [4]:
# Read the raw hand-sign list, stripping surrounding whitespace from each line.
with open("raw/signs.txt") as signs_file:
    signs_lines = [raw_line.strip() for raw_line in signs_file.readlines()]

# Lines are consumed in pairs: "<code> <name>" followed by the description.
# A pair whose first line is empty is skipped.
signs = dict()
for name_line, description in zip(signs_lines[0::2], signs_lines[1::2]):
    if name_line:
        code_text, name = name_line.split(" ", 1)
        code = int(code_text)
        signs[code] = dict(code=code, name=name, description=description)

with open("signs.json", "w") as signs_json:
    json.dump(signs, signs_json, indent=4)

pprint(signs)

{801: {'code': 801,
       'description': 'The fingertips held perpendicluar to the palm of the '
                      'hand',
       'name': 'Stoppage of game/Time out'},
 802: {'code': 802,
       'description': 'The forearms held horizontally, the palms of the hands '
                      'downwards',
       'name': 'Face-off'},
 803: {'code': 803,
       'description': 'The arm held horizontally in the advantage direction, '
                      'the palm of the hand downwards',
       'name': 'Hit-in'},
 804: {'code': 804,
       'description': 'The arm held horizontally in the advantage direction, '
                      'the palm of the hand downwards',
       'name': 'Free-hit'},
 805: {'code': 805,
       'description': 'The arm held in the advantage direction, the palm of '
                      'the hand upwards',
       'name': 'Advantage'},
 806: {'code': 806,
       'description': 'The arms held above the head and crossed at the wrist, '
                      'with the

# Analyze sizes

In [5]:
chunks = []

# Running size estimate over the rule texts only; interpretation sizes are
# computed below but not added to this total.
total_len = 0

for headline_code, headline in rules.items():
    headline_label = f"{headline['code']} {headline['headline']}"

    for rule in headline["rules"]:
        rule_text = rule["rule"]
        rule_id = f"h_{headline_code}.r_{rule['num']}"

        # Size estimate: space-separated word count scaled by 1.33.
        rule_len = len(rule_text.split(" ")) * 1.33
        total_len += rule_len

        # One chunk per rule: the rule line first, then one line per interpretation.
        parts = [f"Rule {rule_id} regarding '{headline_label}': {rule_text}"]
        for interpretation_idx, interpretation in enumerate(rule["interpretations"]):
            interpretation_id = f"{rule_id}.i_{interpretation_idx}"
            intp_len = len(interpretation.split(" ")) * 1.33  # for inspection only
            parts.append(f"Interpretation {interpretation_id}: {interpretation}")

        chunk = "\n".join(parts)
        chunks.append(chunk)

# print(f"TOTAL {total_len:.0f} tokens")

with open("chunks.json", "w") as chunks_json:
    json.dump(chunks, chunks_json, indent=4)