In [1]:
# Fixture text: one numbered source entry with a single one-line sample, a
# trailing "(Note: ...)" that wraps onto a second line, and one
# "- band; song; album" line.
# The key/value lines below are the author's shorthand for the layout features
# this fixture exercises. It is not referenced by the cells visible here.
# location: start;
# timestamp: none;
# note: after sample;
# lines: sample 1, note 2;
# song info: normal
start_source = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
"""

# Fixture text: one numbered source entry with two usages. The first sample has
# a single "@" timestamp and no note; the second is a two-line sample containing
# inner single quotes, carries four comma-separated timestamps, and is followed
# by notes in three places (same line as the sample, on their own lines, and
# after the band line).
# The key/value lines below are the author's shorthand for those features.
# NOTE(review): "star" is presumably a typo for "start" — confirm.
# location: star + after start;
# timestamp: one + multi;
# note: none + after samples + after band;
# lines: sample 2, note 2; + inner quote
# song info: normal
second_source = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." @ 3:05
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

  "'Move on, '
   'move on.'" @ 1:39, 2:10, 3:57, 5:12 (Note: Police robot addressing crowd gathering after
   a shootout)
  (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
    (Note: Police robot addressing crowd gathering after
    a shootout)
"""

# Fixture text with two numbered entries back to back: entry 1 has the same
# body as start_source, entry 2 has the same body as second_source (only the
# order number differs).
source_variation = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

2. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." @ 3:05
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

  "'Move on, '
   'move on.'" @ 1:39, 2:10, 3:57, 5:12 (Note: Police robot addressing crowd gathering after
   a shootout)
  (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
    (Note: Police robot addressing crowd gathering after
    a shootout)
"""

from lark import Lark, Transformer
from rich import print as rprint

# Lark grammar for a single "contained string": text wrapped in matching quotes
# or in (), {} or [].
# - `quoted` is one regex whose closing quote must equal the opening one (\1);
#   the `.*` is greedy and the `s` flag lets it span newlines, so the match runs
#   to the LAST occurrence of that quote character.
# - The STR_* terminals let each bracketed form contain the other bracket kinds
#   and both quote characters, but not its own delimiters.
# - Spaces between tokens are ignored (%ignore), while STRING itself still
#   accepts a space as content.
grammar = r"""
start: contained_string

contained_string: quoted | parenthesis | braces | brackets

quoted      : /([\"\'])(.*)(\1)/s
parenthesis : "(" STR_PAREN ")"
braces      : "{" STR_BRACE "}"
brackets    : "[" STR_BRACK "]"

STR_PAREN: (STRING | "[" | "]" | "{" | "}" | "\"" | "'")+
STR_BRACE: (STRING | "[" | "]" | "(" | ")" | "\"" | "'")+
STR_BRACK: (STRING | "(" | ")" | "{" | "}" | "\"" | "'")+
STRING: /[\w.,!?:\- \n]/
%ignore " "

"""
# CONT_STRING: LETTER+
# %import common.LETTER

# Inputs for the contained-string grammar. `samples` and `expected` are parallel
# lists: expected[i] is the inner text that samples[i] should yield once the
# outer delimiters are removed (the loop further down zips the two together).
samples = [
    ' "sam ple"',
    ' "sam\nple"',
    "'sa'mple'",
    """'samp"le'""",
    "[sample]",
    "(sam'p'le)",
    '(sam"p"le)',
    "{sample}",
    "[sa(mple]",
    "[sam(pl)e]",
    "\"'s'a'm'p'l'e'\"",
    "\"\n'Cernan: May's the month this year.'\n\""
]
# Expected inner text, index-aligned with `samples`.
expected = [
    "sam ple",
    "sam\nple",
    "sa'mple",
    """samp"le""",
    "sample",
    "sam'p'le",
    'sam"p"le',
    "sample",
    "sa(mple",
    "sam(pl)e",
    "'s'a'm'p'l'e'",
    "\n'Cernan: May's the month this year.'\n",
]

# Parser for the contained-string grammar defined above (Lark defaults).
parser = Lark(grammar=grammar)

class CSTransformer(Transformer):
    """Reduce a parse tree of the contained-string grammar to its inner text.

    Every rule collapses to a plain string, so transforming a whole tree
    returns the text between the outer delimiters.
    """

    def __default__(self, data, children, meta):
        """Fallback for rules without their own callback.

        Used for ``parenthesis``, ``braces`` and ``brackets``, whose only kept
        child is the STR_* token holding the inner text.
        """
        return children[0].value

    def quoted(self, nodes):
        """Return the text between the opening and closing quote.

        The ``quoted`` regex guarantees that the first and last characters are
        the same quote character, so exactly that one pair is removed. Chained
        ``strip`` calls would also eat quotes that belong to the content
        (``'"x"'`` would become ``x`` instead of ``"x"``).
        """
        return str(nodes[0])[1:-1]

    def contained_string(self, nodes):
        """Pass through the already-extracted inner text of the single alternative."""
        return nodes[0]

    def start(self, nodes):
        """Return the inner text of the single contained string."""
        return nodes[0]

transformer = CSTransformer()

# zip() silently drops unmatched trailing items, so make sure the two fixture
# lists have not drifted apart before comparing them pairwise.
assert len(samples) == len(expected), "samples and expected must have the same length"

# Parse every sample and report, per index, either a parsing failure or a
# mismatch with the expected inner text. Nothing is printed for a pass.
for ind, (sample, expect) in enumerate(zip(samples, expected)):
    try:
        parsed = parser.parse(sample)
        transformed = transformer.transform(parsed)
    except Exception as e:
        print(f"-----FAILED parsing {ind}------")
        print(str(e))
        # There is no result for this sample; comparing would reuse the value
        # from the previous iteration (or hit an undefined name on the first).
        continue
    if transformed != expect:
        print(f"-----FAILED expectation {ind}------")
        print(transformed)


In [2]:
import re
from collections import ChainMap

from lark import Tree, Discard, Lark, Transformer
from rich import print as rprint

# Lark grammar for the full samples list.
# - The input is an optional leading newline followed by one or more
#   `sample_source` entries.
# - Each entry is a header line (`source_info`: order number, ".", name, points,
#   stats) followed by one or more `usage_info` groups.
# - A `usage_info` is one or more `sample_info` lines, then "-" and a
#   "band; song; album" line with an optional note line after it.
# - A `sample_info` line is a contained string with optional "@" timestamps and
#   an optional note; "*" markers may appear between the parts.
# - `quoted` here is non-greedy (`.*?`), so it stops at the first matching
#   closing quote.
# - Spaces between tokens are ignored; `_NL` is an optional CR followed by LF.
samples_grammar = r"""
start: [_NL] sample_source+
sample_source: source_info usage
usage: usage_info+

source_info: order "." name points stats _NL
usage_info: samples "-" band_info [_NL]
samples: sample_info+

order: INT
name: SENTENCE
points: contained_string
stats: contained_string

sample_info: sample "*"* timestamps "*"* sample_note "*"* _NL
sample: contained_string
sample_note: contained_string?
timestamps: ("@" TIMESTAMP ("," TIMESTAMP)*)?
TIMESTAMP: INT ":" ["0"] INT

band_info: band ";" song ";" album _NL band_note?

band_note: contained_string _NL

contained_string: quoted | parenthesis | braces | brackets
quoted      : /([\"\'])(.*?)(\1)/s
parenthesis : "(" STR_PAREN ")"
braces      : "{" STR_BRACE "}"
brackets    : "[" STR_BRACK "]"

band:SENTENCE
song:SENTENCE
album:SENTENCE | contained_string

STR_PAREN: (STRING_NL | "[" | "]" | "{" | "}" | "\"" | "@" | "'" | "." | ";")+
STR_BRACE: (STRING_NL | "[" | "]" | "(" | ")" | "\"" | "@" | "'" | "." | ";")+
STR_BRACK: (STRING_NL | "(" | ")" | "{" | "}" | "\"" | "@" | "'" | "." | ";")+
STRING_NL: /[\w.,!?:\- \&=+*\n\"\/#<>°%]/
STRING: /[\w.,!?:\- \&\(\)=+*\"\/\'\^<>#$%@°]/
WORD: STRING+
SENTENCE: WORD [" " WORD]*

%import common.INT
%import common.LETTER
%ignore " "
%import common.CR
%import common.LF
_NL: CR? LF

"""

class SamplesTransformer(Transformer):
    """Convert a parse tree of ``samples_grammar`` into plain dicts and lists.

    Leaf rules return one-key dicts (``{'band': ...}``, ``{'order': ...}``);
    the ``*_info`` rules merge the dicts of their children, and the container
    rules return lists. The overall result is a list with one
    ``{'source': ..., 'usage': ...}`` dict per numbered entry.
    """

    def SENTENCE(self, node):
        """Terminal callback: the sentence text without surrounding whitespace."""
        return str(node).strip()

    def __default__(self, data, children, meta):
        """Keep rules without a dedicated callback as ordinary ``Tree`` nodes."""
        return Tree(data, children, meta)

    def quoted(self, nodes):
        """Return the text between the opening and closing quote.

        The ``quoted`` regex guarantees that the first and last characters are
        the same quote character, so exactly that pair is dropped. Chained
        ``strip`` calls would also remove quotes that are part of the content.
        """
        return str(nodes[0])[1:-1]

    def contained_string(self, nodes):
        """Return the inner text of a quoted or bracketed string.

        ``quoted`` has already been reduced to a ``str``; the bracketed forms
        arrive as a ``Tree`` (see ``__default__``) whose single child is the
        STR_* token holding the text.
        """
        inner = nodes[0]
        if isinstance(inner, str):
            return inner
        return inner.children[0].value

    def sample_note(self, nodes):
        """Return ``{'sample_note': text}``, or ``None`` when there is no note.

        Notes may wrap over several lines. Each line break is replaced by a
        space (not deleted, which would glue two words together when the
        continuation line is not indented), runs of spaces are collapsed and
        the ends are trimmed.
        """
        if not nodes:
            return {'sample_note': None}
        joined = nodes[0].replace('\n', ' ')
        return {'sample_note': re.sub(r'  +', ' ', joined).strip()}

    def timestamps(self, nodes):
        """Return ``{'timestamps': [...]}`` with the raw token texts, or ``None`` if absent."""
        if not nodes:
            return {'timestamps': None}
        return {'timestamps': [node.value for node in nodes]}

    def sample(self, nodes):
        """Wrap the sample text."""
        return {'sample': nodes[0]}

    def band(self, nodes):
        """Wrap the band name."""
        return {'band': nodes[0]}

    def song(self, nodes):
        """Wrap the song title."""
        return {'song': nodes[0]}

    def album(self, nodes):
        """Wrap the album title."""
        return {'album': nodes[0]}

    def band_note(self, nodes):
        """Wrap the note that follows a band line (text is kept as parsed)."""
        return {'band_note': nodes[0]}

    def points(self, nodes):
        """Return ``{'points': n}`` from text whose first word is the number."""
        return {'points': int(nodes[0].split()[0])}

    def name(self, nodes):
        """Wrap the source name."""
        return {'name': nodes[0]}

    def order(self, nodes):
        """Return ``{'order': n}`` from the entry number."""
        return {'order': int(nodes[0])}

    def stats(self, nodes):
        """Return ``{label: count}`` for each comma-separated ``"<count> <label>"`` item.

        Items are split on the comma alone and then on any whitespace, so a
        stats list that wraps onto a new line after a comma is still read
        correctly.
        """
        counts = {}
        for item in nodes[0].split(','):
            words = item.split()
            if not words:
                # Tolerate a stray or trailing comma.
                continue
            counts[words[1]] = int(words[0])
        return counts

    def sample_info(self, nodes):
        """Merge the sample, timestamps and note dicts of one sample line.

        A line with neither note nor timestamps whose text contains ``'Note'``
        is treated as a stand-alone note rather than a sample: it is reported
        on stdout and removed from the result by returning ``Discard``.
        """
        sample_info = dict(ChainMap(*nodes))
        is_stray_note = (
            sample_info['sample_note'] is None
            and sample_info['timestamps'] is None
            and 'Note' in sample_info['sample']
        )
        if is_stray_note:
            print(f'Discarding: {sample_info}')
            return Discard
        return sample_info

    def band_info(self, nodes):
        """Merge the band, song, album and optional band-note dicts."""
        return dict(ChainMap(*nodes))

    def source_info(self, nodes):
        """Merge the order, name, points and stats dicts of an entry header."""
        return dict(ChainMap(*nodes))

    def samples(self, nodes):
        """Return the list of sample dicts that survived ``sample_info``."""
        return nodes

    def usage_info(self, nodes):
        """Pair a group of samples with the song that uses them."""
        return {'samples': nodes[0], 'song': nodes[1]}

    def usage(self, nodes):
        """Return the list of usage dicts for one source."""
        return nodes

    def sample_source(self, nodes):
        """Combine an entry header with its usages."""
        return {'source': nodes[0], 'usage': nodes[1]}

    def start(self, nodes):
        """Return the list of all transformed entries."""
        return nodes
        
# Parser and transformer for the full samples-list grammar.
# NOTE: this rebinds `parser` and `transformer`, which an earlier cell bound to
# the contained-string grammar and CSTransformer; from here on both names refer
# to the samples-list versions.
parser = Lark(grammar=samples_grammar)
transformer = SamplesTransformer()


In [14]:

# try:
#   parsed = parser.parse(sample)
#   transformed = transformer.transform(parsed)
#   rprint(transformed)
# except Exception as e:
#    print(str(e))

In [32]:
# Start from a clean slate so a failed run cannot leave the results of an
# earlier execution behind under the same names.
parsed = transformed = None

# NOTE: rebinds `samples` (a list of test strings in an earlier cell) to the
# full text of the samples file.
with open('samples_list.txt', 'r') as file:
    samples = file.read()

try:
    parsed = parser.parse(samples)
    transformed = transformer.transform(parsed)
except Exception as e:
    # Best effort: report the problem instead of aborting the notebook run;
    # `parsed` / `transformed` stay None for whichever step did not complete.
    print(str(e))

Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: All samples taken from the dubbed German version'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: Also available on the Brainchild version of\n   Enshrined on the album Mindwarp'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: Also found on the Brainchild version of Course of Ruin on the\n   album Mindwarp'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: Also found on the Brainchild version of Pale Reflection on the\n   album Mindwarp'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: Taken from the dubbed German version'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note:   = It possible that these are not actual movie samples, but only\n   quotes from the movie spoken by someone.'}
Discarding: {'sample_note': None, 'timestamps': None, 'sample': 'Note: The Space Shuttle'}
Discarding: {'sample_note': Non