In [1]:
# location: start;
# timestamp: none;
# note: after sample;
# lines: sample 1, note 2;
# song info: normal
start_source = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
"""

# location: star + after start;
# timestamp: one + multi;
# note: none + after samples + after band;
# lines: sample 2, note 2; + inner quote
# song info: normal
second_source = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." @ 3:05
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

  "'Move on, '
   'move on.'" @ 1:39, 2:10, 3:57, 5:12 (Note: Police robot addressing crowd gathering after
   a shootout)
  (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
    (Note: Police robot addressing crowd gathering after
    a shootout)
"""

source_variation = """
1. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

2. Blade Runner [738 points] (87 groups, 116 songs, 221 samples)
  "Move on, move on." @ 3:05
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble

  "'Move on, '
   'move on.'" @ 1:39, 2:10, 3:57, 5:12 (Note: Police robot addressing crowd gathering after
   a shootout)
  (Note: Police robot addressing crowd gathering after
   a shootout)
    - Age of Chance; This is Crush Collision; One Thousand Years of Trouble
    (Note: Police robot addressing crowd gathering after
    a shootout)
"""

from lark import Lark, Transformer
from rich import print as rprint

grammar = r"""
start: contained_string

contained_string: quoted | parenthesis | braces | brackets

quoted      : /([\"\'])(.*)(\1)/s
parenthesis : "(" STR_PAREN ")"
braces      : "{" STR_BRACE "}"
brackets    : "[" STR_BRACK "]"

STR_PAREN: (STRING | "[" | "]" | "{" | "}" | "\"" | "'")+
STR_BRACE: (STRING | "[" | "]" | "(" | ")" | "\"" | "'")+
STR_BRACK: (STRING | "(" | ")" | "{" | "}" | "\"" | "'")+
STRING: /[\w.,!?:\- \n]/
%ignore " "

"""
# CONT_STRING: LETTER+
# %import common.LETTER

samples = [
    ' "sam ple"',
    ' "sam\nple"',
    "'sa'mple'",
    """'samp"le'""",
    "[sample]",
    "(sam'p'le)",
    '(sam"p"le)',
    "{sample}",
    "[sa(mple]",
    "[sam(pl)e]",
    "\"'s'a'm'p'l'e'\"",
    "\"\n'Cernan: May's the month this year.'\n\"",
]
expected = [
    "sam ple",
    "sam\nple",
    "sa'mple",
    """samp"le""",
    "sample",
    "sam'p'le",
    'sam"p"le',
    "sample",
    "sa(mple",
    "sam(pl)e",
    "'s'a'm'p'l'e'",
    "\n'Cernan: May's the month this year.'\n",
]

parser = Lark(grammar=grammar)


class CSTransformer(Transformer):
    def __default__(self, data, children, meta):
        return children[0].value

    def quoted(self, nodes):
        return nodes[0].strip("'").strip('"')

    def contained_string(self, nodes):
        return nodes[0]

    def start(self, nodes):
        return nodes[0]


transformer = CSTransformer()

for ind, (sample, expect) in enumerate(zip(samples, expected)):
    try:
        parsed = parser.parse(sample)
        transformed = transformer.transform(parsed)
        # par_val = parsed.children[0].children[0].children[0].value
        par_val = transformed
    except Exception as e:
        print(f"-----FAILED parsing {ind}------")
        print(str(e))
    try:
        assert par_val == expect
    except Exception as e:
        print(f"-----FAILED expectation {ind}------")
        print(par_val)

In [2]:
from rich import print as rprint
from collections import ChainMap
from lark import Tree, Discard, Lark, Transformer
import pandas as pd

samples_grammar = r"""
start: [_NL] sample_source+
sample_source: source_info usage
usage: usage_info+

source_info: order "." name points stats _NL
usage_info: samples "-" band_info [_NL]
samples: sample_info+

order: INT
name: SENTENCE
points: contained_string
stats: contained_string

sample_info: sample "*"* timestamps "*"* sample_note "*"* _NL
sample: contained_string
sample_note: contained_string?
timestamps: ("@" TIMESTAMP ("," TIMESTAMP)*)?
TIMESTAMP: INT ":" ["0"] INT

band_info: band ";" song ";" album _NL band_note?

band_note: contained_string _NL

contained_string: quoted | parenthesis | braces | brackets
quoted      : /([\"\'])(.*?)(\1)/s
parenthesis : "(" STR_PAREN ")"
braces      : "{" STR_BRACE "}"
brackets    : "[" STR_BRACK "]"

band:SENTENCE
song:SENTENCE
album:SENTENCE | contained_string

STR_PAREN: (STRING_NL | "[" | "]" | "{" | "}" | "\"" | "@" | "'" | "." | ";")+
STR_BRACE: (STRING_NL | "[" | "]" | "(" | ")" | "\"" | "@" | "'" | "." | ";")+
STR_BRACK: (STRING_NL | "(" | ")" | "{" | "}" | "\"" | "@" | "'" | "." | ";")+
STRING_NL: /[\w.,!?:\- \&=+*\n\"\/#<>°%]/
STRING: /[\w.,!?:\- \&\(\)=+*\"\/\'\^<>#$%@°]/
WORD: STRING+
SENTENCE: WORD [" " WORD]*

%import common.INT
%import common.LETTER
%ignore " "
%import common.CR
%import common.LF
_NL: CR? LF

"""


class SamplesTransformer(Transformer):
    def SENTENCE(self, node):
        return str(node).strip()

    def __default__(self, data, children, meta):
        return Tree(data, children, meta)

    def quoted(self, nodes):
        return nodes[0].strip("'").strip('"')

    def contained_string(self, nodes):
        if isinstance(nodes[0], str):
            return nodes[0]
        else:
            return nodes[0].children[0].value

    def sample_note(self, nodes):
        if nodes != []:
            note = nodes[0].replace('\n', '')
            import re

            note = re.sub(re.compile(r'  +'), ' ', note)
        else:
            note = None
        return {'sample_note': note}

    def timestamps(self, nodes):
        if len(nodes):
            return {'timestamps': str([node.value for node in nodes])}
        else:
            return {'timestamps': None}

    def sample(self, nodes):
        return {'sample': nodes[0]}

    def band(self, nodes):
        return {'band': nodes[0]}

    def song(self, nodes):
        return {'song': nodes[0]}

    def album(self, nodes):
        return {'album': nodes[0]}

    def band_note(self, nodes):
        return {'band_note': nodes[0]}

    def points(self, nodes):
        return {'points': int(nodes[0].split()[0])}

    def name(self, nodes):
        return {'name': nodes[0]}

    def order(self, nodes):
        return {'order': int(nodes[0])}

    def stats(self, nodes):
        return {stat.split()[1]: int(stat.split()[0]) for stat in nodes[0].split(', ')}

    def sample_info(self, nodes):
        sample_info = dict(ChainMap(*nodes))
        if (
            sample_info['sample_note'] is None
            and sample_info['timestamps'] is None
            and 'Note' in sample_info['sample']
        ):
            # print(f'Discarding: {sample_info}')
            file.write(f'{str(sample_info["sample"])}\n')
            return Discard
        else:
            return sample_info

    def band_info(self, nodes):
        return dict(ChainMap(*nodes))

    def source_info(self, nodes):
        return dict(ChainMap(*nodes))

    def samples(self, nodes):
        return nodes

    def usage_info(self, nodes):
        for node in nodes[0]:
            node.update(nodes[1])
        return nodes[0]

    def usage(self, nodes):
        usage = []
        for node in nodes:
            usage.extend(node)
        return usage

    def sample_source(self, nodes):
        for node in nodes[1]:
            node.update(nodes[0])
        return nodes[1]

    def start(self, nodes):
        start = []
        for node in nodes:
            start.extend(node)
        return start


parser = Lark(grammar=samples_grammar)
transformer = SamplesTransformer()

In [3]:
with open('samples_list.txt', 'r') as file:
    samples = file.read()

try:
    parsed = parser.parse(samples)
    with open('discards.txt', 'w') as file:
        transformed = transformer.transform(parsed)
    df = pd.DataFrame(transformed)
except Exception as e:
    print(str(e))
# rprint(transformed)
display(df)

Unnamed: 0,sample_note,timestamps,sample,album,song,band,groups,songs,samples,points,name,order,band_note,group,groups;
0,Note: Police robot addressing crowd gathering ...,,"Move on, move on.",One Thousand Years of Trouble,This is Crush Collision,Age of Chance,87.0,116.0,221.0,738,Blade Runner,1,,,
1,"Note: ""All those moments will be lost in time""",['0:12'],All diese Momente werden verloren sein in der ...,Half Rotten and Decayed,Silence besides the Sun,Amgod,87.0,116.0,221.0,738,Blade Runner,1,,,
2,"Note: ""Time to die.""",['0:24'],Zeit zu sterben.,Half Rotten and Decayed,Silence besides the Sun,Amgod,87.0,116.0,221.0,738,Blade Runner,1,,,
3,Note: Just before the final fight with Deckard,['0:48'],Roy howling like a wolf,Half Rotten and Decayed,Silence besides the Sun,Amgod,87.0,116.0,221.0,738,Blade Runner,1,,,
4,"Note: ""Attack ships on fire off the shoulder o...",['1:36'],"Gigantische Schiffe, die brannten, draußen vor...",Half Rotten and Decayed,Silence besides the Sun,Amgod,87.0,116.0,221.0,738,Blade Runner,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8278,,,music effects from the game,Fatalist,Fatalist (Creative Mix),Front Line Assembly,0.0,0.0,0.0,0,Splatter House 3 (Video Game),1292,,,
8279,,,The entire song is a loop of the music from Th...,Sexplosion!,Mood No. 5,My Life with the Thrill Kill Kult,0.0,0.0,0.0,0,Thunderball,1292,,,
8280,Note: This horn flare is actually part of the ...,['2:08'],horn sample,Sexplosion!,Sexplosion!,My Life with the Thrill Kill Kult,0.0,0.0,0.0,0,Thunderball,1292,,,
8281,,,May Day screaming in the mine pit after the ex...,In Slaughter Natives,Cryptic Slaughter,In Slaughter Natives,0.0,0.0,0.0,0,"View to a Kill, A",1292,,,


In [5]:
df.to_pickle('samples.pkl')
