In [7]:
from itertools import islice
from pathlib import Path
import json
import re

import requests

from mtg_load import load_mtg_oracle


# One parenthesised span: "(", any run of characters other than ")", then ")".
parentheses_re = re.compile(r'\([^)]*\)')


def remove_parentheses(text):
    """Return ``text`` with every ``(...)`` span deleted.

    Matching stops at the first ``)``, so nested parentheses are not
    balanced, and an opening parenthesis with no closing one is left as is.
    """
    return re.sub(parentheses_re, '', text)


def remove_punctuation(text: str) -> str:
    """Replace punctuation in ``text`` with spaces, keeping a few symbols.

    Character handling:

    * newline, carriage return and tab each become a single space;
    * other whitespace, letters and digits are kept unchanged;
    * ``~ { } + - /`` are kept, and a space is added after every ``}`` so
      that adjacent brace groups such as ``{1}{G}`` end up separated;
    * every other character is replaced by a space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    line_whitespace = {"\n", "\r", "\t"}
    kept_symbols = {"~", "{", "}", "+", "-", "/"}

    out = []
    for c in text:
        if c in line_whitespace:
            out.append(" ")
        # Must be ``elif``: with a plain ``if`` the newline/tab handled above
        # also satisfies ``isspace()`` and would be appended a second time,
        # unchanged, right after its replacement space.
        elif c.isspace() or c.isalnum():
            out.append(c)
        elif c in kept_symbols:
            out.append(c)
            if c == "}":
                # Separate a closing brace from whatever follows it.
                out.append(" ")
        else:
            out.append(" ")

    return "".join(out)


def clean_oracle_text(text: str) -> list[str]:
    """Split ``text`` into a list of normalised word tokens.

    Parenthesised spans are removed first, newlines become spaces,
    punctuation is stripped via ``remove_punctuation``, the result is
    lower-cased and finally split on whitespace.
    """
    flattened = remove_parentheses(text).replace("\n", " ")
    return remove_punctuation(flattened).lower().split()


# Load the cards and reduce each one to a (name, oracle_text) pair.
# "oracle_text" is read with .get(), so it is None for cards lacking that key.
# Every stage below is a lazy generator expression: nothing is processed until
# the final stage is iterated, and each stage can be consumed only once.
data = load_mtg_oracle()
data = ((card["name"], card.get("oracle_text")) for card in data)

# Replace each card's own name inside its text with "~"; cards whose text is
# None are dropped here.
no_card_names = ((name, text.replace(name, "~")) for name, text in data if text is not None)

# Turn each text into a list of cleaned tokens via clean_oracle_text().
cleaned = ((name, clean_oracle_text(text)) for name, text in no_card_names)

# Disabled debug print.
# NOTE(review): `word_splits` is not defined in the visible code -- presumably
# an earlier name for `cleaned`; confirm before re-enabling. Iterating
# `cleaned` here would exhaust the generator before any later use of it.
# for name, text in word_splits:
#     print(name, " ".join(text), sep="\n", end="\n\n")

# Padding tokens placed around a tokenised text.
lpad = "<lpad>"
rpad = "<rpad>"

# Token -> integer id. The padding tokens own the first two ids.
bag_of_words: dict[str, int] = {rpad: 0, lpad: 1}

# Next free id; every previously unseen token takes it, in order of first
# appearance across the cleaned texts.
index: int = 2
for name, text in cleaned:
    for word in text:
        if word not in bag_of_words:
            bag_of_words[word] = index
            index += 1


def vectorize(text: str) -> list[int]:
    """Convert ``text`` to a list of vocabulary ids wrapped in padding ids.

    The text is tokenised with ``clean_oracle_text``, ``lpad`` is put in
    front and ``rpad`` at the end, and each token is looked up in
    ``bag_of_words``.

    Raises:
        KeyError: If a token is not present in ``bag_of_words``.
    """
    tokens = [lpad, *clean_oracle_text(text), rpad]
    return [bag_of_words[token] for token in tokens]


def windows(seq: list[int], n: int) -> list[list[int]]:
    """Return every contiguous window of width ``n`` over ``seq``, in order.

    Args:
        seq: The items to slide over. Any iterable is accepted; it is
            copied into a list first.
        n: Window width; must be at least 1.

    Returns:
        A list of ``len(seq) - n + 1`` lists, each of exactly ``n`` items.
        The list is empty when ``seq`` has fewer than ``n`` items, so no
        window narrower than ``n`` is ever produced.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"window width must be at least 1, got {n}")

    items = list(seq)
    # range() is empty when len(items) < n, which yields no windows at all
    # rather than a single short one.
    return [items[start:start + n] for start in range(len(items) - n + 1)]

# Demo: vectorize a sample rules text, then display the id list together with
# its width-3 sliding windows (the bare tuple is the notebook cell's output).
v = vectorize("Destroy target creature.")

v, windows(v, 3)

([1, 162, 26, 19, 0], [[1, 162, 26], [162, 26, 19], [26, 19, 0]])