# Goals:
- (convert) simulate left arrow until end with delay
- (custom ocr) image to text
- (tokenizer) text to words
- (anki-tool) words to anki



Rules for this notebook: 
- imports all at top, 
- make everything into functions
- modularize and streamline

In [1]:
# %pip install pyautogui
# %pip install PyGetWindow
# %pip install keyboard
# %pip install PyQt5 --user
# %pip install win32gui

In [2]:
import functools
import json

import os
import re
import subprocess
import sys
import time
import urllib.request
from collections import defaultdict
from datetime import (
    date,
    datetime,
    timedelta
)
from pathlib import Path

import keyboard
import MeCab
import pyautogui
import pygetwindow
import win32gui
from jisho_api.word import Word
from PIL import Image, ImageGrab


# (automation)
- find kindle window
- take screenshot
- left arrow, wait, repeat

In [3]:
# Page Turner and Screen Capture
def page_turner(seconds, output_dir):

    # This Kindle Window part was ChatGPT assisted
    print("Focusing on Kindle window")
    kindle_window = None
    for window in pygetwindow.getAllTitles():
        if "Kindle" in window:
            kindle_window = pygetwindow.getWindowsWithTitle(window)[0]
            kindle_window.activate()
            break
    if not kindle_window:
        print("Kindle is not open. Ending")
        return 

    print("page_turner starting in 3 seconds...")
    time.sleep(3)
    if not os.path.exists(output_dir):
        print(f"Output dir does not exist: {output_dir}")
        return
    for t in range(seconds):
        timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
        sc: Image = pyautogui.screenshot(f'{output_dir}/{timestamp}_{t}.png')
        pyautogui.press('left')
        time.sleep(1)

In [4]:
# It worked!! -> did on 
# -> K Evan's Kindle for PC -GA文庫 & GAノベル 2018年5月の新刊 全作品立読み(合本版) (GA文庫) (Japanese Edition)
# page_turner(60, "full-book-notes-img")

# (ocr)
- take img dir
- run each img through ocr script
- save each txt to another dir

In [5]:
def run_ocr(input_file = "file/img.jpg", output_file = "file/imgjpg.txt"):
    custom_ocr_path = "/nav/gith/jp-card-gen/custom-ocr/src/"
    cmd = f'cd {custom_ocr_path} && python -u main.py -m main -i "{input_file}" -o "{output_file}" && echo "Done" '

    """
    Below Part Generated by ChatGPT
    """
    try:
        # Run the command
        process = subprocess.run(
            cmd,
            shell=True,                # Use shell=True to execute the command in a shell
            stdout=subprocess.PIPE,    # Capture standard output
            stderr=subprocess.PIPE,    # Capture standard error
            text=True                  # Decode output to strings
        )

        # Wait for the process to complete
        # subprocess.run does this automatically

        # Access the output
        stdout = process.stdout
        stderr = process.stderr
        return_code = process.returncode

        # print(f"Standard Output:\n{stdout}")
        # print(f"Standard Error:\n{stderr}")
        # print(f"Return Code: {return_code}")
        print(f"RC: {return_code}, for {input_file}, {output_file}")

    except Exception:
        print("An error occurred: ")

In [6]:
def convert_book_to_txt(image_dir, text_dir):
    if not os.path.exists(image_dir):
        print(f"Image directory does not exist: {image_dir}")
        return 

    # Gather files
    directory = Path(image_dir)
    # maybe could simplify this?
    files = sorted(file.name for file in directory.iterdir() if file.is_file())
    ios = []
    for file in files:
        # print(file)
        # if "png" in file:
        file_base_name = file.split(".")[0]
        ios.append((image_dir+file, text_dir+file_base_name+".txt"))
    print(len(ios))
    if not os.path.exists(text_dir):
        os.mkdir(text_dir)

    for i, o in ios:
        # print(i, o)
        run_ocr(input_file=i, output_file=o)

In [7]:
dir_base_name = "full-book-notes"
full_path = f"/nav/gith/jp-card-gen/{dir_base_name}"
# r = range(0, 5)
# r = range(5, 60)

# This worked!!!
# convert_book_to_txt(image_dir=full_path+"-img/", text_dir=full_path+"-text/")

# (separate)
- loop over all text files
- (take from jp-lang-tokenizers.ipynb) mecab each file
- 

In [8]:
dictdir = "c:\\Python312\\Lib\\site-packages\\unidic_lite\\dicdir"

In [9]:
def clean(words):
    """Remove spaces, and all punctuation"""
    new_hl = []
    punc = ''' !()-[]{};:'"\\,<>./?@#$^&*_~「」（）・＿。｛｝；：！
    『』【】…？,、－、〟
    '''
    # also remove numbers
    numbers = "1234567890１２３４５６７８９０"
    for hl in words:
        for c in hl:
            if c in punc:
                hl = hl.replace(c, "")
            elif c in numbers:
                hl = hl.replace(c, "")

        if hl != '':
            new_hl.append(hl)
    return new_hl

def exclude_kana_only_words(word_list):
    """
    GENERATED BY CHATGPT
    Exclude words that consist only of Hiragana, Katakana, or related symbols.
    
    Args:
    - word_list (list of str): A list of words to filter.
    
    Returns:
    - list of str: Words that are not exclusively Kana.
    """
    # Regular expression for Hiragana, Katakana, and associated symbols
    kana_pattern = re.compile(r'^[\u3040-\u309F\u30A0-\u30FFーぁィっa-zA-Z]+$')
    
    # Filter out Kana-only words
    return [word for word in word_list if not kana_pattern.fullmatch(word)]

In [10]:
def create_list(text, no_print=False):
    # tagger = MeCab.Tagger() # change where the mecabrc / dictdir is...
    tagger = MeCab.Tagger(f'-r "{dictdir+"\\mecabrc"}" -d "{dictdir}"')
    tag_parsed = set()
    for line in tagger.parse(text).splitlines():
        ls = line.split()
        if not no_print:
            print(len(ls), ls)
        word = None
        for item in reversed(ls):
            if "動詞" in item:
                word = ls[3]
                break
                # ls[3] is dict form for verbs
            if "形容詞" in item:
                word = ls[3]
                # ls[3] is dict form for adjs
            if "名詞" in item:
                word = ls[0]
                break
        if word==None:
            word = ls[0]
        tag_parsed.add(word)
    return tag_parsed

In [11]:
# TODO LATER: sort by kana...
def text_parse_clean(text_dir) -> (set[str], list[str]): 
    """Create word list from text file"""
    # Modified to take a list intead of single
    if not os.path.exists(text_dir):
        print(f"Image directory does not exist: {text_dir}")
        return 

    # Gather files
    directory = Path(text_dir)
    fpaths = sorted(file for file in directory.iterdir() if file.is_file())

    all_text = set()
    all_results = []
    all_raw: int = 0
    all_cleaned: int = 0


    for fpath in fpaths:
        with open(fpath, 'r', encoding='utf-8') as file:
            text = file.read()
        text_parsed = create_list(text, no_print=True)
        text_cleaned = clean(text_parsed)
        text_kanaless = exclude_kana_only_words(text_cleaned)
        # print(sorted(text_kanaless))
        
        results = f"{len(text_parsed)} -> {len(text_cleaned)} -> {len(text_kanaless)} -> {fpath}"
        print(results)

        all_raw += len(text_parsed)
        all_cleaned += len(text_cleaned)
        all_results.append(results)

        for word in text_kanaless:
            all_text.add(word)

    total = f"Files: {len(fpaths)} ---> Raw: {all_raw} ---> Cleaned: {all_cleaned} ---> Total: {len(all_text)}"
    print(total)
    all_results.append(total)
    return all_text, all_results

In [12]:
def write_file(FPATH, word_list):
    with open(FPATH, 'w', encoding='utf-8') as file:
        for item in word_list:
            file.write(f"{item}\n")

In [13]:
OLD_FILE_DIR = "full-book-notes-text"
NEW_FILE_PATH = "full-book-notes-results/parsed.txt"
LOG_FILE_PATH = "full-book-notes-results/parsed.log"

In [14]:
full_list, results = text_parse_clean(OLD_FILE_DIR)
write_file(NEW_FILE_PATH, full_list)
write_file(LOG_FILE_PATH, results)

358 -> 336 -> 205 -> full-book-notes-text\2024-12-08T20-15-15_0.txt
371 -> 345 -> 214 -> full-book-notes-text\2024-12-08T20-15-16_1.txt
443 -> 417 -> 250 -> full-book-notes-text\2024-12-08T20-15-17_2.txt
351 -> 322 -> 170 -> full-book-notes-text\2024-12-08T20-15-19_3.txt
294 -> 268 -> 149 -> full-book-notes-text\2024-12-08T20-15-20_4.txt
64 -> 43 -> 7 -> full-book-notes-text\2024-12-08T20-15-21_5.txt
370 -> 345 -> 207 -> full-book-notes-text\2024-12-08T20-15-23_6.txt
375 -> 351 -> 222 -> full-book-notes-text\2024-12-08T20-15-24_7.txt
119 -> 93 -> 34 -> full-book-notes-text\2024-12-08T20-15-25_8.txt
72 -> 50 -> 11 -> full-book-notes-text\2024-12-08T20-15-27_9.txt
116 -> 93 -> 34 -> full-book-notes-text\2024-12-08T20-15-28_10.txt
136 -> 105 -> 58 -> full-book-notes-text\2024-12-08T20-15-29_11.txt
106 -> 86 -> 24 -> full-book-notes-text\2024-12-08T20-15-30_12.txt
64 -> 42 -> 6 -> full-book-notes-text\2024-12-08T20-15-32_13.txt
97 -> 78 -> 24 -> full-book-notes-text\2024-12-08T20-15-33_14.

# (comparator)
- (shorten anki-tool.ipynb) anki comparator tool

In [15]:
def request(action, **params):
    return {'action': action, 'params': params, 'version': 6}

def invoke(action, debug=False, **params):
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    if debug: 
        print(requestJson)
    response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson)))
    if debug: 
        print(response)
    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')
    if 'error' not in response:
        raise Exception('response is missing required error field')
    if 'result' not in response:
        raise Exception('response is missing required result field')
    if response['error'] is not None:
        raise Exception(response['error'])
    return response['result']

In [16]:
# https://stackoverflow.com/questions/33338713/filtering-out-all-non-kanji-characters-in-a-text-with-python-3
# filter out others...

def is_not_hiragana_or_katakana(char):
    """
    Check if a character is not hiragana or katakana
    *from ChatGPT

    # Example usage
    characters = "あいうえおアイウエオ漢字"
    filtered_characters = [char for char in characters if is_not_hiragana_or_katakana(char)]

    print("Original Characters:", characters)
    print("Filtered Characters:", "".join(filtered_characters))
    """
    import re
    # Define regular expressions for Hiragana and Katakana
    hiragana_pattern = re.compile(r'[\u3041-\u3096ー]')  # Hiragana
    katakana_pattern = re.compile(r'[\u30A0-\u30FFー]')  # Katakana

    # Check if the character is not a Hiragana or Katakana character
    return not (hiragana_pattern.match(char) or katakana_pattern.match(char))

In [17]:
def get_kanji(word_list):
    # from gpt
    with open(word_list, encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f.readlines()]

    # set
    kanji_to_search = set()
    for line in lines:
        for char in line:
            # TODO (ebui): filter out all non kanji characters
            if is_not_hiragana_or_katakana(char):
                kanji_to_search.add(char)

    # Get Entire Deck (findCards -> cardsInfo) 7s
    query = "deck:nhg::1kanji"
    search_result = invoke("findCards", query=query)
    card_info = invoke("cardsInfo", cards=search_result)

    # Create Map
    cards = dict()
    failed = list()
    for card in card_info:
        try:
            kanji = card["fields"]["Kanji"]["value"]
            cards[kanji] = card["interval"]
        except KeyError as ke:
            failed.append(card)


    # Search
    to_learn = set()
    learned = set()
    not_found = set()
    for kanji in kanji_to_search:
        try:
            if cards[kanji] == 0:
                to_learn.add(kanji)
            elif cards[kanji] > 0:
                learned.add(kanji)
        except Exception:
            not_found.add(kanji)

    print("--")
    print("To learn:\t", len(to_learn), to_learn)
    print("Learned:\t", len(learned), learned)
    print("Not Found:\t", len(not_found), not_found)

    return to_learn, learned

In [18]:
def get_words(word_list):
    # from gpt
    with open(word_list, encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f.readlines()]

    # set
    words_to_search = set(lines)

    # Get Entire Deck (findCards -> cardsInfo) 7s
    query = "deck:nhg::3vocab"
    search_result = invoke("findCards", query=query)
    card_info = invoke("cardsInfo", cards=search_result)

    # Create Map
    cards = dict()
    failed = list()

    to_skip = set()
    for card in card_info:
        try:
            vocab = card["fields"]["Word"]["value"]
            if card["deckName"] == SKIP_DECK:
                # to_skip.add(vocab)
                cards[vocab] = -100000
            else:
                cards[vocab] = card["interval"]
        except KeyError as ke:
            failed.append(card)

    # Info
    to_learn = set()
    learned = set()
    not_found = set()
    for word in words_to_search:
        try:
            if cards[word] == 0:
                to_learn.add(word)
            elif cards[word] > 0:
                learned.add(word)
            elif cards[word] == -100000:
                to_skip.add(word)
        except Exception:
            not_found.add(word)

    print("--")
    print("to learn:\t", len(to_learn), to_learn)
    print("learned:\t", len(learned), learned)
    print("to add:\t", len(not_found), not_found)
    print("to skip:\t", len(to_skip), to_skip)
    print("failed:\t",  len(failed), failed)
    # return to_learn.union(not_found), learned, to_skip, failed
    return to_learn.union(not_found), learned, to_skip

In [19]:
def analyze_list(word_list):
    kanji_unlearned, kanji_learned = get_kanji(word_list)
    words_unlearned, words_learned, to_skip = get_words(word_list)

    # word true/false, kanji true/false
    wf_kt = set() # Easy Words
    wf_kf = set() # Low Prio Words
    wt_kf = set() # High Prio Kanji
    wt_kt = set() # Known


    for word in words_unlearned:
        known = True
        for char in word:
            if char in kanji_unlearned:
                wf_kf.add(word)
                known = False
                break
        if known:
            wf_kt.add(word)
    for word in words_learned:
        known = True
        for char in word:
            if char in kanji_unlearned:
                wt_kf.add(word)
                known = False
                break
        if known:
            wt_kt.add(word)

    # Warning: ignores words with kanji not found, doesn't separate words not found
    print("--")
    print("wf_kt,\tEasy Words:", len(wf_kt), sorted(wf_kt, key=len))
    print("wf_kf,\tLow Prio Word:", len(wf_kf), wf_kf)
    print("wt_kf,\tHigh Prio Kanji:", len(wt_kf), wt_kf)
    print("wt_kt,\tKnown Words:", len(wt_kt), sorted(wt_kt, key=len))
    print("UNNEEDED (SKIP): \t", len(to_skip), sorted(to_skip, key=len))
    print()
    # easy, low, kanji, known
    return  sorted(wf_kt, key=len), sorted(wf_kf, key=len)

In [21]:
SKIP_DECK = "nhg::3vocab::unneeded"
FILE_NAME = "full-book-notes-results/parsed.txt"
easy, low = analyze_list(FILE_NAME)

# THESE ANALYSIS RESULTS NEED TO BE LOGGED in addition to the printing

--
To learn:	 27 {'遥', '焉', '訝', '漉', '絨', '楕', '畏', '燻', '堵', '鱗', '猪', '閑', '漆', '憐', '沃', '冨', '淳', '牽', '翠', '於', '媚', '弥', '芳', '某', '恭', '呉', '瞑'}
Learned:	 1068 {'崩', '型', '補', '皮', '廊', '論', '飾', '徴', '要', '略', '刊', '輝', '遂', '転', '憶', '死', '予', '息', '自', '学', '伐', '鉱', '痴', '伸', '敷', '題', '肉', '規', '南', '認', '必', '乱', '称', '客', '召', '元', '像', '是', '刷', '距', '準', '丸', '週', '雷', '才', '復', '籠', '特', '介', '闘', '掻', '尾', '運', '土', '減', '積', '友', '料', '針', '暗', '表', '恐', '弱', '組', '口', '示', '粗', '旨', '費', '触', '配', '研', '理', '大', '正', '重', '魔', '巣', '兄', '制', '豹', '読', '夜', '火', '実', '塊', '択', '戸', '性', '咲', '練', '負', '部', '根', '華', '頂', '駐', '会', '直', '戦', '賑', '豆', '紹', '黒', '経', '語', '馬', '玉', '偉', '夢', '価', '務', '奪', '何', '青', '吐', '浮', '軽', '悔', '迫', '裕', '剣', '念', '響', '主', '膝', '他', '鼻', '記', '堂', '徹', '黄', '了', '員', '文', '去', '章', '財', '傾', '前', '刃', '飲', '羽', '審', '竹', '績', '喧', '番', '光', '沈', '晴', '隣', '柄', '精', '枚', '限', '件', '羊', '立', '草', '画', '駄', '熟', '足', '彼', '異', '

# (batch add)
- anki batch add (+ more custom tags)
- save results to a text file (-2,-1,0,1,2)


Is JISHO is slow?:
- 142 -> 37min, 156 -> 32min, 128 -> 24min, 102 -> 23min.
- -> in total for 528 words -> avg 13.32 seconds per addition
- Need another card creation api, maybe look into how yomitan does it?
- bottleneck may just be Anki though.
- and our way of Anki SCHEDULING, not adding cards
- maybe somehow we can parallelize scheduling?

In [31]:
"""
def create_furigana(word, reading):
    return "{}[{}]".format(word, reading)


def create_meaning(meanings):
    meaning_field = []
    for definition in meanings:
        defs = []
        for word in definition.english_definitions:
            defs.append(word)
            # print(word)
        one_def = ", ".join(defs)
        meaning_field.append(one_def)
    return f" (*) ".join(meaning_field)
"""
# no jisho

'\ndef create_furigana(word, reading):\n    return "{}[{}]".format(word, reading)\n\n\ndef create_meaning(meanings):\n    meaning_field = []\n    for definition in meanings:\n        defs = []\n        for word in definition.english_definitions:\n            defs.append(word)\n            # print(word)\n        one_def = ", ".join(defs)\n        meaning_field.append(one_def)\n    return f" (*) ".join(meaning_field)\n'

## setup dictionary

In [32]:
JITENDEX_PATH = "/nav/jitendex-yomitan/"

def load_entire_dict():
    results = []
    cached_ranges = defaultdict(list)
    for filename in os.listdir(JITENDEX_PATH):
        if filename.endswith('.json') and "term_bank" in filename:
            filepath = os.path.join(JITENDEX_PATH, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                data = json.load(file)
                start_index = len(results)

                results.extend(data)

                end_index = len(results)
                cached_ranges[filename] = [start_index, end_index]
    return results, cached_ranges

data, cached_ranges = load_entire_dict()

In [33]:
def word_comparator(word_data1, word_data2):
    word1 = word_data1[1]
    word2 = word_data2[1]
    # Normalize katakana to hiragana
    def normalize(word):
        return ''.join(
            chr(ord(char) - 0x60) if 'ァ' <= char <= 'ン' else char
            for char in word
        )
    word1 = normalize(word1)
    word2 = normalize(word2)
    return (word1 > word2) - (word1 < word2)

In [34]:
%%time
sorted_dataset = sorted(data, key=functools.cmp_to_key(word_comparator))
len(sorted_dataset)

CPU times: total: 12.4 s
Wall time: 13.3 s


299921

In [35]:
# V2, handles data content glossary 
def extract_definitions(json_obj):
    """
    Extract definitions from a JSON structure where 'data -> content -> glossary' acts as the marker.

    Args:
        json_obj: The JSON object to search (can be a dict or list).

    Returns:
        A list of definitions found under the specified path.
    """
    definitions = []

    def recursive_search(obj):
        if isinstance(obj, dict):
            # Check if this dict contains the specific marker
            if (
                "data" in obj
                and isinstance(obj["data"], dict)
                and obj["data"].get("content") == "glossary"
                and "content" in obj
            ):
                # Extract the 'content' under this structure
                glossary_content = obj["content"]
                if isinstance(glossary_content, list):
                    # Collect all the 'content' fields from the glossary list
                    for item in glossary_content:
                        if isinstance(item, dict) and "content" in item:
                            definitions.append(item["content"])
                elif isinstance(glossary_content, dict):
                    # Single dictionary case (e.g., "welcome!")
                    if "content" in glossary_content:
                        definitions.append(glossary_content["content"])
            else:
                # Continue searching deeper in the dict
                for key, value in obj.items():
                    recursive_search(value)
        elif isinstance(obj, list):
            # Search each item in the list
            for item in obj:
                recursive_search(item)

    # Start the recursive search
    recursive_search(json_obj)
    return definitions

# redirect one.
def extract_redirect(json_obj):
    """
    # path for redirect -> 5, content, content (base), ⟶
    # field: (base), 1, content
    """
    # print(len(json_obj[5][0]['content']))
    # print(json_obj[5][0]['content'][0])
    # print(json_obj[5][0]['content'][1])
    if (isinstance(json_obj, list) and 
        isinstance(json_obj[5], list) and 
        isinstance(json_obj[5][0], dict) and 
        isinstance(json_obj[5][0]['content'], dict)):
        base = json_obj[5][0]['content']['content']
    else:
        # print("not based")
        return []
    if base[0] == "⟶":
        field_base = base[1]['content']
        if isinstance(field_base, list):
            return [field_base[0]]
        elif isinstance(field_base, str):
            return [field_base]
        elif isinstance(field_base, dict):
            return field_base['content'][0]
        else: 
            return []
    else: 
        
        return []
    
def extract_explanation(json_obj):
    """
    Only 39 left out of 300k. Not a priority.
    """
    pass

In [36]:
cached_index = defaultdict(list)

dup = 0
for i, word in enumerate(sorted_dataset):
    cached_index[word[0]].append(i)

print(sum(len(v) for k, v in cached_index.items()) == len(data))

CACHE_FILE = "my_cached_index.json"
with open(os.path.join(JITENDEX_PATH, CACHE_FILE), "w", encoding="utf-8") as json_file:
    json.dump(cached_index, json_file, indent=4)

# Read from file
with open(os.path.join(JITENDEX_PATH, CACHE_FILE), "r", encoding="utf-8") as json_file:
    cache = json.load(json_file)

print(sum(len(v) for k, v in cache.items()) == len(data))

True
True


generate cards

In [37]:
def generate_fields_with_jitendex(word, ds):
    """
    Get fields
    """
    readings = []
    
    if word not in cache:
        # Return none for now...
        # Could use binary search but there are problems
        # 1) have to check if the word is kana only
        # 2) words like そば could either be 傍　蕎麦　側
        return None
    else:
        for index in cache[word]:
            word_info = ds[index]
            yomigana = word_info[1]
            assert word == word_info[0]
            defs = extract_definitions(word_info)
            if defs is None:
                defs = extract_redirect(word_info)
            if defs is None:
                defs = extract_explanation(word_info)
            readings.append(f"{word}[{yomigana}]")
        
        furigana = " / ".join(readings)
        meaning = ", ".join(defs)

    return word, furigana, meaning

## anki connect modifications


C:\Users\Evan\AppData\Roaming\Anki2\addons21\2055492159\__init__.py

In [156]:
try:
    # ===================================================== (EBUI) THIS PART IS EDITED BY ME
    @util.api()
    def setSpecificValueOfCards(self, cards, keys, newValues, warning_check=True):
        """"""
        if not isinstance(cards, list) or not all(isinstance(card,int) for card in cards):
            print("card must be a list of ints")
            return False
        
        if not isinstance(keys, list) or not isinstance(newValues, list):
            print("keys and newValues have to be lists.")
            return False

        if len(newValues) != len(keys):
            print("Invalid list lengths.")
            return False

        for key in keys:
            if key in ["did", "id", "ivl", "lapses", "left", "mod", "nid",
                    "odid", "odue", "ord", "queue", "reps", "type", "usn"]:
                if warning_check is False:
                    return False

        result = []
        try:
            ankiCards = [self.getCard(card) for card in cards]
            for ankiCard in ankiCards:
                for i, key in enumerate(keys):
                    setattr(ankiCard, key, newValues[i])

            self.collection().update_cards(ankiCards, skip_undo_entry=True)
            result.append(True)
        except Exception as e:
            result.append([False, str(e)])
        return result

    # ===================================================== (EBUI) THIS PART IS EDITED BY ME
    # ===================================================== (EBUI) THIS PART IS EDITED BY ME
    def createNotesSkipChecks(self, notes, skip_checks) -> list[AddNoteRequest]:
        if len(notes) == 0: return []
        
        collection = self.collection()

        model = collection.models.by_name(notes[0]['modelName'])
        if model is None:
            raise Exception('model was not found: {}'.format(notes[0]['modelName']))

        deck = collection.decks.by_name(notes[0]['deckName'])
        if deck is None:
            raise Exception('deck was not found: {}'.format(notes[0]['deckName']))

        requests = []
        for note in notes:
            ankiNote = anki.notes.Note(collection, model)
            ankiNote.note_type()['did'] = deck['id']
            if 'tags' in note:
                ankiNote.tags = note['tags']

            for name, value in note['fields'].items():
                for ankiName in ankiNote.keys():
                    if name.lower() == ankiName.lower():
                        ankiNote[ankiName] = value
                        break

            self.addMediaFromNote(ankiNote, note)


            # CHECKS, 
            if skip_checks:
                requests.append(AddNoteRequest(ankiNote, deck['id']))
            else:
                allowDuplicate = False
                duplicateScope = None
                duplicateScopeDeckName = None
                duplicateScopeCheckChildren = False
                duplicateScopeCheckAllModels = False

                if 'options' in note:
                    options = note['options']
                    if 'allowDuplicate' in options:
                        allowDuplicate = options['allowDuplicate']
                        if type(allowDuplicate) is not bool:
                            continue
                    if 'duplicateScope' in options:
                        duplicateScope = options['duplicateScope']
                    if 'duplicateScopeOptions' in options:
                        duplicateScopeOptions = options['duplicateScopeOptions']
                        if 'deckName' in duplicateScopeOptions:
                            duplicateScopeDeckName = duplicateScopeOptions['deckName']
                        if 'checkChildren' in duplicateScopeOptions:
                            duplicateScopeCheckChildren = duplicateScopeOptions['checkChildren']
                            if type(duplicateScopeCheckChildren) is not bool:
                                continue
                        if 'checkAllModels' in duplicateScopeOptions:
                            duplicateScopeCheckAllModels = duplicateScopeOptions['checkAllModels']
                            if type(duplicateScopeCheckAllModels) is not bool:
                                continue

                duplicateOrEmpty = self.isNoteDuplicateOrEmptyInScope(
                    ankiNote,
                    deck,
                    collection,
                    duplicateScope,
                    duplicateScopeDeckName,
                    duplicateScopeCheckChildren,
                    duplicateScopeCheckAllModels
                )

                if duplicateOrEmpty == 1:
                    continue
                elif duplicateOrEmpty == 2:
                    if allowDuplicate:
                        requests.append(AddNoteRequest(ankiNote, deck['id']))
                    continue
                elif duplicateOrEmpty == 0:
                    requests.append(AddNoteRequest(ankiNote, deck['id']))
                else:
                    continue

        return requests

    @util.api()
    def addNotesSkipChecks(self, notes, skip_checks):
        """
        https://github.com/ankitects/anki/blob/9c3f89466d38d72daf8f671d6f3c8b18351bc642/pylib/anki/collection.py#L538
        """
        collection = self.collection()
        requests = self.createNotesSkipChecks(notes, skip_checks)
        self.collection().add_notes(requests)
        return len(requests)
    # ===================================================== (EBUI) THIS PART IS EDITED BY ME
except Exception:
    pass

## critical code

In [144]:
# DECK_NAME = "-exp::anki-connect-test"
DECK_NAME = "nhg::3vocab::auto-unsorted"
NOTE_MODEL_NAME = "2-Vocab"

def create_note(word, deck_name=DECK_NAME, generation_method="jitendex", tags=[], allow_same=True, force_generate=False):
    """
    Generalize creating a note

    Args:
        deck_name: name of deck to generate to
        generateion_method: jitendex or jisho
        tags: extra tags to tag the note with
        allow_same: is for same words with different kanji
        force_generate: generate cards that are not found
        
    Returns:
        return code
        -2 - Error
        -1 - Not Found
        0 - already in deck
        else: card_id integer
    """
    note = dict()
    note["deckName"] = deck_name
    note["modelName"] = NOTE_MODEL_NAME
    note["tags"] = ["autocreated"]

    """
    Fields to fill out: 
    Word
    Furigana
    Meaning
    """


    print("Creating card for:", word)
    fields = {}

    if generation_method == "jisho":
        raise NotImplementedError
    elif generation_method == "jitendex":
        word_info = generate_fields_with_jitendex(word, sorted_dataset)
    else:
        print("Generation method is not implemented")
        raise NotImplementedError

    # Fill in fields
    if word_info:
        fields["Word"] = word_info[0]
        fields["Furigana"] = word_info[1]
        fields["Meaning"] = word_info[2]
    elif word_info is None and force_generate == True:
        fields["Word"] = word
    else:
        return -1

    note["fields"] = fields
    # AnkiConnect request
    try:
        result = invoke("addNote", note=note)
        # ANKI API HAS add_notes 
        
    except Exception as e:
        # maybe we should utilize this and not get the card ID before hand 
        if str(e) == "cannot create note because it is a duplicate":
            print("Cannot create note because it is duplicate")
            return 0
        else:
            raise

    return result


In [152]:
def create_notes(words, deck_name=DECK_NAME, generation_method="jitendex", tags=[], allow_same=True, force_generate=False, skip_checks=False):
    """
    dupe checking doesn't actually take long so don't ever skip checks lol
    WARNING: CREATES DUPES if skip_checks == True

    Generalize creating a note

    Args:
        deck_name: name of deck to generate to
        generateion_method: jitendex or jisho
        tags: extra tags to tag the note with
        allow_same: is for same words with different kanji
        force_generate: generate cards that are not found
        
    Returns:
        return code
        -2 - Error
        -1 - Not Found
        0 - already in deck
        else: card_id integer
    """
    notes = []
    not_found = []

    for word in words:
        note = dict()
        note["deckName"] = deck_name
        note["modelName"] = NOTE_MODEL_NAME
        note["tags"] = ["autocreated"]

        """
        Fields to fill out: 
        Word
        Furigana
        Meaning
        """


        print("Creating card for:", word)
        fields = {}

        if generation_method == "jisho":
            raise NotImplementedError
        elif generation_method == "jitendex":
            word_info = generate_fields_with_jitendex(word, sorted_dataset)
        else:
            print("Generation method is not implemented")
            raise NotImplementedError

        # Fill in fields
        if word_info:
            fields["Word"] = word_info[0]
            fields["Furigana"] = word_info[1]
            fields["Meaning"] = word_info[2]
        elif word_info is None and force_generate == True:
            fields["Word"] = word
        else:
            not_found.append(word)
            continue

        note["fields"] = fields
        notes.append(note)

    # AnkiConnect request
    try:
        result = invoke("addNotesSkipChecks", notes=notes, skip_checks=skip_checks)
        # ANKI API HAS add_notes 
    except Exception as e:
        raise
        # maybe we should utilize this and not get the card ID before hand 
        if str(e) == "cannot create note because it is a duplicate":
            print("Cannot create note because it is duplicate")
            return 0
        else:
            raise

    return result, not_found


In [172]:
query = "deck:-exp::anki-connect-test Word:じ"
# search_result = invoke("findCards", query=query)
def get_word_card_ids(query):
    search_result = invoke("findCards", query=query)
    return search_result

def get_word_card_id(word, query = None):
    # print("Finding card id for:", word)
    # maybe we can improve the query?
    # https://github.com/ankidroid/Anki-Android/wiki/Database-Structure
    # https://docs.ankiweb.net/searching.html
    # query = f"deck:nhg::3vocab Word:{word}"
    if not query:
        query = f"deck:nhg::3vocab prop:ivl=0 Word:{word}"
    search_result = invoke("findCards", query=query)
    # Should be no dupes...
    # assert len(search_result) == 1
    if len(search_result) == 0:
        # print("Skipping")
        return None
    return search_result[0]

In [148]:
VOCAB_DECK_CREATION_DATE = date(2018, 5, 10)

def set_card_learned(card_id, ivl): # this is slightly inefficient if we are doing 1by1
    """
    https://git.foosoft.net/alex/anki-connect/src/commit/98e0bb35fb77fa3983347035f123065feb4c56e6/plugin/__init__.py
    Anki API:  
        self.collection().update_card(ankiCard, skip_undo_entry=True)

    https://github.com/ankitects/anki/blob/main/rslib/src/storage/card/update_card.sql
        update_card

        https://github.com/ankitects/anki/blob/main/rslib/src/card/mod.rs   
    """
    keys = ["ivl", "type", "queue", "due"]

    delta = abs(date.today() - VOCAB_DECK_CREATION_DATE)
    due = ivl + delta.days
    due_date = VOCAB_DECK_CREATION_DATE + timedelta(days=due)
    newValues = [ivl, 2, 2, due]
    # ivl   -> how long between last and current review
    #       -> changes to time from current review to future review
    # type  -> ???? 2 is the correct one tho
    # queue -> ???? 2 is the correct one tho
    # due   -> days FROM deck creation to be due on (0 is deck creation day)
    
    

    result = invoke("setSpecificValueOfCard", card=card_id, keys=keys, newValues=newValues, warning_check=True)
    if result and result[0]: 
        return due_date
    return None

In [150]:
def set_cards_learned(card_ids: list[int], ivl: int) -> list[object]:
    """
    After years of training.
    """
    keys = ["ivl", "type", "queue", "due"]
    delta = abs(date.today() - VOCAB_DECK_CREATION_DATE)
    due = ivl + delta.days
    due_date = VOCAB_DECK_CREATION_DATE + timedelta(days=due)
    newValues = [ivl, 2, 2, due]

    result = invoke("setSpecificValueOfCards", cards=card_ids, keys=keys, newValues=newValues, warning_check=True)
    if result and result[0]: 
        return due_date
    return None

In [151]:
SKIP_DECK = "nhg::3vocab::unneeded"

def auto_add(word):
    """
    Create and check if can be scheduled

    Args:
        word: word
        ivl: interval to set for known word
        force_schedule: If true, sets due date to interval even if it's already learned
        skip_schedule: Do not schedule, only create the card
        wait_for_type: wait for card to be fully created or skip it

    Returns:
        return code
        -1 or 
        or card_id
    """
    print(f"--START-- auto_add -- {word}")
    if not word:
        return -1
    
    card_id = get_word_card_id(word)
    if not card_id:
        card_id = create_note(word)
    if card_id in {-2, -1}:
        return -1
        
    print(card_id)
    return card_id


def schedule(cards, ivl, force_schedule=False, skip_schedule=False):
    print(f"===== Start at: {datetime.now()}")
    print(len(cards))
    # check_card_unlearned
    cards_to_set = []
    key_error = []
    skipped = []
    if not force_schedule:
        cards_info = invoke("cardsInfo", cards=cards)
        print(len(cards_info))
        print(f"===== Finished getting cards: {datetime.now()}")
        for card_info in cards_info:
            try:
                if card_info["deckName"] == SKIP_DECK:
                    # print("Skipping scheduling. Not needed")
                    skipped.append(card_info)
                    continue
                if card_info["type"] or card_info["queue"]:
                    # print("Skipping scheduling. Not a new card")
                    # print("Card Interval:", card_info["interval"])
                    skipped.append(card_info) # this has to be the card id
                    continue
                cards_to_set.append(card_info["cardId"])
            except KeyError as e:
                # This just doesn't work on retry. Maybe due to some lock?
                # print(f"Key 'type' doesn't exist yet. Skipping. ")
                """
                THIS IS WHY IT DOESN'T WORK, race against the +1
                # @deprecated(replaced_by=add_note)
                def addNote(self, note: Note) -> int:
                    self.add_note(note, note.note_type()["did"])
                    return len(note.cards())
                """
                key_error.append(card_info["cardId"])
                continue
    print(f"Cards to schedule: {len(cards_to_set)}")
    print(f"Skipped due to key: {len(key_error)}")
    print(f"Skipped else: {len(skipped)}")
    print(f"===== Now scheduling cards: {datetime.now()}")
    results = set_cards_learned(cards_to_set, ivl)
    print(f"===== Finished scheduling cards: {datetime.now()}")
    return results, key_error, skipped

## test new auto add -> then batch schedule

auto add one by one is terribly inefficient:
- get / findCards - not taking advantage of the duplicate checker
- insert / addNote - one by one means N commits


-> 32m 51s -> 1971s / 580w -> 3.4s (348 / 580 newly added -> 60%)

-> 580w (all already added) -> 525 in end result (55 not found) -> 2.08s

print progress instead of a comprehension

weird edge case -> incorrect because there was a newline in the field:
```
--START-- auto_add -- 要求
Finding card id for: 要求
Creating card for: 要求
Cannot create note because it is duplicate
0
```


after optimization: 580 -> (280 added) less than 3s

In [159]:
%%time
words = easy
print(len(words))
# card_ids = list(filter(lambda cid: cid > 0, [auto_add(word) for word in words]))
# should not rely on create_note to return the cid.
results, not_found = create_notes(words, skip_checks=False)
print(results, len(not_found))

580
Creating card for: 予
Creating card for: 伸
Creating card for: 称
Creating card for: 刷
Creating card for: 是
Creating card for: =
Creating card for: 読
Creating card for: €
Creating card for: 負
Creating card for: 語
Creating card for: 徹
Creating card for: ◆
Creating card for: 立
Creating card for: 異
Creating card for: ☑
Creating card for: 助
Creating card for: 回
Creating card for: 当
Creating card for: 后
Creating card for: 観
Creating card for: 臨
Creating card for: +
Creating card for: ･
Creating card for: ◇
Creating card for: 吹
Creating card for: 装
Creating card for: 〜
Creating card for: ©
Creating card for: 再
Creating card for: 問
Creating card for: 嬢
Creating card for: ×
Creating card for: 片
Creating card for: 素
Creating card for: 近
Creating card for: 室
Creating card for: 書
Creating card for: 館
Creating card for: 屋
Creating card for: 座
Creating card for: 聖
Creating card for: 栄
Creating card for: 童
Creating card for: ·
Creating card for: 礫
Creating card for: 誉
Creating card for: 秀
Creating 

get card ids:

2.1s per search ==> <3s for 500 checks

In [187]:
%%time
subset = words
# "deck:nhg::3vocab Word:木 OR Word:気"
# -> this is a valid one
wordstrs = ["".join(f"Word:{word}") for word in subset]
query = "deck:nhg::3vocab " + " OR ".join(wordstrs)
print(len(query), query)
card_ids = get_word_card_ids(query)
# card_ids = [get_word_card_id(word) for word in subset]
card_ids
# 10 words -> ~21s
# ~2.1s per search -> 500 cards would be ~18min
# collection().find_cards
# new method: -> 

6656 deck:nhg::3vocab Word:予 OR Word:伸 OR Word:称 OR Word:刷 OR Word:是 OR Word:= OR Word:読 OR Word:€ OR Word:負 OR Word:語 OR Word:徹 OR Word:◆ OR Word:立 OR Word:異 OR Word:☑ OR Word:助 OR Word:回 OR Word:当 OR Word:后 OR Word:観 OR Word:臨 OR Word:+ OR Word:･ OR Word:◇ OR Word:吹 OR Word:装 OR Word:〜 OR Word:© OR Word:再 OR Word:問 OR Word:嬢 OR Word:× OR Word:片 OR Word:素 OR Word:近 OR Word:室 OR Word:書 OR Word:館 OR Word:屋 OR Word:座 OR Word:聖 OR Word:栄 OR Word:童 OR Word:· OR Word:礫 OR Word:誉 OR Word:秀 OR Word:第 OR Word:腐 OR Word:超 OR Word:識 OR Word:% OR Word:師 OR Word:判 OR Word:官 OR Word:礼 OR Word:感 OR Word:― OR Word:優 OR Word:我 OR Word:丁 OR Word:延 OR Word:詬 OR Word:最 OR Word:頃 OR Word:Ő OR Word:譲 OR Word:貸 OR Word:隷 OR Word:職 OR Word:୪ OR Word:関 OR Word:肢 OR Word:定 OR Word:☐ OR Word:← OR Word:ㄖ OR Word:峰 OR Word:未 OR Word:少 OR Word:起 OR Word:代 OR Word:傍 OR Word:陣 OR Word:側 OR Word:全 OR Word:化 OR Word:並 OR Word:同 OR Word:✓ OR Word:業火 OR Word:悪鬼 OR Word:金貨 OR Word:転機 OR Word:素質 OR Word:ラ王 OR Word:要求 OR W

[1452224257092,
 1452224257602,
 1452224257966,
 1452224258132,
 1452224258610,
 1452224258672,
 1452224258696,
 1452224258702,
 1452224259022,
 1452224259860,
 1452224260154,
 1452224261436,
 1452224261510,
 1452224261630,
 1452224261946,
 1452224261994,
 1452224262000,
 1452224262004,
 1452224262142,
 1452224262398,
 1452224262544,
 1452224262884,
 1452224263308,
 1452224263590,
 1452224263746,
 1452224263800,
 1452224263840,
 1452224263938,
 1452224263950,
 1452224264788,
 1452224265220,
 1452224265550,
 1452224265634,
 1452224265868,
 1452224266160,
 1452224266258,
 1452224266408,
 1452224266420,
 1452224266632,
 1452224266782,
 1452224266874,
 1452224266887,
 1452224266999,
 1452224267050,
 1452224267070,
 1452224267122,
 1452224267131,
 1452224267132,
 1452224267155,
 1452224267189,
 1452224267244,
 1452224267385,
 1452224267460,
 1452224267633,
 1452224267661,
 1452224267773,
 1452224267950,
 1452224267977,
 1452224268040,
 1452224268072,
 1452224268235,
 1452224268695,
 1452224

schedule() -> literally only a few seconds

- 525 words (newly scheduled) -> less than 5s
- 525 words (skipped) -> less than 5s

In [119]:
%%time
len(card_ids)
# results, key_error, skipped = schedule(card_ids, 1)
results, len(key_error), len(skipped)

CPU times: total: 0 ns
Wall time: 0 ns


(datetime.date(2024, 12, 12), 0, 525)

- 580 words, 349 notes created, 55 not found, scheduled 525
- CPU times: total: 312 ms
- Wall time: 9.74 s

rerun for idempotency:
- CPU times: total: 109 ms
- Wall time: 9.34 s

In [189]:
%%time
#matomeru:

words = easy
print(len(words))
results, not_found = create_notes(words, skip_checks=False)
print("Create notes results: ", results, len(not_found))

subset = words
wordstrs = ["".join(f"Word:{word}") for word in subset]
query = "deck:nhg::3vocab " + " OR ".join(wordstrs)
print(len(query), query)
card_ids = get_word_card_ids(query)
print("Cards found: ", len(card_ids))

results, key_error, skipped = schedule(card_ids, 1)
results, len(key_error), len(skipped)

580
Creating card for: 予
Creating card for: 伸
Creating card for: 称
Creating card for: 刷
Creating card for: 是
Creating card for: =
Creating card for: 読
Creating card for: €
Creating card for: 負
Creating card for: 語
Creating card for: 徹
Creating card for: ◆
Creating card for: 立
Creating card for: 異
Creating card for: ☑
Creating card for: 助
Creating card for: 回
Creating card for: 当
Creating card for: 后
Creating card for: 観
Creating card for: 臨
Creating card for: +
Creating card for: ･
Creating card for: ◇
Creating card for: 吹
Creating card for: 装
Creating card for: 〜
Creating card for: ©
Creating card for: 再
Creating card for: 問
Creating card for: 嬢
Creating card for: ×
Creating card for: 片
Creating card for: 素
Creating card for: 近
Creating card for: 室
Creating card for: 書
Creating card for: 館
Creating card for: 屋
Creating card for: 座
Creating card for: 聖
Creating card for: 栄
Creating card for: 童
Creating card for: ·
Creating card for: 礫
Creating card for: 誉
Creating card for: 秀
Creating 

(datetime.date(2024, 12, 12), 0, 526)

In [128]:
SKIP_DECK = "nhg::3vocab::unneeded"

def auto_add_and_set(word, ivl, force_schedule=False, skip_schedule=False, wait_for_type=True):
    """
    Create and set schedule of a card.

    Args:
        word: word
        ivl: interval to set for known word
        force_schedule: If true, sets due date to interval even if it's already learned
        skip_schedule: Do not schedule, only create the card
        wait_for_type: wait for card to be fully created or skip it

    Returns:
        return code
        -3 key type doesn't exist yet
        -2 "Error"
        -1 "Not Found", 
        0 date given (success)
        1 skipping
        2 skipping scheduling - alr in deck
        3 skipping scheduling - in unneeded
    """
    print("--START-- auto_add_and_set -- ", word)
    if not word:
        return 1
    
    card_id = get_word_card_id(word)
    if not card_id:
        card_id = create_note(word)
    if card_id in {-2}:
        return -2
    elif card_id in {-1}:
        return -1
    elif card_id in {0}:
        return 2
        
    print(card_id)
    if skip_schedule:
        return 1
    # check_card_unlearned
    if not force_schedule:
        while True:
            try: 
                card_info = invoke("cardsInfo", cards=[card_id])
                if card_info[0]["deckName"] == SKIP_DECK:
                    print("Skipping scheduling. Not needed")
                    return 3
                if card_info[0]["type"] or card_info[0]["queue"]:
                    print("Skipping scheduling. Not a new card")
                    print("Card Interval:", card_info[0]["interval"])
                    return 2
            except KeyError as e:
                # This just doesn't work on retry. Maybe due to some lock?
                print(f"Key 'type' doesn't exist yet. Skipping.")
                return 1
            break

    result = set_card_learned(card_id, ivl)
    if result:
        return 0
    else:
        return -2

In [129]:
# -1: <3s
# 0: 6-15s -> maybe avg 8.4 seconds?
# 0: key type reschedule 6-7s
# 1: 6-7s
# 2: ~4.1s
auto_add_and_set("見識", 21, skip_schedule=False)

--START-- auto_add_and_set --  見識
Finding card id for: 見識
1733812573831
Skipping scheduling. Not a new card
Card Interval: 21


2

In [143]:
def batch_add(words, ivl=-1, skip_schedule=True, force_schedule=True):
    words_result_map = dict()
    for i in range(len(words)):
        print(i)
        word = words[i]
        try:
            result = auto_add_and_set(
                word, 
                ivl, 
                skip_schedule=skip_schedule,
                force_schedule=force_schedule,
            )
            words_result_map[word] = result
        except Exception as e:
            print(e)
            words_result_map[word] = -1
        finally:
            print(f"{word}: {words_result_map[word]}")
            print()
    return words_result_map

In [131]:
FILE_NAME = "parsed/metaphor.txt"
easy, low = analyze_list(FILE_NAME)

--
To learn:	 22 {'焚', '凱', '骸', '播', '叙', '醐', '譚', '斡', '猶', '冑', '鞍', '芳', '醇', '牽', '拙', '傑', '魁', '駿', '猊', '嗅', '燻', '醍'}
Learned:	 198 {'袋', '造', '人', '二', '暮', '隊', '率', '信', '口', '細', '声', '製', '気', '為', '店', '辻', '野', '国', '差', '咎', '賞', '約', '息', '郷', '濡', '露', '欺', '碑', '英', '生', '婦', '管', '服', '衣', '馬', '廷', '乱', '末', '石', '闘', '敬', '覇', '聞', '境', '標', '明', '血', '替', '砂', '伝', '抵', '件', '切', '的', '底', '民', '鳥', '荒', '目', '干', '働', '商', '並', '祈', '怪', '地', '練', '具', '兵', '産', '状', '理', '威', '介', '撃', '賢', '勲', '香', '体', '火', '根', '立', '上', '首', '制', '先', '叫', '愚', '迫', '配', '導', '小', '光', '名', '突', '子', '霊', '破', '勢', '憎', '鼓', '防', '条', '弄', '改', '輩', '硝', '亡', '神', '砲', '対', '統', '仲', '度', '夕', '領', '師', '湖', '塊', '装', '作', '肉', '者', '売', '辺', '皆', '手', '締', '臭', '安', '想', '静', '格', '命', '逃', '位', '宮', '評', '返', '心', '旋', '八', '寝', '蓄', '引', '品', '出', '面', '架', '償', '峰', '下', '矢', '肝', '雄', '銘', '価', '草', '百', '金', '路', '物', '辛', '覚', '害', '書', '外', '公', '長', '番', '板', '城

In [145]:
%%time
%%capture cap

ivl=7
results = batch_add(
    easy+low, 
    ivl=ivl, 
    skip_schedule=False, 
    force_schedule=False,
)



# -> save this to a file 
results
len(results)
# results_sorted = {y: x for x, y in results.items()}
inv_map = {}
for k, v in results.items():
    inv_map[v] = inv_map.get(v, []) + [k]

total = 0
for k, values in inv_map.items():
    total += len(values)
print(total)

RESULTS_PATH = "parsed"
RESULTS_FILE = "results.json"
with open(os.path.join(RESULTS_PATH, RESULTS_FILE), "w", encoding="utf-8") as json_file:
    json.dump(inv_map, json_file, indent=4, ensure_ascii=False)

CPU times: total: 422 ms
Wall time: 12min 20s


128 words with skip schedule (with 76 created):
- CPU times: total: 344 ms
- Wall time: 7min 13s
-> avg 3.38s per card

on same 128 cards -> schedule:
- CPU times: total: 422 ms
- Wall time: 12min 20s
-> avg 5.78s per card


DO NOT CAPTURE BUT LOG PROGRESS INSTEAD OF EACH INDIVIDUAL ONE
(by percentage)


Scheduling is the bottleneck. Would it be fine to have concurrent adds / edits to different cards?
    https://github.com/FooSoft/anki-connect/issues/2

# final step:
debug / confirm:
- words that didn't get added
- weird words that were added
- overlappping words effort (not duplicates, but extremely similar to another word)

optimize:
- card creation api (jisho)
- card creation itself
- scheduling: would it be faster to separate scheduling and card creation? for parallelizing
    - would it be ok to parallelize on the db for separate entries?

# finishing touches:
- create a repo for this -> on unit_test? should unit_test be our public github?
- FREE version will be Tesseract OCR only (paid is GCP + Tesseract)
- convert this ipynb to a script, with all the sub tools
- clean up dirs and make things usable (from perspective of a clueless user)
- super over engineered version (prioritized):
    - Create a setup guide
    - code review everything first
    - publish this to github - put a disclaimer saying it's very general
        - disable issues, comments, etc
    - QT or pillow screenshot area selector
    - polling version of the pipeline (screenshot by screenshot)
        - use separate clipboard than OS clipboard
    - parallelize all parts of the pipeline
        screenshots, 
    - web frontend (offline)
    - refactor for unit and integration testing

    - generalize all parts of the pipeline for different types of users
        - ocr engine
        - dictionary
        - anki decks
        - anki note type
    - make a frontend / backend and host parts of this
        - make a hub personal website to host ALL my personal projects
        - load balancer to keep people out
        - login with google only
    - make a desktop version
