In [1]:
import os
import pandas as pd
import cv2
import lxml.html
import numpy as np
from pprint import pprint

In [2]:
import pickle

class Dictionary:
    """Prefix map of words, stored as a flat ``dict``.

    Every proper prefix of an added word (including the empty string) is a
    key. The value is ``True`` when the key itself is a complete word and
    ``False`` when it is only a prefix ("branch") of longer words.
    """

    # Characters tried when looking for child nodes of a word in remove_word.
    _CHILD_ALPHABET = "abcdefghijklmnopqrstuvwxyz"

    def __init__(self, mapping=None):
        """Wrap an existing prefix map, or start with an empty one.

        Args:
            mapping: optional dict of prefix -> bool. It is stored by
                reference (not copied). When omitted, a new empty dict is
                created for this instance.
        """
        # A `{}` default would be created once and shared by every instance
        # built without arguments, so words would leak between dictionaries.
        self.map = {} if mapping is None else mapping

    def add_word(self, word):
        """Register ``word`` and make sure all of its proper prefixes exist."""
        for i in range(len(word)):
            branch = word[:i]
            # setdefault keeps an existing True, so a prefix that is already
            # a word in its own right is not demoted to a plain branch.
            self.map.setdefault(branch, False)

        self.map[word] = True

    def remove_word(self, word):
        """Remove ``word`` from the dictionary.

        Returns:
            ``True`` if ``word`` was a complete word and has been removed;
            ``False`` if it was absent or only a branch (nothing changes).
        """
        # Absent keys and pure branches are both falsy here: nothing to do.
        if not self.map.get(word, False):
            return False

        has_children = any(
            self.is_branch(word + c) for c in self._CHILD_ALPHABET
        )
        if has_children:
            # Longer words still run through this node: keep it as a branch.
            self.map[word] = False
        else:
            self.map.pop(word)
        # NOTE(review): only lowercase a-z continuations are detected, and
        # ancestor prefixes left without any word below them are not pruned.
        return True

    def is_branch(self, word):
        """Return True if ``word`` is a key at all (complete word or prefix)."""
        return word in self.map

    def is_word(self, word):
        """Return True only if ``word`` is stored as a complete word."""
        return self.is_branch(word) and self.map[word] is True
    
class DictionarySerialiser:
    """Saves a Dictionary's prefix map to disk with pickle and restores it."""

    def save(self, dictionary, path):
        """Pickle ``dictionary.map`` to ``path`` without risking the old file.

        The data is first written to ``<path>.tmp`` and only moved over
        ``path`` once pickling has succeeded, so a failure part-way through
        (unpicklable value, full disk, interrupt) leaves any existing file
        at ``path`` intact.

        Args:
            dictionary: object exposing the prefix map as ``.map``.
            path: destination file path (str, bytes or os.PathLike).
        """
        target = os.fspath(path)
        suffix = b".tmp" if isinstance(target, bytes) else ".tmp"
        tmp_path = target + suffix
        try:
            with open(tmp_path, 'wb') as f:
                # Pickle the prefix map using the highest protocol available.
                pickle.dump(dictionary.map, f, pickle.HIGHEST_PROTOCOL)
            # os.replace overwrites an existing destination in one step.
            os.replace(tmp_path, target)
        finally:
            # Only present if the dump or the replace failed: clean it up.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def load(self, path):
        """Read a pickled prefix map from ``path`` and wrap it in a Dictionary.

        SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
        file, so only load pickles that come from a trusted source.
        """
        with open(path, 'rb') as f:
            mapping = pickle.load(f)
        return Dictionary(mapping)

In [12]:
# Numbered sample folder under ../assets/score_prediction to process.
folder = 11
filepath = f"../assets/score_prediction/{folder}"

# NOTE: cv2.imread returns None rather than raising if the file cannot be read.
image = cv2.imread(f"{filepath}/sample.png")

# paths.csv is whitespace-delimited. `sep=r"\s+"` is the supported spelling of
# the deprecated `delim_whitespace=True`. `keep_default_na=False` stops pandas
# from turning words such as "nan", "null" or "NA" into float NaN, which would
# break the string membership tests done on the `word` column later on.
paths_df = pd.read_csv(
    f"{filepath}/paths.csv",
    sep=r"\s+",
    keep_default_na=False,
)

In [13]:
# Map every word listed in paths.csv to its parsed `path` value.
# If a word appears on several rows, the last row wins.
#
# SECURITY NOTE: eval() executes whatever Python expression is stored in the
# `path` column, so paths.csv must come from a trusted source. If the cells
# are plain literals (lists/tuples of numbers), ast.literal_eval would be a
# safe drop-in replacement — TODO confirm the stored format before switching.
selected_paths = {}
for word, path_repr in zip(paths_df['word'], paths_df['path']):
    selected_paths[word] = eval(path_repr)

In [14]:
# Read the saved results page and parse it into an lxml element tree.
with open(f"{filepath}/results.html", "r") as f:
    results_html_string = f.read()
results_html = lxml.html.fromstring(results_html_string)

# The first "div.points-result" element is the container; each of its
# children is read as one scored word.
rows = results_html.cssselect("div.points-result")[0]

# Collect (word, points) pairs in page order, then index them by word.
score_list = []
for row in rows:
    word = row.cssselect("div.word")[0].text_content()
    points_cell = row.cssselect("div.points")[0].cssselect("div.left")[0]
    points = int(points_cell.text_content())
    score_list.append((word, points))
scores = dict(score_list)

In [15]:
# Words on the results page that have no entry in selected_paths.
missed_words = [word for word in scores if word not in selected_paths]

# Words in selected_paths that the results page did not score.
invalid_words = [word for word in selected_paths if word not in scores]

print(f"{len(invalid_words)} invalid words")
print(f"{len(missed_words)} missed words")

152 invalid words
20 missed words


In [16]:
# Helper used below to load and save the pickled word dictionary.
serialiser = DictionarySerialiser()

In [17]:
# Location of the pickled prefix map; the same path is written back to later.
dict_filepath = "../assets/dictionaries/dictionary.pickle"
# NOTE: loading goes through pickle.load, so this file must be trusted.
dictionary = serialiser.load(dict_filepath)

In [18]:
# Invalid words that the dictionary currently accepts: candidates for removal.
removed_words = [word for word in invalid_words if dictionary.is_word(word)]

# Missed words that the dictionary does not yet contain: candidates to add.
added_words = [word for word in missed_words if not dictionary.is_word(word)]

# Nothing has been modified yet; this only reports the planned changes.
print(f"Removed {len(removed_words)} words, Added {len(added_words)} words")

Removed 152 words, Added 20 words


In [19]:
# Apply the planned changes to the in-memory dictionary:
# first drop the rejected words, then insert the missing ones.
for rejected in removed_words:
    dictionary.remove_word(rejected)

for missing in added_words:
    dictionary.add_word(missing)

In [20]:
# Persist the updated dictionary, overwriting the file it was loaded from.
serialiser.save(dictionary, dict_filepath)