In [1]:
%matplotlib inline
import os
import pandas as pd
import cv2
import lxml.html
import numpy as np
from pprint import pprint

import matplotlib.pyplot as plt

In [2]:
def extract_selected_paths(paths_df):
    """Build a ``{word: path}`` mapping from a paths table.

    Parameters
    ----------
    paths_df : pandas.DataFrame
        Must provide a ``word`` column and a ``path`` column. Each ``path``
        entry is a string holding a Python literal (presumably a list of
        matrix index tuples -- confirm against the CSV producer).

    Returns
    -------
    dict
        Maps each word to its parsed path. When a word occurs on several
        rows, the last row wins.

    Raises
    ------
    ValueError, SyntaxError
        If a ``path`` entry is not a valid Python literal.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import ast

    selected_paths = {}
    for _, row in paths_df.iterrows():
        # literal_eval parses literals only; unlike eval() it cannot execute
        # arbitrary code that might be embedded in the CSV file.
        selected_paths[row['word']] = ast.literal_eval(row['path'])
    return selected_paths

In [3]:
def extract_target_scores(filepath):
    """Read ``<filepath>/results.html`` and return a ``{word: points}`` dict.

    Only the first ``div.points-result`` element is consulted. Each of its
    children is treated as one result row: the word comes from the row's first
    ``div.word`` and the points from the first ``div.left`` nested inside the
    row's first ``div.points``, converted with ``int``.

    Raises ``IndexError`` when any of the expected elements is missing and
    ``ValueError`` when the points text is not an integer.
    """
    # load and parse the results page
    with open(f"{filepath}/results.html", "r") as html_file:
        document = lxml.html.fromstring(html_file.read())

    # the children of the first matching container are the result rows
    container = document.cssselect("div.points-result")[0]

    scores = {}
    for entry in container:
        word = entry.cssselect("div.word")[0].text_content()
        points_div = entry.cssselect("div.points")[0]
        scores[word] = int(points_div.cssselect("div.left")[0].text_content())

    return scores

In [4]:
class Cell:
    """One matrix cell: a character, its value, and a bonus marker."""

    def __init__(self):
        # Defaults describe an empty cell; callers overwrite these attributes.
        self.char = ""
        self.bonus = ""
        self.value = 0

    def __repr__(self):
        # The repr is intentionally the same text as str().
        return str(self)

    def __str__(self):
        """Render as "<bonus> <char> <value>", separated by single spaces."""
        return f"{self.bonus} {self.char} {self.value}"

In [5]:
def construct_matrix(matrix_df):
    """Build a 4x4 object array of ``Cell`` instances from a matrix table.

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        Must provide ``index``, ``character``, ``value`` and ``bonus``
        columns. ``index`` is a string holding a Python literal that is used
        to index the 4x4 array (presumably a ``(row, col)`` tuple -- confirm
        against the CSV producer).

    Returns
    -------
    numpy.ndarray
        Shape ``(4, 4)``, dtype ``object``. Positions not mentioned in
        ``matrix_df`` keep a default ``Cell``.

    Raises
    ------
    ValueError, SyntaxError
        If an ``index`` entry is not a valid Python literal.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import ast

    cells = np.array([Cell() for _ in range(16)]).reshape((4, 4))

    for _, row in matrix_df.iterrows():
        # literal_eval parses literals only; unlike eval() it cannot execute
        # arbitrary code that might be embedded in the CSV file.
        index = ast.literal_eval(row['index'])

        cell = cells[index]
        cell.char = row['character']
        cell.value = row['value']
        cell.bonus = row['bonus']

    return cells

In [6]:
def extract_metadata(path, matrix):
    """Collect the ``value`` and ``bonus`` of every cell visited by ``path``.

    ``path`` is an iterable of keys accepted by ``matrix[...]``; each lookup
    must yield an object exposing ``value`` and ``bonus`` attributes.

    Returns a ``(values, bonuses)`` pair of lists, both in path order.
    """
    # Look the cells up once, then project the two attributes separately.
    visited = [matrix[position] for position in path]
    values = [cell.value for cell in visited]
    bonuses = [cell.bonus for cell in visited]
    return values, bonuses

In [7]:
def get_training_data(selected_paths, scores, matrix):
    """Assemble ``(word, values, bonuses, target_score)`` tuples.

    Words from ``selected_paths`` that have no entry in ``scores`` are treated
    as invalid and skipped. For the remaining words, the cell values and
    bonuses along the word's path are obtained via ``extract_metadata`` and
    paired with the word's score from ``scores``.

    Returns a list of tuples in the iteration order of ``selected_paths``.
    """
    rows = []

    for word in selected_paths:
        # only words that actually received a score are kept
        if word in scores:
            values, bonuses = extract_metadata(selected_paths[word], matrix)
            rows.append((word, values, bonuses, scores[word]))

    return rows

In [10]:
# Which sample folder under ../assets/score_prediction to process.
folder = 11
filepath = f"../assets/score_prediction/{folder}"

# Load the raw inputs for this sample. `image` is not used in this cell.
image = cv2.imread(f"{filepath}/sample.png")
# sep=r"\s+" is the documented replacement for the deprecated
# delim_whitespace=True option of pandas.read_csv.
paths_df = pd.read_csv(f"{filepath}/paths.csv", sep=r"\s+")
matrix_df = pd.read_csv(f"{filepath}/matrix.csv", sep=r"\s+")

# Parse the inputs into in-memory structures.
selected_paths = extract_selected_paths(paths_df)
scores = extract_target_scores(filepath)
matrix = construct_matrix(matrix_df)

# One row per word that has both a path and a score.
training_data = get_training_data(selected_paths, scores, matrix)
headers = ["word", "values", "bonuses", "target_score"]

df = pd.DataFrame(training_data, columns=headers)

In [11]:
output_filepath = "score_prediction/training_data.csv"

if not os.path.exists(output_filepath):
    # First run: make sure the target directory exists, then write this
    # sample's rows as the initial training file.
    os.makedirs(os.path.dirname(output_filepath), exist_ok=True)
    df.to_csv(output_filepath, sep=" ", index=False)
else:
    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True option of pandas.read_csv.
    root_df = pd.read_csv(output_filepath, sep=r"\s+")
    # Cast the list columns to str, presumably so they compare equal to the
    # text read back from the CSV when duplicates are dropped.
    df["values"] = df["values"].astype(str)
    df["bonuses"] = df["bonuses"].astype(str)

    # Same steps and order as before, expressed as a chain instead of
    # inplace=True mutations.
    merged_df = (
        pd.concat([root_df, df])
        .drop_duplicates()
        .reset_index(drop=True)
        .sort_values("target_score", ascending=False)
    )
    # Rows with a zero target score are discarded before saving.
    merged_df = merged_df[merged_df['target_score'] != 0]
    print(merged_df.to_string())

    merged_df.to_csv(output_filepath, sep=" ", index=False)

          word                    values                                      bonuses  target_score
1690    bestad        [3, 3, 1, 1, 1, 2]           ['3W', '3L', ' ', ' ', '2W', '2W']           143
0        razee          [1, 1, 10, 2, 1]                 [' ', ' ', '3W', '2L', '2W']            99
1        teaze          [1, 1, 1, 10, 2]                 [' ', '2W', ' ', '3W', '2L']            99
2          zee                [10, 2, 1]                           ['3W', '2L', '2W']            82
3        jatos           [8, 1, 1, 1, 1]                  [' ', ' ', ' ', '3W', '2W']            81
4      amatols     [1, 3, 1, 1, 1, 2, 1]       [' ', '2W', ' ', ' ', '3W', '2L', ' ']            74
5       amatol        [1, 3, 1, 1, 1, 2]            [' ', '2W', ' ', ' ', '3W', '2L']            65
6        james           [8, 1, 3, 1, 1]                  [' ', ' ', '2W', ' ', '2W']            65
7      santols     [1, 1, 1, 1, 1, 2, 1]       ['2W', ' ', ' ', ' ', '3W', '2L', ' ']            62
