# This notebook ensures that there are only two keyboard layouts in the dataset: `default` and `extra`. All instances of layouts with the same name are absolutely identical (not only in proportions, but also in scale)

#### Data description

Данные представлены в формате JSON Lines: каждая строка файла — отдельный JSON-объект следующего содержания:

*   word – таргет (слово, которое хотел ввести пользователь)
*   curve
    *   x – массив координат X
    *   y – массив координат Y
    *   t – массив временных меток (время в миллисекундах)
    *   grid – формат раскладки
        *   grid\_name – название раскладки (default или extra)
        *   width – ширина раскладки
        *   height – высота раскладки
        *   keys – массив клавиш
            *   label – символ клавиши
            *   hitbox – область нажатия
                *   x – координата X верхнего левого угла
                *   y – координата Y верхнего левого угла
                *   w – ширина области
                *   h – высота области

У клавиш `shift`, `enter`, `space`, `globe`, `toNumberState`, `backspace` нет ключа 'label'; вместо него у них есть ключ 'action', значение которого совпадает с названием клавиши

In [None]:
from typing import List, Set, Optional, Dict
from collections import defaultdict
import json
import os

from tqdm import tqdm

In [None]:
# Locations of the train / validation / test splits of the original dataset.
# The paths are relative, so they resolve against the current working directory.
TRAIN_DATASET_PATH = "../../data/data_original/train.jsonl"
VALID_DATASET_PATH = "../../data/data_original/valid.jsonl"
TEST_DATASET_PATH = "../../data/data_original/test.jsonl"

In [None]:
# Count the lines of the train split by streaming the file in binary mode,
# so no text decoding is performed.
n_train_lines = 0
with open(TRAIN_DATASET_PATH, 'rb') as train_file:
    for _ in train_file:
        n_train_lines += 1
print(n_train_lines)
# result: 6000000
# takes 1 minute 15 seconds

In [None]:
# Number of lines per split; used as `total` for progress bars.
# The train value matches the line count printed by the counting cell;
# NOTE(review): the validation/test sizes are not re-counted in this notebook.
N_TRAIN_LINES = 6_000_000
N_VAL_LINES = 10_000
N_TEST_LINES = 10_000

# Посмотрим на данные, проверим одинаковость раскладок

Было проверено, что:
* В train ровно 6000000 примеров
* Все примеры имеют line_data['curve']['grid']['grid_name'] либо 'default', либо 'extra', другого точно нет
* Все раскладки с одним именем в точности одинаковы

Посмотрим на первые 5 свайпов


In [None]:
# Peek at the first N training examples and sanity-check their curves.
N = 5
with open(TRAIN_DATASET_PATH, encoding="utf-8") as f:
    # zip() stops after N lines or at EOF, whichever comes first, so a file
    # shorter than N lines no longer feeds an empty string into json.loads.
    train_examples = [json.loads(line) for _, line in zip(range(N), f)]

curves = [train_example["curve"] for train_example in train_examples]

# Validate every curve before printing anything.
for curve in curves:
    # x, y and t must be parallel arrays of the same length.
    assert len(curve["x"]) == len(curve["y"]) == len(curve["t"])
    # A curve holds exactly the coordinates, the timestamps and the grid.
    assert set(curve.keys()) == {'x', 'y', 't', 'grid'}

for curve in curves:
    print(f"x: {curve['x']}")
    print(f"y: {curve['y']}")
    print(f"t: {curve['t']}")
    print("---")

x: [306, 306, 307, 316, 337, 374, 440, 487, 548, 583, 619, 656, 688, 703, 725, 743, 755, 764, 771, 773, 770, 758, 707, 656, 626, 529, 470, 418, 394, 327, 327]
y: [398, 398, 398, 395, 391, 386, 389, 397, 410, 415, 410, 398, 383, 374, 353, 332, 316, 302, 286, 281, 272, 260, 235, 214, 202, 158, 127, 97, 82, 43, 43]
t: [0, 7, 24, 62, 64, 81, 108, 125, 142, 158, 175, 194, 208, 224, 241, 258, 275, 293, 308, 325, 341, 359, 375, 394, 411, 426, 443, 459, 475, 494, 499]
---
x: [454, 450, 447, 439, 435, 431, 428, 427, 426, 425, 424, 436, 457, 485, 510, 533, 555, 567, 574, 574, 549, 522, 504, 488, 473, 463, 459, 457, 449, 427, 395, 367, 338, 312, 290, 274, 266, 263, 262, 292, 346, 415, 504, 584, 675, 748, 810, 864, 899, 917, 921, 919, 887, 820, 737, 656, 573, 487, 401, 337, 285, 243, 213, 198, 189, 183, 183]
y: [249, 235, 221, 182, 158, 110, 65, 29, 9, 3, 0, 26, 71, 123, 174, 218, 252, 268, 272, 271, 230, 178, 129, 88, 60, 44, 40, 39, 41, 60, 95, 131, 167, 197, 218, 230, 234, 236, 236, 247, 258, 2

Посмотрим на имеющиеся типы раскладок и их соотношение

In [None]:
# Count how many training examples use each keyboard layout (grid name).
grid_name_to_count = defaultdict(int)
# This used to be bound to the `dict` type itself instead of an empty dict.
grid_name_to_grid = {}

with open(TRAIN_DATASET_PATH, encoding="utf-8") as f:
    # `total` only feeds the progress bar; the loop still reads the whole file.
    for line in tqdm(f, total=N_TRAIN_LINES):
        line_data = json.loads(line)
        grid_name_to_count[line_data['curve']['grid']['grid_name']] += 1

print(grid_name_to_count)

# result: defaultdict(<class 'int'>, {'default': 5626340, 'extra': 373660})

Было проверено, что есть ровно две раскладки. Распределение крайне неравномерное. Раскладки с одним и тем же именем полностью идентичны

In [None]:
def compare_all_grids_same(datapaths: List[str],
                           gridnames: Set[str],
                           n_lines_list: Optional[List[Optional[int]]] = None,
                           verbose_different: bool = True
                           ) -> Dict[str, list]:
    """Check that every grid with a given name is identical across files.

    The first grid seen for each name becomes the template for that name;
    every later grid with the same name is compared against it.

    Args:
        datapaths: Paths to JSON-lines files. Each line must contain
            ``['curve']['grid']`` with a ``'grid_name'`` key.
        gridnames: Grid names that are expected to occur in the data.
        n_lines_list: Per-file line counts, used only as the progress-bar
            total. Individual entries may be ``None``. If the whole argument
            is ``None``, no totals are shown. It is zipped with ``datapaths``,
            so it should have the same length.
        verbose_different: If true, print a message for every mismatch.

    Returns:
        A dict mapping each grid name to a list of
        ``(datapath, zero_based_line_index, grid)`` tuples for the grids
        that differ from the template of that name.

    Raises:
        KeyError: If a line has a grid name that is not in ``gridnames``.
    """
    # The default used to be passed straight to zip(), which raised TypeError.
    if n_lines_list is None:
        n_lines_list = [None] * len(datapaths)

    grid_templates = {gridname: None for gridname in gridnames}
    grids_that_differ = {gridname: [] for gridname in gridnames}

    for datapath, n_lines in zip(datapaths, n_lines_list):
        with open(datapath, encoding="utf-8") as f:
            for i, line in tqdm(enumerate(f), total=n_lines):
                grid = json.loads(line)['curve']['grid']
                g_name = grid['grid_name']

                if g_name not in grid_templates:
                    # Still a KeyError, but now it says where the unexpected
                    # layout was found.
                    raise KeyError(
                        f"Unexpected grid name {g_name!r} "
                        f"in {datapath} at line {i}")

                template = grid_templates[g_name]
                if template is None:
                    # First occurrence: it defines the template and trivially
                    # equals itself, so there is nothing to compare.
                    grid_templates[g_name] = grid
                    continue

                # Dict equality in Python is recursive: all keys and the
                # corresponding values must match, so plain `!=` is enough.
                if grid != template:
                    grids_that_differ[g_name].append((datapath, i, grid))
                    if verbose_different:
                        print(f"Grid {g_name} differs in {datapath} at line {i}")

    return grids_that_differ

In [None]:
# Run the dict-based grid comparison over all three dataset splits.
gridnames = {'default', 'extra'}
datapaths = [TRAIN_DATASET_PATH, VALID_DATASET_PATH, TEST_DATASET_PATH]
N_TRAIN_LINES = 6_000_000

# Only the train split gets a progress-bar total; the other two pass None.
grids_that_differ = compare_all_grids_same(
    datapaths,
    gridnames,
    n_lines_list=[N_TRAIN_LINES, None, None],
    verbose_different=True,
)

print(grids_that_differ)

# result: {'default': [], 'extra': []}

In [None]:
# Similar check to compare_all_grids_same, but faster: it compares raw line
# suffixes instead of parsing JSON.

def compare_all_grids_same_using_strs(datapaths: List[str],
                                      end_strs: List[str],
                                      n_lines_list: List[Optional[int]],
                                      verbose_different: bool = True
                                      ) -> List[tuple]:
    """Find lines that do not end with any of the expected suffixes.

    Args:
        datapaths: Paths to the text files to scan line by line.
        end_strs: Allowed line endings. Lines are compared as read from the
            file, so each suffix must include the trailing ``"\\n"`` if the
            lines have one.
        n_lines_list: Per-file line counts, used only as the progress-bar
            total; entries may be ``None``. It is zipped with ``datapaths``.
        verbose_different: If true, print a message for every mismatch.

    Returns:
        A list of ``(datapath, zero_based_line_index, line)`` tuples for the
        lines that match none of ``end_strs``.
    """
    # str.endswith accepts a tuple, so any number of suffixes is supported
    # (previously exactly the first two were checked).
    suffixes = tuple(end_strs)
    lines_with_different_grids = []
    for datapath, n_lines in zip(datapaths, n_lines_list):
        with open(datapath, encoding="utf-8") as f:
            for i, line in tqdm(enumerate(f), total=n_lines):
                if line.endswith(suffixes):
                    continue
                # Record the mismatch regardless of verbosity. The append used
                # to sit under `if verbose_different`, so a quiet run always
                # returned an empty list.
                lines_with_different_grids.append((datapath, i, line))
                if verbose_different:
                    print(f"Grid differs in {datapath} at line {i}")
    return lines_with_different_grids



# Run the suffix-based check over all three dataset splits.
datapaths = [TRAIN_DATASET_PATH, VALID_DATASET_PATH, TEST_DATASET_PATH]

N_TRAIN_LINES = 6000000

# The two suffixes are the serialized `default` and `extra` grids, including
# the closing braces and the trailing newline of a dataset line.
# Each literal must stay on one physical line: a single-quoted string cannot
# contain a raw line break.
compare_all_grids_same_using_strs(
    datapaths,
    end_strs=[
        '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":99,"h":154}},{"label":"ц","hitbox":{"x":98,"y":15,"w":99,"h":154}},{"label":"у","hitbox":{"x":196,"y":15,"w":100,"h":154}},{"label":"к","hitbox":{"x":295,"y":15,"w":99,"h":154}},{"label":"е","hitbox":{"x":393,"y":15,"w":99,"h":154}},{"label":"н","hitbox":{"x":491,"y":15,"w":99,"h":154}},{"label":"г","hitbox":{"x":589,"y":15,"w":99,"h":154}},{"label":"ш","hitbox":{"x":687,"y":15,"w":99,"h":154}},{"label":"щ","hitbox":{"x":785,"y":15,"w":100,"h":154}},{"label":"з","hitbox":{"x":884,"y":15,"w":99,"h":154}},{"label":"х","hitbox":{"x":982,"y":15,"w":98,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":99,"h":154}},{"label":"ы","hitbox":{"x":98,"y":169,"w":99,"h":154}},{"label":"в","hitbox":{"x":196,"y":169,"w":100,"h":154}},{"label":"а","hitbox":{"x":295,"y":169,"w":99,"h":154}},{"label":"п","hitbox":{"x":393,"y":169,"w":99,"h":154}},{"label":"р","hitbox":{"x":491,"y":169,"w":99,"h":154}},{"label":"о","hitbox":{"x":589,"y":169,"w":99,"h":154}},{"label":"л","hitbox":{"x":687,"y":169,"w":99,"h":154}},{"label":"д","hitbox":{"x":785,"y":169,"w":100,"h":154}},{"label":"ж","hitbox":{"x":884,"y":169,"w":99,"h":154}},{"label":"э","hitbox":{"x":982,"y":169,"w":98,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":120,"h":154}},{"label":"я","hitbox":{"x":119,"y":323,"w":94,"h":154}},{"label":"ч","hitbox":{"x":212,"y":323,"w":95,"h":154}},{"label":"с","hitbox":{"x":306,"y":323,"w":94,"h":154}},{"label":"м","hitbox":{"x":399,"y":323,"w":95,"h":154}},{"label":"и","hitbox":{"x":493,"y":323,"w":94,"h":154}},{"label":"т","hitbox":{"x":586,"y":323,"w":95,"h":154}},{"label":"ь","hitbox":{"x":680,"y":323,"w":94,"h":154}},{"label":"б","hitbox":{"x":773,"y":323,"w":95,"h":154}},{"label":"ю","hitbox":{"x":867,"y":323,"w":95,"h":154}},{"action":"backspace","hitbox":{"x":961,"y":323,"w":119,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"default"}}}\n',
        '"grid":{"width":1080,"height":667,"keys":[{"label":"й","hitbox":{"x":0,"y":15,"w":91,"h":154}},{"label":"ц","hitbox":{"x":90,"y":15,"w":91,"h":154}},{"label":"у","hitbox":{"x":180,"y":15,"w":91,"h":154}},{"label":"к","hitbox":{"x":270,"y":15,"w":91,"h":154}},{"label":"е","hitbox":{"x":360,"y":15,"w":91,"h":154}},{"label":"н","hitbox":{"x":450,"y":15,"w":91,"h":154}},{"label":"г","hitbox":{"x":540,"y":15,"w":91,"h":154}},{"label":"ш","hitbox":{"x":630,"y":15,"w":91,"h":154}},{"label":"щ","hitbox":{"x":720,"y":15,"w":91,"h":154}},{"label":"з","hitbox":{"x":810,"y":15,"w":91,"h":154}},{"label":"х","hitbox":{"x":900,"y":15,"w":91,"h":154}},{"label":"ё","hitbox":{"x":990,"y":15,"w":90,"h":154}},{"label":"ф","hitbox":{"x":0,"y":169,"w":91,"h":154}},{"label":"ы","hitbox":{"x":90,"y":169,"w":91,"h":154}},{"label":"в","hitbox":{"x":180,"y":169,"w":91,"h":154}},{"label":"а","hitbox":{"x":270,"y":169,"w":91,"h":154}},{"label":"п","hitbox":{"x":360,"y":169,"w":91,"h":154}},{"label":"р","hitbox":{"x":450,"y":169,"w":91,"h":154}},{"label":"о","hitbox":{"x":540,"y":169,"w":91,"h":154}},{"label":"л","hitbox":{"x":630,"y":169,"w":91,"h":154}},{"label":"д","hitbox":{"x":720,"y":169,"w":91,"h":154}},{"label":"ж","hitbox":{"x":810,"y":169,"w":91,"h":154}},{"label":"э","hitbox":{"x":900,"y":169,"w":91,"h":154}},{"label":"ъ","hitbox":{"x":990,"y":169,"w":90,"h":154}},{"action":"shift","hitbox":{"x":0,"y":323,"w":91,"h":154}},{"label":"я","hitbox":{"x":90,"y":323,"w":91,"h":154}},{"label":"ч","hitbox":{"x":180,"y":323,"w":91,"h":154}},{"label":"с","hitbox":{"x":270,"y":323,"w":91,"h":154}},{"label":"м","hitbox":{"x":360,"y":323,"w":91,"h":154}},{"label":"и","hitbox":{"x":450,"y":323,"w":91,"h":154}},{"label":"т","hitbox":{"x":540,"y":323,"w":91,"h":154}},{"label":"ь","hitbox":{"x":630,"y":323,"w":91,"h":154}},{"label":"б","hitbox":{"x":720,"y":323,"w":91,"h":154}},{"label":"ю","hitbox":{"x":810,"y":323,"w":91,"h":154}},{"label":"?","hitbox":{"x":900,"y":323,"w":91,"h":154}},{"action":"backspace","hitbox":{"x":990,"y":323,"w":90,"h":154}},{"action":"toNumberState","hitbox":{"x":0,"y":477,"w":141,"h":154}},{"action":"globe","hitbox":{"x":140,"y":477,"w":120,"h":154}},{"label":",","hitbox":{"x":259,"y":477,"w":98,"h":154}},{"action":"space","hitbox":{"x":356,"y":477,"w":455,"h":154}},{"label":".","hitbox":{"x":810,"y":477,"w":98,"h":154}},{"action":"enter","hitbox":{"x":907,"y":477,"w":173,"h":154}}],"grid_name":"extra"}}}\n'
    ],
    n_lines_list = [N_TRAIN_LINES, None, None],
)

# result: []

Посмотрим на раскладки

In [None]:
def get_grids(grid_names: List[str], datapath: str) -> Dict[str, Optional[dict]]:
    """Collect one grid per requested grid name from a JSON-lines file.

    The file is read line by line and reading stops as soon as every
    requested name has been found.

    Args:
        grid_names: Names of the grids to look for.
        datapath: Path to a JSON-lines file. Each line must contain
            ``['curve']['grid']`` with a ``'grid_name'`` key.

    Returns:
        A dict with exactly the requested names as keys. Each value is the
        first grid found with that name, or ``None`` if the file has no such
        grid. Grids whose name was not requested are ignored.
    """
    grid_name_to_grid: Dict[str, Optional[dict]] = {
        gname: None for gname in grid_names
    }
    # Names still to be found; once empty there is no reason to keep reading.
    missing = set(grid_name_to_grid)
    if not missing:
        return grid_name_to_grid

    with open(datapath, encoding="utf-8") as f:
        for line in f:
            grid = json.loads(line)['curve']['grid']
            gname = grid['grid_name']
            if gname not in missing:
                # Either already collected or never requested: do not let it
                # leak into the result as an extra key.
                continue
            grid_name_to_grid[gname] = grid
            missing.discard(gname)
            if not missing:
                break
    return grid_name_to_grid

In [None]:
# Fetch one grid per layout name from the train split and print each one,
# followed by an empty line.
grid_names = ['default', 'extra']
grid_name_to_grid = get_grids(grid_names, TRAIN_DATASET_PATH)
for gname, grid in grid_name_to_grid.items():
    # Same output as three separate print() calls: name, grid, blank line.
    print(gname, grid, "", sep="\n")

default
{'width': 1080, 'height': 667, 'keys': [{'label': 'й', 'hitbox': {'x': 0, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'ц', 'hitbox': {'x': 98, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'у', 'hitbox': {'x': 196, 'y': 15, 'w': 100, 'h': 154}}, {'label': 'к', 'hitbox': {'x': 295, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'е', 'hitbox': {'x': 393, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'н', 'hitbox': {'x': 491, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'г', 'hitbox': {'x': 589, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'ш', 'hitbox': {'x': 687, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'щ', 'hitbox': {'x': 785, 'y': 15, 'w': 100, 'h': 154}}, {'label': 'з', 'hitbox': {'x': 884, 'y': 15, 'w': 99, 'h': 154}}, {'label': 'х', 'hitbox': {'x': 982, 'y': 15, 'w': 98, 'h': 154}}, {'label': 'ф', 'hitbox': {'x': 0, 'y': 169, 'w': 99, 'h': 154}}, {'label': 'ы', 'hitbox': {'x': 98, 'y': 169, 'w': 99, 'h': 154}}, {'label': 'в', 'hitbox': {'x': 196, 'y': 169, 'w': 100, 'h': 154}}, {'label': 'а', 'hitbox': {'x