In [1]:
# Imports
import numpy as np
import pandas as pd
from tqdm import tqdm

from kiloword.utils import *
from kiloword.config import Config
from kiloword.visualisation import plot_features
from kiloword.dimension_reduction import *

In [2]:
# Read the lexical-decision ERP table; it carries one row per (word, electrode)
# pair, identified by the WORD and ELECNAME columns.
erp_csv_path = Config().MNE_PATH / "KWORD_ERP_LEXICAL_DECISION_DGMH2015.csv"
data = pd.read_csv(erp_csv_path)
list_words = data["WORD"].unique()
list_electrodes = data["ELECNAME"].unique()  # NOTE: still includes the REJ* entries

# Rows whose ELECNAME is one of these are excluded from the EEG array.
rejected_channels = ["REJ1", "REJ2", "REJ3"]
# Identifier columns removed so that only the numeric signal columns remain.
metadata_columns = ["WORD#", "WORD", "ELEC#", "ELECNAME"]

grouped_data = data.groupby("WORD")
list_eegs = []
for word in list_words:
    da = grouped_data.get_group(word)
    keep_rows = ~da["ELECNAME"].isin(rejected_channels)
    da = da.loc[keep_rows]
    d = da.drop(columns=metadata_columns).to_numpy()
    list_eegs.append(d)

# One 2-D array per word, stacked along a new leading axis
# (presumably words x electrodes x time samples -- TODO confirm axis meaning).
eegs = np.stack(list_eegs, axis=0)
eegs.shape

(960, 29, 256)

In [None]:
### 2. Parse the semantic labels

In [None]:
from kiloword.utils import parse_table_labels

# Semantic categories handed to parse_table_labels; the order is kept exactly
# as originally written in case downstream code relies on it.
LIST_LABELS = [
    "ENTERTAINMENT", "MONEY", "NATURE", "QUANTITY",
    "POLITICS", "RELIGION", "HOUSE", "MOVE",
    "SPORT", "JUSTICE", "INDUSTRY", "LANGUAGE",
    "FOOD", "MODE", "DEVICE", "FAMILY",
    "MUSIC", "CRIME", "CATASTROPHE", "ARMY",
    "TIME", "SCHOOL", "CLEANNESS", "DEATH",
    "GLORY", "BODY", "PEOPLE", "MEDICAL",
    "MATERIAL", "GOVERN", "SCIENCE", "PHILOSOPHY",
]

# Despite its name, LABELS_CSV holds the parsed DataFrame, not a path.
labels_csv_path = Config().MNE_PATH / "words_and_pos.csv"
LABELS_CSV = pd.read_csv(labels_csv_path)

# Build the label table from the raw CSV contents and the category list above.
labels_table = parse_table_labels(LABELS_CSV, LIST_LABELS)

In [None]:
### 3. Load BERT features

In [4]:
# Resolve the feature files against the current user's home directory instead
# of a fixed "/home/viki" path, so the cell also runs on other accounts and
# machines. For the original user this resolves to the same location.
from pathlib import Path

bert_features_dir = Path.home() / "Downloads"

# Two precomputed feature arrays; the filenames suggest features from a trained
# and from a randomly initialised BERT model -- TODO confirm against the script
# that produced them.
bert_features = np.load(bert_features_dir / "kiloword_trained_bert_features.npy")
random_bert_features = np.load(bert_features_dir / "kiloword_random_bert_features.npy")

In [None]:
# Make pairs
from itertools import combinations

def all_pairs(elements):
    """Return every unordered pair of items from ``elements`` as a list of 2-tuples.

    Pairs are produced by ``itertools.combinations``: each pair keeps the
    input order of its two members, and no item is paired with itself.
    Inputs with fewer than two items give an empty list.
    """
    return [pair for pair in combinations(elements, 2)]

# Index pairs and word pairs are built by the same combination order, so
# entry k of one list corresponds to entry k of the other.
list_paired_indices = all_pairs(range(len(list_words)))
list_paired_words = all_pairs(list_words)

In [None]:
### 4. Distance computation

In [None]:
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance

def compute_all_dl_distance(list_pairs, normalize=True):
    """Compute the Damerau-Levenshtein distance for each pair of words.

    Args:
        list_pairs: iterable of ``(word1, word2)`` pairs.
        normalize: when truthy, use
            ``normalized_damerau_levenshtein_distance``; otherwise use
            ``damerau_levenshtein_distance``.

    Returns:
        A list with one distance per pair, in the same order as ``list_pairs``.
    """
    # The metric does not change between pairs, so choose it once up front.
    metric = (
        normalized_damerau_levenshtein_distance
        if normalize
        else damerau_levenshtein_distance
    )
    # tqdm only wraps the iterable to show progress over the pairs.
    return [metric(first, second) for (first, second) in tqdm(list_pairs)]