In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from utils import read_conll, Parser

# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs, so edits to local modules (presumably utils.py) are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory holding the input files, relative to this notebook.
data_path = Path("../code/data")

# Parse the training file with lowercasing enabled and keep only the first
# 1000 rows of the resulting frame.
train_file = data_path / "train.conll"
df = pd.DataFrame(read_conll(train_file, lowercase=True)).iloc[:1000]
df.head()

In [None]:
# Build the parser from the loaded frame.
parser = Parser(df)

# Vectorize the first two rows, passed as separate record dicts, and show the result.
first_two_records = df.iloc[:2].to_dict("records")
parser.vectorize(*first_two_records)

In [None]:
# Load the pretrained word vectors. Each non-empty line of the file holds a
# token followed by its embedding components, separated by whitespace.
word_vectors = {}
# `with` closes the file handle deterministically; iterating the handle streams
# the file line by line instead of materialising it with readlines().
with open(data_path.joinpath("en-cw.txt")) as vectors_file:
    for line in vectors_file:
        fields = line.strip().split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        word, *embedding = fields
        word_vectors[word] = np.asarray(embedding, dtype="float32")

# Start from random embeddings (normal distribution, mean 0, standard
# deviation 0.9) so tokens without a pretrained vector still get a row.
# A fixed seed keeps the matrix identical across "Restart & Run All".
embedding_dim = 50  # assumed to match the vector length stored in en-cw.txt
embedding_rng = np.random.default_rng(0)
embeddings_matrix = embedding_rng.normal(0, 0.9, (parser.n_tokens, embedding_dim)).astype("float32")

# Overwrite the random row of every token that has a pretrained vector,
# trying the token as-is first and its lowercased form second.
for token in parser.tok2id:
    vector = word_vectors.get(token)
    if vector is None:
        vector = word_vectors.get(token.lower())
    if vector is not None:
        embeddings_matrix[parser.tok2id[token]] = vector

# Row of `df` used as the worked example, and how many leading items of it to keep.
sample_index = 11
sub_samples = 6

# Truncate every list-valued field of the chosen row to its first `sub_samples`
# items, then vectorize that shortened record.
sample_record = {
    key: value[:sub_samples]
    for key, value in df.iloc[sample_index].to_dict().items()
    if isinstance(value, list)
}
dev_set = parser.vectorize(sample_record)

# Human-readable view of the same example: the first four columns of the row,
# one item per line (assumes those columns are word / pos / head / label lists).
sample = pd.DataFrame(df.iloc[sample_index, :4].to_list()).T[:sub_samples]
sample.columns = ["word", "pos", "head", "label"]
sample

In [None]:
# Tabulate the first vectorized example and, next to each id column, add a
# column with the ids mapped back through parser.id2tok (None when an id is missing).
sample_vector_df = pd.DataFrame(dev_set[0])
for id_column, decoded_column in (("word", "w"), ("pos", "p"), ("label", "l")):
    sample_vector_df[decoded_column] = sample_vector_df[id_column].apply(parser.id2tok.get)
sample_vector_df

In [None]:
# Expand the vectorized sample into instances and print each one.
instances = parser.create_instances(*dev_set)
print("number of instances:", len(instances))
for words, legal_labels, gold_transition in instances:
    print("legal labels:", legal_labels)
    print("gold transition:", gold_transition)
    # Map the ids back through parser.id2tok for readability.
    print(list(map(parser.id2tok.get, words)))

In [None]:
# Rebuild the sample sentence from its word column, then display the steps
# stored in the accompanying CSV file.
sentence = " ".join(sample["word"])
print(f"The steps for the sample: '{sentence}'")
pd.read_csv("create_instances_sample.csv")