### Setup
#### Imports

In [1]:
from collections import defaultdict
from gensim.models import FastText
from gensim.utils import simple_preprocess
import json
import numpy as np
from numpy.linalg import norm
import os
import pandas as pd
import re
import sys
from sklearn.manifold import TSNE
from tqdm import tqdm

# Needed because the notebooks are not in the same directory as the `helpers` package
sys.path.insert(0, os.path.abspath(".."))  # assumes notebook is in {root}/notebooks/{fname}.ipynb

# autoreload imports within same session when rerunning cell
%load_ext autoreload

%autoreload 2
from helpers.data_fetchers import fetch_sl_stopwords
from helpers.nlp import read_corpus, \
        get_similar_words_fasttext, \
        get_topn_neighbors, jaccard, \
        generate_seed_words_from_stem, \
        fasttext_incr_train_and_predict, \
        compute_epochwise_jaccard_similarity

#### Constants

In [2]:
# CONSTANTS
# All paths are relative; they resolve against the kernel's working directory
# (the imports cell assumes the notebook lives in {root}/notebooks/).

# Output of Step 1: JSON list with one record per paragraph
# (doc_id, paragraph_id, text, vector).
VECTORIZED_CORPUS_PATH = "../output/paragraph_ft_vectors.json"
# Output of Step 2: JSON dict mapping a seed-set name to its averaged vector.
VECTORIZED_SHAME_WORDS_PATH = "../output/shame_ft_vectors.json"
# Input corpus directory; every file in it is read line by line, one paragraph per line.
ORIG_CORPUS_DIR = "../data/original_txt_corpus/paragraph"
# Output of the annotation cell: flat JSON list of paragraphs with cos_sim_* scores.
ANNOTATED_CORPUS_PATH = "../annotations/ft_cos_sim_ann.json"
# Saved gensim FastText model, loaded with FastText.load below.
FT_MODEL_PATH = "../models/ft_word_embeddings/ft_300ep_original_txt_paragraph_remove_stopwords.model"
# Stopword file handed to helpers.data_fetchers.fetch_sl_stopwords.
STOPWORDS_PATH = '../data/stopwords_sl.txt'

#### Precomputed vars

In [3]:
# Load stopwords
# NOTE(review): the return type of fetch_sl_stopwords is not visible here; below it
# is only used for `in` membership tests, so a set would be the efficient choice - confirm.
stopwords = fetch_sl_stopwords(STOPWORDS_PATH)

# Load FastText model
# Loaded once here and reused by every vectorization cell further down.
ft_model = FastText.load(FT_MODEL_PATH)

#### Helpers

In [4]:
def paragraph_to_vector(paragraph, model, stopwords):
    """Embed a paragraph as the mean of its tokens' word vectors.

    The text is tokenized with ``simple_preprocess(..., deacc=True)``, tokens
    contained in ``stopwords`` are skipped, and every remaining token that
    passes the ``in model.wv`` check is looked up and averaged.

    Args:
        paragraph: Raw paragraph text.
        model: Object exposing ``wv`` (token -> vector lookup that supports
            ``in``) and ``vector_size``.
        stopwords: Container of tokens to ignore.

    Returns:
        list: The averaged vector as a plain Python list, or a list of
        ``model.vector_size`` zeros when no token yielded a vector.
    """
    embeddings = []
    for token in simple_preprocess(paragraph, deacc=True):
        if token in stopwords:
            continue
        if token in model.wv:
            embeddings.append(model.wv[token])

    # Nothing usable in the paragraph: fall back to the zero vector.
    if not embeddings:
        return np.zeros(model.vector_size).tolist()
    return np.mean(embeddings, axis=0).tolist()

def cosine_sim(a, b):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        a: First vector (any array-like accepted by ``np.array``).
        b: Second vector, same length as ``a``.

    Returns:
        float: ``dot(a, b) / (|a| * |b|)``, or ``0.0`` if either vector has
        zero norm (avoids a division by zero).
    """
    vec_a = np.array(a)
    vec_b = np.array(b)

    len_a = norm(vec_a)
    if len_a == 0:
        return 0.0
    len_b = norm(vec_b)
    if len_b == 0:
        return 0.0

    return float(np.dot(vec_a, vec_b) / (len_a * len_b))


def vectorize_words(seed_words, model):
    """Average the word vectors of a list of seed words.

    Args:
        seed_words: Iterable of words to look up.
        model: Object exposing ``wv`` (word -> vector lookup that supports
            ``in``). Words failing the ``in model.wv`` check are skipped.

    Returns:
        list: Element-wise mean of the found vectors as a plain Python list.

    Raises:
        ValueError: If no seed word passes the ``in model.wv`` check.
    """
    found = []
    for word in seed_words:
        if word in model.wv:
            found.append(model.wv[word])

    if len(found) == 0:
        raise ValueError("None of the seed_words are in vocabulary!")

    centroid = np.mean(found, axis=0)
    return centroid.tolist()

### Step 1: Vectorize all novels with the FastText model we trained before
See the notebook `word_embeddings_train.ipynb` for how that model was trained.

In [5]:

# Build one record per paragraph (one line of a corpus file = one paragraph)
# and persist all records, including their FastText vectors, as a single JSON file.
all_paragraphs = []
corpus_dir = ORIG_CORPUS_DIR
vector_path = VECTORIZED_CORPUS_PATH

# Keep only regular, non-hidden files: os.listdir also returns subdirectories and
# OS/editor artefacts (e.g. ".DS_Store", ".ipynb_checkpoints"); calling open() on
# those would abort the run or add junk documents to the corpus.
file_list = sorted(
    fname
    for fname in os.listdir(corpus_dir)
    if not fname.startswith(".") and os.path.isfile(os.path.join(corpus_dir, fname))
)

for fname in tqdm(file_list, desc="Files"):
    # Document id = file name without its extension
    doc_id = os.path.splitext(fname)[0]
    # Read all lines up front so the inner progress bar knows its total
    with open(os.path.join(corpus_dir, fname), "r", encoding="utf-8") as f:
        lines = f.readlines()
    for i, line in enumerate(tqdm(lines, desc=f"Paragraphs in {fname}", leave=False)):
        line = line.strip()
        # Paragraphs without usable tokens get the zero vector (see paragraph_to_vector)
        vector = paragraph_to_vector(line, ft_model, stopwords)
        entry = {
            "doc_id": doc_id,
            "paragraph_id": i,  # 0-based line index within the file
            "text": line,
            "vector": vector
        }
        all_paragraphs.append(entry)

# Save as JSON
os.makedirs(os.path.dirname(vector_path), exist_ok=True)
with open(vector_path, "w", encoding="utf-8") as f:
    json.dump(all_paragraphs, f, ensure_ascii=False, indent=2)

Files: 100%|██████████| 326/326 [00:48<00:00,  6.77it/s]


### Step 2: Compute Similarity to Shame Vector(s)

#### Compute shame vectors

In [6]:
# create a couple of vectors to start with
# Read explicitly as UTF-8: without `encoding`, open() uses the platform locale,
# which can mangle Slovene characters (č, š, ž) on non-UTF-8 systems.
with open('../output/seed_words.txt', 'r', encoding='utf-8') as f:
    stem_words = f.readlines()

# Strip commas and surrounding whitespace, then drop entries that end up empty
# (blank or comma-only lines) so they cannot leak into the averaged vector.
stem_words = [x.replace(',', '').strip() for x in stem_words]
stem_words = [w for w in stem_words if w]
babel_words = ['sram', 'skesan', 'osramočen', 'ponižan', 'kazniv']
kontekstio_words = ['strah', 'groza', 'motilo', 'nerodno', 'bolelo', 'zaskrbelo']

# NOTE(review): paragraph tokens are de-accented (simple_preprocess(..., deacc=True))
# while these seed words keep their diacritics - confirm this matches how the
# FastText model's vocabulary was built.
# One averaged vector per seed-word set; the keys become the `cos_sim_<name>`
# suffixes in the annotation output.
shame_vectors = {
    "stem_sram": vectorize_words(stem_words, ft_model),
    "babel_words": vectorize_words(babel_words, ft_model),
    "kontekstio_words": vectorize_words(kontekstio_words, ft_model),
}

# Save for use in similarity computation
os.makedirs(os.path.dirname(VECTORIZED_SHAME_WORDS_PATH), exist_ok=True)
with open(VECTORIZED_SHAME_WORDS_PATH, "w", encoding="utf-8") as f:
    json.dump(shame_vectors, f, ensure_ascii=False, indent=2)

#### Annotate corpus

In [7]:
# Read back the paragraph records (with vectors) persisted in Step 1
with open(VECTORIZED_CORPUS_PATH, "r", encoding="utf-8") as f:
    paragraphs = json.load(f)

# Read back the named shame vectors; several may be present
with open(VECTORIZED_SHAME_WORDS_PATH, "r", encoding="utf-8") as f:
    shame_vectors = json.load(f)  # dict: name -> vector

# Flat list: one entry per paragraph carrying its identifying fields plus one
# `cos_sim_<name>` score per shame vector (the vector itself is not copied).
results = []

for para in tqdm(paragraphs, desc="Annotating paragraphs"):
    entry = {key: para[key] for key in ("doc_id", "paragraph_id", "text")}
    entry.update(
        (f"cos_sim_{vec_name}", cosine_sim(para["vector"], shame_vec))
        for vec_name, shame_vec in shame_vectors.items()
    )
    results.append(entry)

# Persist the annotated paragraphs
os.makedirs(os.path.dirname(ANNOTATED_CORPUS_PATH), exist_ok=True)
with open(ANNOTATED_CORPUS_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

Annotating paragraphs: 100%|██████████| 343653/343653 [00:22<00:00, 15565.34it/s]


### Step 3: Convert to main annotation (nested and enriched)

In [8]:
# Read `shame_id` as string: doc_id values are file-name stems (always strings),
# so a numerically parsed id column would never match them and every document
# would silently end up with empty author/title.
df = pd.read_csv('../corpus_metadata.csv', dtype={"shame_id": str})
# Missing author/title cells become "" (same as the fallback used below) instead
# of NaN, which json.dump would write as the non-standard `NaN` token.
meta_lookup = (
    df.set_index("shame_id")[["author", "title"]]
    .fillna("")
    .to_dict(orient="index")
)

# Nested structure: doc_id -> {"author", "title", "paragraphs": {paragraph_id: {...}}}
results = defaultdict(lambda: {"author": None, "title": None, "paragraphs": {}})


for para in paragraphs:
    doc_id = para["doc_id"]
    paragraph_id = para["paragraph_id"]
    text = para["text"]

    # Set author/title from meta_lookup if not already
    if results[doc_id]["author"] is None:
        meta = meta_lookup.get(doc_id, {})
        results[doc_id]["author"] = meta.get("author", "")
        results[doc_id]["title"] = meta.get("title", "")

    # Build paragraph annotation
    para_entry = {"text": text}
    for vec_name, shame_vec in shame_vectors.items():
        sim = cosine_sim(para["vector"], shame_vec)
        para_entry[f"cos_sim_{vec_name}"] = sim

    # Store by paragraph id (integer keys are written as strings in the JSON output)
    results[doc_id]["paragraphs"][paragraph_id] = para_entry

# Create the target directory first so this cell does not depend on an earlier
# cell having created it.
os.makedirs(os.path.dirname("../annotations/main_shame_annotations.json"), exist_ok=True)
with open("../annotations/main_shame_annotations.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)