## Import packages, indicate the /src location, retrieve the data, and prep the corpus for segmentation

In [None]:
# === Import
import pandas as pd
import re
import sys
import json
from pathlib import Path
import nltk

# === Download NLTK resources if missing ===
# "punkt": looked up locally first and downloaded only when the lookup fails.
# An exception raised by this download is not caught here and propagates.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# "punkt_tab": same lookup, but the download is best-effort — a failure must
# not stop the notebook. It is reported on stderr instead of being swallowed,
# so a later tokenizer error can be traced back to the missing resource.
# NOTE(review): presumably best-effort because not every NLTK release ships
# "punkt_tab" — confirm.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    try:
        nltk.download("punkt_tab")
    except Exception as exc:
        print(
            f"Warning: could not download NLTK resource 'punkt_tab': {exc!r}",
            file=sys.stderr,
        )

# === Define the path to the auxiliary modules ===
# The auxiliary code is expected in "<parent of the working directory>/src".
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src").resolve()

# Put SRC at the front of the module search path, unless it is already listed
# (keeps re-running this cell from adding duplicates).
if sys.path.count(str(SRC)) == 0:
    sys.path[:0] = [str(SRC)]

# === import the module for discourse segmentation ===
from discourse.segment import init_embeddings, segment_text

In [None]:
# === Define the path to the data ===
HOME = Path.home()
# strict=True makes resolve() raise FileNotFoundError when the folder is
# missing, so a wrong location is caught here rather than at read time.
DATA_DIR = HOME.joinpath(
    "My Drive",
    "_VectorData",
    "projects",
    "identifying_depression_with_rst",
    "data",
).resolve(strict=True)

# === Pattern ===
# File names of interest: a "K", at least one more character, then ".csv".
data_files_pattern = r"K.+\.csv"

# === Retrieve the data
find_files = DATA_DIR / "raw"

# Collected as (dataset name, dataframe) pairs.
data = []

# Sorted so the order of `data` does not depend on the directory listing order.
for item in sorted(find_files.iterdir()):
    if not item.is_file():
        continue

    # Search the file name only: searching the full path would also hit a "K"
    # in any parent directory name, which makes every CSV file match and puts
    # the directory path into the dataset name.
    set_name = re.search(data_files_pattern, item.name)
    if set_name is None:
        continue

    # removesuffix() drops exactly the trailing ".csv". str.strip(".csv")
    # instead removes any leading/trailing run of the characters ".", "c",
    # "s" and "v" (e.g. "kids.csv" -> "kid").
    dataset_name = set_name.group(0).lower().removesuffix(".csv")
    data.append((dataset_name, pd.read_csv(item)))


## Initiate the model to get embeddings from text segments & segment all the texts in the corpus

In [None]:
# === HF options (alternatives, currently disabled)
# model_name = "DeepPavlov/rubert-base-cased"
# model_name = "ai-forever/ruBert-base"
# model_name = "sberbank-ai/ruBert-large"

# === ST options (active)
# Fast, and arguably gives results as good as the HF models.
model_name = "sberbank-ai/sbert_large_nlu_ru"

# No window size is passed, so the segmenter's default is used; the default
# window size of 1 seems to yield the best results.
init_embeddings(
    backend="st",
    model_name=model_name,
)

In [None]:
# === Helper: run the segmenter over every text of one corpus dataframe
def segment_texts_in_corpus(corpus: pd.DataFrame) -> list[str]:
    """Apply ``segment_text`` to each value of the ``text`` column of *corpus*.

    Args:
        corpus: Dataframe with a ``text`` column; every value of that column
            is passed unchanged to ``segment_text``.

    Returns:
        A list with one ``segment_text`` result per row, in row order.

    NOTE(review): the ``list[str]`` annotation may be inaccurate. The
    inspection cells of this notebook index into each result (``i[0]``) and
    take the length of that element, which suggests ``segment_text`` returns
    a sequence rather than a plain string. Confirm against
    ``discourse.segment``.
    """
    return [segment_text(raw_text) for raw_text in corpus["text"].to_list()]

In [None]:
# === Segment all the texts in all the corpora
# Maps each corpus name to the list of per-text segmentation results.
segmented_corpus = {
    name: segment_texts_in_corpus(corpus)
    for name, corpus in data
}

## 'Visual inspection' of the processed corpora

In [None]:
# Show the corpus names, i.e. the keys of segmented_corpus.
segmented_corpus.keys()

In [None]:
# Corpus to inspect in the cells below; used as a key into segmented_corpus.
corpus_name = "ked"

In [None]:
# First text of the chosen corpus: show element 0 of its segment_text result.
# NOTE(review): element 0 is presumably the list of segments — confirm
# against discourse.segment.segment_text.
segmented_corpus[corpus_name][0][0]

In [None]:
# Keep only the results whose first element holds at least two items.
# NOTE(review): presumably these are the texts that were actually split into
# several segments — confirm against the return value of segment_text.
find_split_texts = [
    result
    for result in segmented_corpus[corpus_name]
    if len(result[0]) >= 2
]

In [None]:
# Number of results kept in find_split_texts.
len(find_split_texts)

In [None]:
# Spot check: element 0 of the third kept result.
# Raises IndexError when find_split_texts has fewer than 3 entries.
find_split_texts[2][0]

In [None]:
# Spot check: element 0 of the kept result at index 37.
# Raises IndexError when find_split_texts has fewer than 38 entries.
find_split_texts[37][0]

## Saving the (segmented) corpus for downstream processing (with an RST parser)

In [None]:
save_files_path = DATA_DIR / "processed"
# Create the output folder on first use instead of failing inside open().
save_files_path.mkdir(parents=True, exist_ok=True)
processed_data_file = save_files_path / "segmented_corpus.json"

# ensure_ascii=False makes json.dump write non-ASCII characters verbatim, so
# the file encoding is pinned to UTF-8. Without it, open() falls back to the
# platform's locale encoding, which can raise UnicodeEncodeError or produce a
# file that other tools cannot read back.
with open(processed_data_file, "w", encoding="utf-8") as file:
    json.dump(segmented_corpus, file, indent=4, ensure_ascii=False)