# Basic Preprocessing

## Import packages, indicate the /src location, retrieve the data, and prep the corpus for segmentation

In [None]:
# ! pip install sentence_transformers

In [1]:
# === Import
import pandas as pd
import re
import sys
import json
from pathlib import Path
import nltk

# === Download NLTK resources if missing ===
# "punkt": look it up locally and download only when the lookup fails.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")

# "punkt_tab": same lookup, but the download is best effort -- any error
# raised while downloading is deliberately ignored.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    try:
        nltk.download("punkt_tab")
    except Exception:
        pass

# === Make the auxiliary modules importable ===
# /src is looked up next to the current working directory (i.e. in its parent).
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src").resolve()

# Prepend /src only when it is not listed yet, so re-running the cell
# does not add the same entry twice.
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))

In [3]:
# === Define the path to the data and the pattern for retrieval ===
HOME = Path.home()
# strict=True makes the cell fail right away if the data directory is missing.
DATA_DIR = (
    HOME / "My Drive" / "_VectorData" / "projects" / "identifying_depression_with_rst" / "data"
).resolve(strict=True)

# === Pattern ===
data_files_pattern = r"K.+\.csv"

# === Retrieve the data ===
find_files = DATA_DIR / "raw"

# List of (corpus_name, dataframe) pairs, one per matching CSV file.
data = []

# iterdir() yields entries in arbitrary order; sorting keeps the order of the
# corpora (and of everything derived from `data`) reproducible across machines.
for item in sorted(find_files.iterdir()):
    if not item.is_file():
        continue

    # Match against the file name only: searching the full path would also
    # pick up a "K" that happens to occur in one of the parent directories.
    set_name = re.search(data_files_pattern, item.name)
    if set_name is None:
        continue

    # removesuffix() drops exactly the ".csv" extension, whereas strip(".csv")
    # removes any of the characters ".", "c", "s", "v" from BOTH ends of the
    # name (e.g. "kcs.csv" would be reduced to "k").
    data.append((set_name.group(0).lower().removesuffix(".csv"), pd.read_csv(item)))


## Converting the Data into Corpora

In [None]:
# Layout of the corpora built in this cell:
# {"name_of_corpus": ['document_1', 'document_2'], ...}

processed_coprora = {}
for name, corpus in data:
    # Entries that share a corpus name are appended to the same document list.
    documents = processed_coprora.setdefault(name, [])
    documents.extend(corpus["text"].to_list())

# The diagnosis labels are stored in the same layout as the corpora
# e.g. {"name_of_corpus": ["diagnosis_1", "diagnosis_2", "diagnosis_1"]}

diagnoses = {}
for name, corpus in data:
    labels = diagnoses.setdefault(name, [])
    labels.extend(corpus["group"].to_list())

# Segmenting the Documents (Only if Necessary)

## Initiate the model to get embeddings from text segments & segment all the texts in the corpus

In [None]:
# === import the module for discourse segmentation ===
import importlib
import discourse.segment as seg

# === disable tokenizer parallelism ===
# The variable is set for this process only (os.environ), not system-wide.
# NOTE(review): presumably read by the tokenizer library used behind
# discourse.segment -- confirm it is picked up when set after the import above.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# In case we need to reload the module
# seg = importlib.reload(seg)    # e.g. if we change the _MAX_TOKENS value

In [None]:
# === HF options (alternatives kept for reference; uncomment one to try it)
# model_name = "DeepPavlov/rubert-base-cased"
# model_name = "ai-forever/ruBert-base"
# model_name = "sberbank-ai/ruBert-large"

# === ST options
model_name = "sberbank-ai/sbert_large_nlu_ru" # fast, with results arguably as good as with the HF models

# Initialise the embeddings in the segmentation module with the "st" backend.
# NOTE(review): "st" presumably stands for sentence-transformers -- confirm in discourse.segment.
seg.init_embeddings(backend="st", model_name=model_name) # the default window size of 1 seems to yield the best results

In [None]:
# Just to double-check what budget we're working with
# _MAX_TOKENS is an underscore-prefixed (non-public) module attribute of discourse.segment.
print("GLOBAL _MAX_TOKENS:", seg._MAX_TOKENS)
# __defaults__ is the tuple of default values of segment_text's positional parameters.
print("segment_text defaults:", seg.segment_text.__defaults__)

In [None]:
# === A helper to iterate over all the texts in a corpus (as a dataframe)
def segment_texts_in_corpus(corpus: pd.DataFrame) -> list[str]:
    """Segment every document in the ``text`` column of *corpus*.

    Each text is passed to ``seg.segment_text`` and the results are
    returned in row order, one entry per document.

    NOTE(review): the inspection cells in this notebook index each entry
    as ``entry[0]`` and take its length, so the entries look like nested
    sequences rather than plain strings -- the ``list[str]`` annotation
    may need revisiting.
    """
    return [seg.segment_text(document) for document in corpus["text"].to_list()]

In [None]:
# === Segment all the texts in all the corpora
# Resulting layout: {"name_of_corpus": [segmented_doc_1, segmented_doc_2, ...]}
segmented_corpora = {}

for name, corpus in data:
    # Merge (rather than overwrite) entries that share a corpus name, exactly
    # as is done for the preprocessed corpora and the diagnosis labels, so the
    # segmented documents stay aligned with their labels by index.
    segmented_corpora.setdefault(name, []).extend(segment_texts_in_corpus(corpus))

## 'Visual inspection' of the processed corpora

In [None]:
# =========================================================
# A note on the structure of the resulting segmented corpus:
# ==========================================================
# Each separate corpus is the value stored under the key that names this specific corpus (like 'ked' in this case)

segmented_corpora.keys()

In [None]:
# =========================================================
# A note on the structure of the resulting segmented corpus:
# ==========================================================

# Further down the tree the structure is as follows:
# The value of the key holds one entry per document, and each entry is made up of 2 items
# Where each item is also a list
# The first list -- which we currently need -- is the original text, either as a single list item if it has not been split
# Or as several items, which are the resulting chunks of the splitting pipeline

# The second list is made up of the sentences returned by the sentence tokenizer as list items
# These are not needed now -- they are just nice to keep around

# So, pulling the text (or the resulting chunks) of one document (index 106 here) from the "ked" corpus looks something like this:

corpus_name = "ked"
segmented_corpora[corpus_name][106][0]

In [None]:
# A text has been split when the first list of its entry (the chunks) holds more than one item

find_split_texts = [entry for entry in segmented_corpora[corpus_name] if len(entry[0]) >= 2]
len(find_split_texts)

In [None]:
# More 'visual inspection': the first list (the chunks) of the third text that has been split
find_split_texts[2][0]

# Saving the Results

## Saving the (segmented) corpus/corpora for downstream processing (with an RST parser)

In [22]:
# Which corpora need to be saved to a JSON file for downstream work: preprocessed or segmented

CORPORA = processed_coprora # {"corpus": [doc_1, doc_2, ....]}
# CORPORA = segmented_corpora # {"corpus": [[['doc_1'], ['doc_1_sent1', 'doc_1_sent2', ...]], [['doc_2_seg_1', 'doc_2_seg_2'], ['doc_2_sent1', 'doc_2_sent2', ...]], ....]}

In [23]:
# Write the selected corpora (CORPORA) to <DATA_DIR>/interim as JSON.
save_files_path = DATA_DIR / "interim"
# Create the output directory on first use instead of failing inside open().
save_files_path.mkdir(parents=True, exist_ok=True)
# NOTE(review): the file name is spelled with three "s"; it is left unchanged
# because downstream code may already read the file under this name -- confirm.
processed_data_file = save_files_path / "preprocesssed_corpora.json"

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# fixed to UTF-8 rather than left to the platform default, which may be unable
# to encode the texts (and would differ between machines).
with open(processed_data_file, "w", encoding="utf-8") as file:
    json.dump(CORPORA, file, indent=4, ensure_ascii=False)

In [24]:
# Save all the diagnosis labels into a separate file that mimics the corpora structure
# e.g. {"ked": ["diagnosis_1", "diagnosis_2", "diagnosis_1"]}

save_files_path = DATA_DIR / "interim"
# Create the output directory on first use instead of failing inside open().
save_files_path.mkdir(parents=True, exist_ok=True)
diagnoses_data = save_files_path / "all_diagnoses.json"

# ensure_ascii=False writes non-ASCII characters verbatim, so the encoding is
# fixed to UTF-8 rather than left to the platform default.
with open(diagnoses_data, "w", encoding="utf-8") as file:
    json.dump(diagnoses, file, indent=4, ensure_ascii=False)