## Import the Segmented Corpus

In [None]:
# If these packages are not installed:

# ! pip install git+https://github.com/iinemo/isanlp.git
# ! pip install isanlp_rst
# ! pip install hf_xet

In [None]:
# === Import
import pandas as pd
import re
import sys
import json
from pathlib import Path

import warnings
warnings.filterwarnings("ignore", message="`encoder_attention_mask` is deprecated")

# === Make the project's auxiliary modules (the src/ folder) importable ===
# ROOT is the parent of the current working directory, SRC is its "src" sub-folder.
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src").resolve()

# Prepend the folder only when it is missing, so re-running the cell
# does not pile up duplicate sys.path entries.
src_entry = str(SRC)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# === import the module for rst work === <--- this is not ready yet
# from discourse.rst import init_parser # this is not ready yet

In [None]:
# === Define the path to the data and load the segmented corpora ===
HOME = Path.home()
# strict=True makes resolve() fail right here (FileNotFoundError) when the data
# folder does not exist, instead of failing later at open().
DATA_DIR = (HOME / "My Drive" / "_VectorData" / "projects" / "identifying_depression_with_rst" / "data").resolve(strict=True)

corpus_path = DATA_DIR / "processed"
corpus_file = corpus_path / "segmented_corpora.json"

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's preferred encoding, which can garble or reject non-ASCII text.
# NOTE(review): assumes the upstream segmentation step wrote the file as UTF-8 - confirm.
with open(corpus_file, "r", encoding="utf-8") as file:
    corpora = json.load(file)

## Visual Inspection

In [None]:
# === A note on the structure of the resulting segmented corpora: ===
# `corpora` is a dict: each key is the name of one specific corpus (like 'ked' in this case)
# and the value under that key is the corpus itself.

corpora.keys()

In [None]:
# === Further down the tree, the structure is as follows: ===
# The value of a corpus key is a list of entries (indexed by position, as below)
# Each entry is a list of 2 items, where each item is also a list
# The first list is the original text, either as a single list item if it has not been split,
# or as several items, which are the resulting chunks of the splitting pipeline upstream
# The second list is made up of the sentences returned by the sentence tokenizer as list items

# So, pulling the text (or the resulting chunks) of one entry of the "ked" corpus looks something like this:

corpora["ked"][38][0]

In [None]:
# First item of the text/chunks list of entry 37 in the "ked" corpus
# (the whole text if it was not split upstream, otherwise its first chunk)
corpora["ked"][37][0][0]

In [None]:
# Number of entries in the "ked" corpus (each entry is a [texts/chunks, sentences] pair)
len(corpora["ked"])

## Run the RST Parser

In [None]:
# The parser is created directly in the notebook for now, while the rst module is WIP.
# Later on this set-up will be handled by the rst.py module.

from isanlp_rst.parser import Parser

model = "tchewik/isanlp_rst_v3"
version = "gumrrg"  # Choose from {'gumrrg', 'rstdt', 'rstreebank'}

# `parser` is the callable applied to every text segment further down.
parser = Parser(hf_model_version=version, hf_model_name=model)

In [None]:
# Keep the target corpus (texts/chunks only) in its own variable so that the
# navigation/iteration logic downstream stays simple.

corpus_name = "ked"
corpus = [
    entry[0]  # index 0 = the text/segments; the sentence-level list is not needed here
    for entry in corpora[corpus_name]
]

In [None]:
# === This works on the specific corpus from the corpora: ===

# The code expects the text either as one solid chunk or as two or more chunks
# in case the text has been segmented in the previous phase (segmentation)


def _as_segments(x):
    """Normalize one corpus item to a list of clean segment strings.

    The corpus may mix unsplit texts and pre-split texts, e.g.
    ``["text1", ["seg1_of_text2", "seg2_of_text2"], "text3", ...]``.
    Every shape is reduced to the same form so the parsing loop can treat
    all items alike.

    Args:
        x: A single string, an iterable of segments, or ``None``.

    Returns:
        list[str]: The segments with surrounding whitespace stripped.
        Non-string members and empty/whitespace-only strings are dropped,
        so the result may be empty.
    """
    # A missing item (e.g. JSON null) has no segments; returning [] keeps the
    # caller's loop running instead of raising TypeError on iteration.
    if x is None:
        return []

    # Treat a bare string as a one-segment text so that it goes through the
    # same strip/empty filtering as pre-split texts.
    if isinstance(x, str):
        x = [x]

    stripped = (s.strip() for s in x if isinstance(s, str))
    return [s for s in stripped if s]

# parsed_corpus[i] holds one parse result per segment of corpus[i];
# errors collects a record for every segment the parser failed on.
parsed_corpus = []
errors = []

for doc_idx, document in enumerate(corpus):
    doc_trees = []
    for seg_idx, segment in enumerate(_as_segments(document)):
        try:
            tree = parser(segment)
        except Exception as exc:
            # Best effort: note the failure and carry on with the next segment.
            errors.append({"doc_index": doc_idx, "seg_index": seg_idx, "error": str(exc)})
            # A None placeholder keeps positions aligned with the segment list.
            tree = None
        doc_trees.append(tree)
    parsed_corpus.append(doc_trees)


In [None]:
# Sanity check: the loop above appends one list per item of `corpus`,
# so this should equal len(corpus)
len(parsed_corpus)

## NEXT:
* Walk the RST Trees and Extract the Data (this part is WIP)