## Import the Segmented Corpus

In [None]:
# If these packages are not installed:

# ! pip install git+https://github.com/iinemo/isanlp.git
# ! pip install isanlp_rst
# ! pip install hf_xet

In [None]:
# === Import
# import pandas as pd
import sys
import json
from pathlib import Path
# from collections import Counter

import warnings
warnings.filterwarnings("ignore", message="`encoder_attention_mask` is deprecated")

# === Make the auxiliary modules importable ===
# ROOT is the parent of the current working directory
# (presumably the notebook lives one level below the project root -- confirm)
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src").resolve()

# Prepend the source folder only when it is absent, so that re-running the cell
# does not add duplicate entries to sys.path
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))

# === import the module for rst work === <--- this is not ready yet
import importlib
import discourse.rst as rst

In [None]:
# === Define the paths to the data and load the corpora and the diagnoses ===
HOME = Path.home()
# strict=True makes a missing data folder fail right here instead of at the first open()
DATA_DIR = (HOME / "My Drive" / "_VectorData" / "projects" / "identifying_depression_with_rst" / "data").resolve(strict=True)

corpus_path = DATA_DIR / "interim"
# NOTE(review): the file name is spelled with a triple "s"; presumably this matches the file on disk -- confirm
corpus_file = corpus_path / "preprocesssed_corpora.json"

# The encoding is passed explicitly: without it open() falls back to the locale's
# preferred encoding, which is not UTF-8 on every platform and would then fail on
# (or garble) non-ASCII text.
# NOTE(review): assumes the files were written as UTF-8 -- confirm against the preprocessing step
with open(corpus_file, "r", encoding="utf-8") as file:
    corpora = json.load(file)

diagnoses_path = DATA_DIR / "interim"
diagnoses_file = diagnoses_path / "all_diagnoses.json"

with open(diagnoses_file, "r", encoding="utf-8") as file:
    diagnoses = json.load(file)

In [None]:
# === This is just in case for possible debugging (with verbose output) ===
# import transformers
# transformers.utils.logging.set_verbosity_info()
# transformers.utils.logging.enable_explicit_format()

## Visual Inspection

## If the documents have only been preprocessed

In [None]:
# === A note on the structure of the resulting segmented corpora: ===
# Each separate corpus in the corpora is the value for the key indicating the name of this specific corpus (like 'ked' in this case)

corpora.keys()

In [None]:
# The corpus then is a list where each document is its item
# This gets us the first document
corpora["ked"][0]

## If the documents have also been segmented

In [None]:
# === Further down the tree, the structure is as follows: ===
# The value of the key is a list of 2 items
# Where each item is also a list
# The first list is the original text, either as a single list item if it has not been split
# Or as several items, which are the resulting chunks of the splitting pipeline upstream
# The second list is made up of the sentences returned by the sentence tokenizer as list items

# So, pulling the text (or the resulting chunks) for the "ked" corpus looks something like this:

corpora["ked"][0][0]

In [None]:
len(corpora["ked"])

## Prep Everything for Running the RST Parser

In [None]:
# In case we need to reload the module
rst = importlib.reload(rst)

In [None]:
# === Initialize the Parser
# model = 'tchewik/isanlp_rst_v3'
# version = 'rstreebank'  # Choose from {'gumrrg', 'rstdt', 'rstreebank'}

rst.init_parser()

In [None]:
# Put the target corpus as texts/chunks into a separate variable for easier navigation/iteration logic downstream

CORPUS_NAME = "ked"
corpus = corpora[CORPUS_NAME]

# corpus = [item[0] for item in corpora[CORPUS_NAME]] # if the texts have been segmented, grab only the texts/segments, not the texts as sentences

In [None]:
# Double-check the structure of the corpus is what the parser would expect

corpus[:5]

## Test-parsing one selected corpus

In [None]:
# The code will accept a corpus of any (reasonable) structure
# The default structure is ['doc_1', 'doc_2', 'doc_3', ... ]
# There is a safeguard to normalize each item in corpus to a list[str],
# So the above will be converted into [['doc_1'], ['doc_2'], ['doc_3'], ... ]

# Hence, the code also works with a corpus as a list of items, where each item is also a list
# containing the whole document as one list item or the document in two or more chunks (as list items)
# e.g. [['doc_1'], ['seg1_of_doc_2', 'seg2_of_doc_2'], ['doc_3'] ... ]

# Or a mixed structure like that:
# ["doc_1", ["seg1_of_doc_2", "seg2_of_doc_2"], "doc_3", ... ]

parsed_corpus = rst.parse_corpus(corpus)

## A Bit of Visual Inspection

In [None]:
# === See if any errors have been logged
parsed_corpus[1]

In [None]:
# Get rid of the errors log and keep the parsed corpus only
# NOTE: this rebinds `parsed_corpus` to its own first element, so the cell is not
# safe to run twice -- a second run would index into the already unwrapped corpus.
# Re-run the parsing cell first if this one has to be repeated.
parsed_corpus = parsed_corpus[0]

# Check the number of items in the corpus
# Should match the number of documents (segmented or otherwise) in the initial corpus
len(parsed_corpus)

In [None]:
# === What does the parser return and how is it structured in the output here

# First, each item is a dictionary returned by the parser, wrapped into a list:
# This is a legacy feature so that the marginal cases of segmented documents could also be handled
parsed_corpus[:5]

In [None]:
# Further down, each such dictionary has the key "rst", which stores the results of parsing as a list of one item
# This item is the RST object/tree proper

parsed_corpus[0][0]

In [None]:
# It can be explored using the 'vars' function:

vars(parsed_corpus[0][0]["rst"][0])

## Extract the Features

In [None]:
# Run the main function to extract all the RST features

rst_res = rst.extract_all_rst_features(parsed_corpus)

In [None]:
# The function returns a tuple where the first item is the list where each item is a dict of RST features (one for each text)

all_features = rst_res[0]
len(all_features)

In [None]:
# The second item in the tuple is a list (a set) of all the relations that the parser identified in the corpus
all_realtions = rst_res[1]
len(all_realtions)

In [None]:
# Visual inspection
all_realtions

In [None]:
all_features[:2]

## Follow-up (in the Notebook for Now, in the Module Later)

### (Explicitly) Transform the Diagnosis into Labels

In [None]:
# The diagnoses database is structured similarly to the corpora:
# it is a dictionary keyed by the names of the corpora, and the diagnoses are lists of string items

# Iterate over the database and keep only the unique labels used in each corpus.
# dict.fromkeys() de-duplicates while keeping the order of first occurrence, so the
# result looks the same on every run (the iteration order of a set of strings
# can change between interpreter runs).
labels = {name: list(dict.fromkeys(values)) for name, values in diagnoses.items()}

In [None]:
labels

In [None]:
# Set up the mapping for encoding the labels:
# 1 for positive diagnoses (like anxiety, depression), 0 for negative ones

POSITIVE_LABEL = 1
NEGATIVE_LABEL = 0

map_ = {
    "высокая депрессивность": POSITIVE_LABEL,
    "нет депрессивности": NEGATIVE_LABEL,
    "низкая депрессивность": NEGATIVE_LABEL,
    "здоровые": NEGATIVE_LABEL,
    "депрессия": POSITIVE_LABEL,
    "высокая тревожность": POSITIVE_LABEL,
    "нет тревожности": NEGATIVE_LABEL,
    "низкая тревожность": NEGATIVE_LABEL,
}

In [None]:
# Make a y set with all the encoded labels for the corpora:
# one list of codes per corpus, in the same order as the diagnoses.
# A diagnosis that is missing from the mapping is encoded as None.

y_all = {
    corpus_name: [map_.get(diagnosis) for diagnosis in corpus_diagnoses]
    for corpus_name, corpus_diagnoses in diagnoses.items()
}

In [None]:
# Double check the mapping is correct

y_all["kldd"]

In [None]:
# Must be int

type(y_all["kldd"][0])

In [None]:
# Double check the mapping is correct

diagnoses["kldd"]

## Visual Inspection of the Most Obvious Differences in Terms of Relation Counts and Proportions

In [None]:
# Split the feature dicts of the selected corpus into a positive and a negative
# subset according to the encoded labels.

corpus_labels = y_all[CORPUS_NAME]

# Labels and features are paired by position, so both lists must have one entry
# per document. Without this check a longer feature list would be truncated
# silently and a shorter one would end in a bare IndexError.
if len(corpus_labels) != len(all_features):
    raise ValueError(
        f"Expected one label per document for corpus {CORPUS_NAME!r}, "
        f"got {len(corpus_labels)} labels and {len(all_features)} feature sets"
    )

all_features_pos = []
all_features_neg = []

for idx, (val, features) in enumerate(zip(corpus_labels, all_features)):
    if val == 0:
        all_features_neg.append(features)
    elif val == 1:
        all_features_pos.append(features)
    else:
        # Report which document is affected so it can be traced back to its diagnosis
        print(
            "Something is wrong, the document/features did not match any label "
            f"(index {idx}, label {val!r})"
        )

In [None]:
all_features_neg[:2]

In [None]:
relations_pos = rst.count_relations(all_features_pos)
relations_neg = rst.count_relations(all_features_neg)

In [None]:
relations_pos[1]

In [None]:
relations_neg[1]

## Get all the values for a certain relation type in a (sub)corpus

In [None]:
# Collect the 'relation_proportions' value of the "causal" relation for every
# document, separately for the positive and the negative subset; a document
# without an entry for that relation contributes 0.0
causal_rel_props_pos, causal_rel_props_neg = (
    [doc["relation_proportions"].get("causal", 0.0) for doc in subset]
    for subset in (all_features_pos, all_features_neg)
)