# Get descriptive statistics on EDUs with regard to their relation

Capture selected discourse-related information for each EDU, as provided by the rsd file format, and aggregate it per relation label.

Contents:
- 00) Setup
- 01) Get the file paths
- 02) Get data and collect EDU level information
- 03) Aggregate information on relation label level

## 0. Setup

### 0.a Load necessary libraries

In [None]:
import os
from nltk.corpus import stopwords
from collections import defaultdict
import pandas as pd

### 0.b Get stopwords

In [2]:
# English stop words from the NLTK corpus, collected into a set
stop_words = {*stopwords.words('english')}

## 1. Get the file paths

In [3]:
def list_rsd_file_paths(directory):
    """Return the paths of all ``.rsd`` files directly inside ``directory``.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A list of ``'<directory>/<file name>'`` strings sorted by file name,
        or an empty list when the directory cannot be listed.
    """
    try:
        # os.listdir returns the entries in arbitrary order; sort them so that
        # repeated runs process the documents in the same order
        file_names = sorted(os.listdir(directory))

    # Best effort: report a missing or unreadable directory and carry on
    except OSError as e:
        print(f'An error occurred: {e}')
        return []

    # Make sure to only capture .rsd files
    return [
        f'{directory}/{file_name}'
        for file_name in file_names
        if file_name.endswith('.rsd')
    ]


# Collect the rsd files of the train, dev and test folders (in that order)
rsd_file_paths = [
    rsd_file_path
    for split_directory in (
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/train',
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/dev',
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/test',
    )
    for rsd_file_path in list_rsd_file_paths(split_directory)
]

## 2. Get data and collect EDU level information

### 2.a Go through all documents and extract relevant information on EDU level

In [16]:
# Marker types, sentence types and coarse tenses, listed in the order in which
# their values are written into every EDU row
MARKER_TYPES = ('dm', 'graphical', 'lexical', 'morphological', 'numerical',
                'reference', 'semantic', 'syntactic')
SENTENCE_TYPES = ('decl', 'frag', 'ger', 'imp', 'inf', 'intj', 'multiple',
                  'other', 'q', 'sub', 'wh')
COARSE_TENSES = ('None', 'Modal', 'Fut', 'Pres', 'Past')


def _relation_labels(relation):
    """Return ``(coarse_label, fine_grained_label)`` for a relation name.

    The fine grained label is the relation without its last two characters
    (a suffix such as '_r'). The coarse label is the part in front of the last
    hyphen; relations starting with 'same' and relations without a hyphen keep
    the fine grained label as their coarse label.
    """
    fine_grained_label = relation[:-2]
    if relation.startswith('same'):
        return fine_grained_label, fine_grained_label

    coarse_label, hyphen, _ = relation.rpartition('-')
    # Without a hyphen there is nothing to cut off
    return (coarse_label if hyphen else fine_grained_label), fine_grained_label


def _marker_types(marker_field):
    """Return the type (text before the first hyphen) of every listed marker."""
    # An empty field or the '_' placeholder means that the EDU has no markers
    if not marker_field or marker_field.startswith('_'):
        return []
    return [marker.partition('-')[0] for marker in marker_field.split(';')]


def _feature_value(features, key):
    """Return the value of the first ``key=value`` entry, or '' if missing."""
    prefix = key + '='
    for feature in features:
        if feature.startswith(prefix):
            return feature[len(prefix):]
    return ''


def _coarse_tense(edu_tense):
    """Map an EDU tense value onto one of the coarse tense classes."""
    coarse_by_prefix = {'Non': 'None', 'Mod': 'Modal', 'Fut': 'Fut'}
    # Every other tense is reduced to its first four characters ('Pres', 'Past')
    return coarse_by_prefix.get(edu_tense[:3], edu_tense[:4])


def _edu_row(text, additional, relation, marker_field):
    """Build the feature row of a single EDU.

    Layout: coarse label, fine grained label, token count, count and binary
    flag for all markers, count and binary flag per entry of MARKER_TYPES, one
    binary flag per entry of SENTENCE_TYPES and one per entry of COARSE_TENSES.
    An EDU without 'stype' or 'edu_tense' information gets 0 for all flags of
    that group.
    """
    coarse_label, fine_grained_label = _relation_labels(relation)

    # Labels and EDU length (number of space separated tokens)
    row = [coarse_label, fine_grained_label, len(text.split(' '))]

    # Discourse marker information: overall first, then per marker type
    marker_types = _marker_types(marker_field)
    row += [len(marker_types), int(bool(marker_types))]
    for marker_type in MARKER_TYPES:
        type_count = marker_types.count(marker_type)
        row += [type_count, int(type_count > 0)]

    # Additional information is a '|' separated list of key=value entries
    features = additional.split('|')

    # Sentence type information
    stype = _feature_value(features, 'stype')
    row += [int(stype == sentence_type) for sentence_type in SENTENCE_TYPES]

    # EDU tense information
    edu_tense_coarse = _coarse_tense(_feature_value(features, 'edu_tense'))
    row += [int(edu_tense_coarse == tense) for tense in COARSE_TENSES]

    return row


edu_level_list = []

for file_path in rsd_file_paths:

    # One (id, text, additional, parent, relation, markers) tuple per EDU
    edus = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:

            # Blank lines carry no EDU
            if not line.strip():
                continue

            row = line.rstrip('\n').split('\t')

            # Columns used: 0 id, 1 text, 5 additional information,
            # 6 parent id, 7 relation, 9 markers
            try:
                edus.append((int(row[0]), row[1], row[5], int(row[6]), row[7], row[9]))

            except IndexError:
                print(f"Skipping row with insufficient columns in file: {file_path}")

    # Set of all EDU ids of this document for a fast parent lookup
    edu_ids = {edu[0] for edu in edus}

    for _, text, additional, parent, relation, marker_field in edus:

        # Ignore the 'ROOT' case (the parent is not an EDU of the document)
        if parent in edu_ids:
            edu_level_list.append(_edu_row(text, additional, relation, marker_field))


In [18]:
# Column names, one per value of a row in edu_level_list
edu_level_columns = [
    # Relation labels and EDU length
    "coarse_label", "fine_grained_label", "token_count",
    # Discourse markers: count and binary flag, overall and per marker type
    "markers_all_count", "marked_all",
    "markers_dm_count", "marked_dm",
    "markers_graphical_count", "marked_graphical",
    "markers_lexical_count", "marked_lexical",
    "markers_morphological_count", "marked_morphological",
    "markers_numerical_count", "marked_numerical",
    "markers_reference_count", "marked_reference",
    "markers_semantic_count", "marked_semantic",
    "markers_syntactic_count", "marked_syntactic",
    # Sentence type flags
    "stype_decl", "stype_frag", "stype_ger", "stype_imp", "stype_inf",
    "stype_intj", "stype_multiple", "stype_other", "stype_q", "stype_sub",
    "stype_wh",
    # Coarse tense flags
    "tense_none", "tense_modal", "tense_fut", "tense_pres", "tense_past",
]

edu_level_df = pd.DataFrame(data=edu_level_list, columns=edu_level_columns)

edu_level_df

Unnamed: 0,coarse_label,fine_grained_label,token_count,markers_all_count,marked_all,markers_dm_count,marked_dm,markers_graphical_count,marked_graphical,markers_lexical_count,...,stype_multiple,stype_other,stype_q,stype_sub,stype_wh,tense_none,tense_modal,tense_fut,tense_pres,tense_past
0,organization,organization-heading,6,3,1,0,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
1,elaboration,elaboration-additional,5,1,1,0,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
2,attribution,attribution-positive,11,2,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,joint,joint-list,8,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,joint,joint-list,8,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29470,explanation,explanation-evidence,3,1,1,0,0,1,1,0,...,1,0,0,0,0,1,0,0,0,0
29471,elaboration,elaboration-additional,12,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
29472,elaboration,elaboration-additional,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
29473,explanation,explanation-justify,13,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## 3. Aggregate information on relation label level

In [23]:
def _aggregate_by_label(edu_df, label_column, agg_spec):
    """Aggregate the EDU level data frame per value of ``label_column``.

    Args:
        edu_df: Data frame with one row per EDU.
        label_column: Name of the label column to group by.
        agg_spec: Dictionary mapping column names to aggregation functions.

    Returns:
        A data frame with one row per label. Aggregated columns whose name
        ends in 'count' get the suffix '_AVG', all others the suffix '_PERC'.
        The label is a regular column.
    """
    aggregated = edu_df.groupby(label_column).agg(agg_spec)

    # Rename columns to include the suffix "_AVG" or "_PERC"
    aggregated.columns = [
        col + ('_AVG' if col.endswith('count') else '_PERC')
        for col in aggregated.columns
    ]

    # Reset index to have the label as a column
    return aggregated.reset_index()


# Create a dictionary for aggregation and disregard the labels
agg_dict = {col: 'mean' for col in edu_level_df.columns if col not in ['coarse_label', 'fine_grained_label']}

# Group by labels and aggregate with mean
coarse_label_df = _aggregate_by_label(edu_level_df, 'coarse_label', agg_dict)
fine_grained_label_df = _aggregate_by_label(edu_level_df, 'fine_grained_label', agg_dict)

# Save it (to_csv does not create missing folders, so make sure it exists)
os.makedirs('results', exist_ok=True)
coarse_label_df.to_csv('results/coarse_label_df.csv', index=False)
fine_grained_label_df.to_csv('results/fine_grained_label_df.csv', index=False)

fine_grained_label_df


Unnamed: 0,fine_grained_label,token_count_AVG,markers_all_count_AVG,marked_all_PERC,markers_dm_count_AVG,marked_dm_PERC,markers_graphical_count_AVG,marked_graphical_PERC,markers_lexical_count_AVG,marked_lexical_PERC,...,stype_multiple_PERC,stype_other_PERC,stype_q_PERC,stype_sub_PERC,stype_wh_PERC,tense_none_PERC,tense_modal_PERC,tense_fut_PERC,tense_pres_PERC,tense_past_PERC
0,adversative-antithesis,8.005906,0.598425,0.507874,0.444882,0.427165,0.0,0.0,0.017717,0.017717,...,0.061024,0.015748,0.03937,0.080709,0.015748,0.490157,0.080709,0.023622,0.287402,0.11811
1,adversative-concession,8.282292,0.944792,0.8,0.73125,0.723958,0.0,0.0,0.145833,0.138542,...,0.110417,0.01875,0.00625,0.065625,0.005208,0.375,0.09375,0.021875,0.378125,0.13125
2,adversative-contrast,8.918584,0.784071,0.651327,0.60708,0.584071,0.0,0.0,0.010619,0.010619,...,0.069027,0.012389,0.00708,0.063717,0.00354,0.343363,0.084956,0.028319,0.376991,0.166372
3,attribution-negative,4.941748,4.223301,1.0,0.0,0.0,0.029126,0.029126,0.990291,0.951456,...,0.048544,0.009709,0.038835,0.038835,0.0,0.747573,0.07767,0.0,0.145631,0.029126
4,attribution-positive,4.32396,2.521383,1.0,0.003515,0.003515,0.16403,0.162273,0.957821,0.87522,...,0.048623,0.016989,0.015231,0.050381,0.011716,0.263035,0.03925,0.015817,0.394259,0.287639
5,causal-cause,8.685915,0.640845,0.61831,0.608451,0.592958,0.0,0.0,0.032394,0.032394,...,0.060563,0.023944,0.005634,0.038028,0.0,0.346479,0.033803,0.004225,0.352113,0.26338
6,causal-result,8.667286,0.663569,0.587361,0.468401,0.423792,0.0,0.0,0.04461,0.042751,...,0.048327,0.011152,0.005576,0.055762,0.0,0.328996,0.065056,0.042751,0.295539,0.267658
7,context-background,9.799259,1.165926,0.726667,0.016296,0.016296,0.068148,0.068148,0.06963,0.066667,...,0.034815,0.00963,0.005185,0.024444,0.001481,0.32,0.032593,0.01037,0.387407,0.24963
8,context-circumstance,7.26661,1.026405,0.844123,0.655026,0.650767,0.0,0.0,0.224872,0.190801,...,0.034072,0.021295,0.011925,0.039182,0.010221,0.425043,0.012777,0.004259,0.297274,0.260647
9,contingency-condition,8.02518,1.136691,0.96223,0.938849,0.926259,0.0,0.0,0.023381,0.023381,...,0.086331,0.030576,0.019784,0.217626,0.032374,0.273381,0.039568,0.0,0.593525,0.093525
