# Clustering the genres into groups

This file explores different metrics on the genres of GUM and performs a cluster analysis on them to produce homogeneous groups according to the distribution of these attributes.

#### 00) Necessary libraries

In [1]:
import os
import csv

#### 01) Get the file paths

In [4]:
def list_rsd_file_paths(directory):
    """Return the paths of all ``.rsd`` files directly inside *directory*.

    Each path is built as ``directory + '/' + file name``; the order is
    whatever ``os.listdir`` yields. Any error (for example a missing
    directory) is printed and an empty list is returned instead of raising.
    """
    try:
        # Keep only the .rsd entries and prefix them with the directory
        return [
            '/'.join((directory, entry))
            for entry in os.listdir(directory)
            if entry.endswith('.rsd')
        ]
    except Exception as e:
        # Best effort: report the problem and hand back nothing
        print(f"An error occurred: {e}")
        return []


# Get all rsd files in the all (train+dev+test) file folder
# NOTE(review): this is an absolute path on one specific machine; it must be
# adapted before the notebook can run anywhere else. If the directory cannot
# be listed, list_rsd_file_paths prints the error and returns an empty list.
rsd_file_paths = list_rsd_file_paths("C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/all")

#### 02.a) Load data into genre-file dictionary

In [19]:
# Genres that are always present as top-level keys of the result
_GENRES = (
    'academic', 'bio', 'conversation', 'court', 'essay', 'fiction',
    'interview', 'letter', 'news', 'podcast', 'reddit', 'speech',
    'textbook', 'vlog', 'whow', 'voyage',
)

# Marker categories, in the order their counters are stored in an entry (slots 6-13)
_MARKER_CATEGORIES = (
    'dm', 'graphical', 'lexical', 'morphological',
    'numerical', 'reference', 'semantic', 'syntactic',
)


def _read_rsd_rows(file_path):
    """Return the usable rows of one .rsd file as tuples.

    Each tuple is (id, text, syntactic, parent, relation, markers), taken from
    columns 0, 1, 5, 6, 7 and 9. Rows that are too short or whose id/parent is
    not an integer are reported and skipped.
    """
    rows = []
    # QUOTE_NONE: the fields are read as raw tab-separated text, so an EDU that
    # starts with a double quote is not mistaken for a quoted CSV field (which
    # would swallow the following tabs and line breaks).
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            try:
                parsed = (int(row[0]), row[1], row[5], int(row[6]), row[7], row[9])
            except IndexError:
                print(f"Skipping row with insufficient columns in file: {file_path}")
                continue
            except ValueError:
                print(f"Skipping row with non-numeric id or parent in file: {file_path}")
                continue
            rows.append(parsed)
    return rows


def _split_relation(relation):
    """Return (fine_label, coarse_label, suffix) for a raw relation name.

    suffix is 'r' or 'm' when the name ends in '_r' / '_m', otherwise None and
    the name is kept whole (e.g. 'ROOT'). The coarse label is the fine label up
    to its last '-', except for labels starting with 'same' (e.g. 'same-unit')
    and labels without a dash, which are kept unchanged.
    """
    if relation.endswith(('_r', '_m')):
        fine_label, suffix = relation[:-2], relation[-1]
    else:
        fine_label, suffix = relation, None

    if fine_label.startswith('same') or '-' not in fine_label:
        coarse_label = fine_label
    else:
        coarse_label = fine_label[:fine_label.rfind('-')]

    return fine_label, coarse_label, suffix


def _genre_and_document(file_path):
    """Extract (genre, document) from a name shaped like 'GUM_<genre>_<document>.rsd'.

    Works for plain file names and for paths; both '/' and '\\' are accepted
    as directory separators.
    """
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]
    stem = file_name.split('.', 1)[0]
    genre = stem[stem.find('_') + 1:stem.rfind('_')]
    document = stem[stem.rfind('_') + 1:]
    return genre, document


def rsd_file_paths_to_dict(rsd_file_paths):
    """Load .rsd files into a nested ``genre -> document -> [EDU entries]`` dict.

    Parameters:
        rsd_file_paths: iterable of paths (or plain file names) of .rsd files
            named like ``GUM_<genre>_<document>.rsd``.

    Returns:
        A dict whose first-level keys are genres (the 16 predefined genres are
        always present; any other genre found in a file name is added) and
        whose second-level keys are document names. Each document maps to a
        list with one entry per usable row of the file. An entry is a list:

         0: token list of the EDU pair: start marker, EDU tokens, '<sep>',
            parent EDU tokens, '<n>'. The start marker is '<s>' for relations
            ending in '_r' and '<n>' for relations ending in '_m' (presumably
            satellite vs. nucleus - confirm). It is an empty list when the row
            has no such suffix (e.g. the root) or its parent id is not in the
            file, i.e. when there is no pair to build.
         1: text of the single EDU
         2: fine grained label
         3: coarse label
         4: list of markers
         5: total markers counter
         6: dm markers counter
         7: graphical markers counter
         8: lexical markers counter
         9: morphological markers counter
        10: numerical markers counter
        11: reference markers counter
        12: semantic markers counter
        13: syntactic markers counter
        14: raw syntactic information (column 5 of the file)
    """
    # Dictionary structure:
    # Highest layer key = genre
    # Second layer key = document
    genre_document_dict = {genre: {} for genre in _GENRES}

    for file_path in rsd_file_paths:
        rows = _read_rsd_rows(file_path)

        # Map each id to the position of its first occurrence so that parent
        # lookups do not rescan the whole list for every EDU
        position_by_id = {}
        for position, row in enumerate(rows):
            position_by_id.setdefault(row[0], position)

        edu_pairs = []
        for _edu_id, text, syntactic, parent, relation, marker_field in rows:
            fine_grained_label, coarse_label, suffix = _split_relation(relation)

            # 0: tokenized text of EDU pair with start, separation and end marker
            parent_position = position_by_id.get(parent)
            if suffix is None or parent_position is None:
                # No partner EDU: never reuse the pair of a previous row
                edu_pair_text = []
            else:
                start_marker = '<s>' if suffix == 'r' else '<n>'
                parent_text = rows[parent_position][1]
                edu_pair_text = (
                    [start_marker] + text.split(' ')
                    + ['<sep>'] + parent_text.split(' ') + ['<n>']
                )

            # 4-13: discourse marker information ('_' or an empty field = no markers)
            edu_markers = [] if marker_field in ('', '_') else marker_field.split(';')
            marker_categories = [marker.partition('-')[0] for marker in edu_markers]
            category_counts = [
                marker_categories.count(category) for category in _MARKER_CATEGORIES
            ]

            edu_pairs.append([
                edu_pair_text,
                text,
                fine_grained_label,
                coarse_label,
                edu_markers,
                len(edu_markers),
                *category_counts,
                syntactic,
            ])

        file_genre, file_document = _genre_and_document(file_path)
        # setdefault: a genre outside the predefined list gets its own key
        # instead of aborting the whole load with a KeyError
        genre_document_dict.setdefault(file_genre, {})[file_document] = edu_pairs

    return genre_document_dict

# Parse every collected .rsd file into the nested genre -> document -> EDU-entry dictionary
genre_document_dict = rsd_file_paths_to_dict(rsd_file_paths)

Skipping row with insufficient columns in file: C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/all/GUM_bio_hadid.rsd


### 02.b) Compare fine grained and coarse labels

In [21]:
# Print the fine grained label (slot 2) followed by the coarse label (slot 3)
# of every EDU entry, across all genres and documents.
for documents_of_genre in genre_document_dict.values():
    for entries_of_document in documents_of_genre.values():
        for edu_pair in entries_of_document:
            #if edu_pair[1].find('restatement') == 0:
            print('---', edu_pair[2], edu_pair[3], sep='\n')

---
organization-heading
organization
---
elaboration-additional
elaboration
---
attribution-positive
attribution
---
joint-list
joint
---
joint-list
joint
---
joint-list
joint
---
organization-preparation
organization
---
joint-list
joint
---
joint-list
joint
---
explanation-justify
explanation
---
elaboration-attribute
elaboration
---
context-background
context
---
elaboration-attribute
elaboration
---
same-unit
same-unit
---
elaboration-attribute
elaboration
---
same-unit
same-unit
---
context-background
context
---
elaboration-attribute
elaboration
---
elaboration-additional
elaboration
---
elaboration-attribute
elaboration
---
adversative-concession
adversative
---
adversative-antithesis
adversative
---
context-background
context
---
elaboration-attribute
elaboration
---
same-unit
same-unit
---
elaboration-attribute
elaboration
---
purpose-goal
purpose
---
elaboration-additional
elaboration
---
elaboration-additional
elaboration
---
purpose-goal
purpose
---
mode-means
mode
---
sam

### 02.c) Explore discourse marker information

In [23]:
# Print the marker information of every EDU entry of the 'essay' genre.
# Slots read from each entry:
#   4: list of markers
#   5: total markers counter
#   6: dm markers counter
#   7: graphical markers counter
#   8: lexical markers counter
#   9: morphological markers counter
#   10: numerical markers counter
#   11: reference markers counter
#   12: semantic markers counter
#   13: syntactic markers counter
for essay_entries in genre_document_dict['essay'].values():
    for edu_pair in essay_entries:
        report_lines = [
            "---",
            f"Total markers: {edu_pair[4]}",
            f"Total markers (#): {edu_pair[5]}",
            f"DM markers (#): {edu_pair[6]}",
            f"Graphical markers (#): {edu_pair[7]}",
            f"Lexical markers (#): {edu_pair[8]}",
            f"Morphological markers (#): {edu_pair[9]}",
            f"Numerical markers (#): {edu_pair[10]}",
            f"Reference markers (#): {edu_pair[11]}",
            f"Semantic markers (#): {edu_pair[12]}",
            f"Syntactic markers (#): {edu_pair[13]}",
        ]
        print("\n".join(report_lines))

---
Total markers: ['graphical-layout-']
Total markers (#): 1
DM markers (#): 0
Graphical markers (#): 1
Lexical markers (#): 0
Morphological markers (#): 0
Numerical markers (#): 0
Reference markers (#): 0
Semantic markers (#): 0
Syntactic markers (#): 0
---
Total markers: ['semantic-attribution_source-8-9']
Total markers (#): 1
DM markers (#): 0
Graphical markers (#): 0
Lexical markers (#): 0
Morphological markers (#): 0
Numerical markers (#): 0
Reference markers (#): 0
Semantic markers (#): 1
Syntactic markers (#): 0
---
Total markers: []
Total markers (#): 0
DM markers (#): 0
Graphical markers (#): 0
Lexical markers (#): 0
Morphological markers (#): 0
Numerical markers (#): 0
Reference markers (#): 0
Semantic markers (#): 0
Syntactic markers (#): 0
---
Total markers: ['graphical-layout-']
Total markers (#): 1
DM markers (#): 0
Graphical markers (#): 1
Lexical markers (#): 0
Morphological markers (#): 0
Numerical markers (#): 0
Reference markers (#): 0
Semantic markers (#): 0
Syntac

In [19]:
# How did we identify the marker types

#marker_types = []
#for genre in genre_document_dict.keys():
#    for document in genre_document_dict[genre].keys():
#        for edu_pair in genre_document_dict[genre][document]:
#            markers = edu_pair[4].split(";") if edu_pair[4].find('_') else []
#            for marker in markers:
#                marker_types.append(marker[:marker.find('-')])

#set(marker_types)

{'dm',
 'graphical',
 'lexical',
 'morphological',
 'numerical',
 'reference',
 'semantic',
 'syntactic'}

### 02.d) Explore syntactic information

In [27]:
# How did we identify the syntactic information types
#
# Prints the raw syntactic information (slot 14) of every EDU entry.
# NOTE: this prints one record per EDU of the whole corpus; in the recorded
# run the notebook stopped showing output with "IOPub data rate exceeded".
# TODO: syntactic_information_types is initialised but never filled here.

syntactic_information_types = []
for documents_of_genre in genre_document_dict.values():
    for entries_of_document in documents_of_genre.values():
        for edu_pair in entries_of_document:
            print("---", edu_pair[14], sep="\n")
            #syntactic_information = edu_pair[14].split("|") if edu_pair[14].find('_') else []


#set(marker_types)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



### 03) Preprocessing for clustering

Ideas for features to cluster genres into groups:
(All features calculated per document and then averaged/aggregated to their genre)

- Average EDU pair token count
- Average single EDU token count
- Percentage of tokens being stop words
- Type-token ratio (number of distinct token types / token count)
- Percentage of relations being of satellite-nucleus structure
- Percentage of relations being discourse marked (binary: whether there is at least one dm marker)
