# Get the support of relations in train and dev and test data

This notebook counts how often each relation occurs (its support) in the train, dev and test data.

Contents:
- 00) Setup
- 01) Get the file paths
- 02) Data preprocessing function
- 03) Flatten data
- 04) Get support of labels in the flattened data
- 05) Genre document and EDU pair count

## 0. Setup

In [4]:
import os
from nltk.probability import FreqDist
import pandas as pd

## 1. Get the file paths

In [5]:
def list_rsd_file_paths(directory):
    """Return the paths of all ``.rsd`` files located directly in *directory*.

    Every path is built as ``directory + '/' + file name``. If anything goes
    wrong while listing (for example the directory does not exist), the error
    is printed and an empty list is returned instead of raising.
    """
    rsd_paths = []

    try:
        for entry in os.listdir(directory):
            # Only keep entries whose name ends in .rsd
            if entry.endswith('.rsd'):
                rsd_paths.append(directory + '/' + entry)

    # Error handling: report the problem and fall back to an empty result
    except Exception as e:
        print(f'An error occurred: {e}')
        return []

    return rsd_paths

In [6]:
# Collect the .rsd file paths of the train, dev and test folders (in that order)
train_rsd_file_paths, dev_rsd_file_paths, test_rsd_file_paths = (
    list_rsd_file_paths(split_directory)
    for split_directory in (
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/train',
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/dev',
        'C:/Users/marco/OneDrive/24_25_WS/Discourse_modeling_and_processing/disco-project/data/test',
    )
)

## 2. Data preprocessing function

In [7]:
def _coarse_label(relation):
    """Collapse a suffixed relation name (e.g. ``'joint-list_m'``) to its coarse class."""
    fine_label = relation[:-2]  # drop the two-character '_r' / '_m' suffix

    # 'same-unit' is already the coarse class; its hyphen is not a separator
    if fine_label.startswith('same'):
        return fine_label

    # Cut at the last hyphen; a label without any hyphen is already coarse
    coarse, hyphen, _ = fine_label.rpartition('-')
    return coarse if hyphen else fine_label


def rsd_file_paths_to_dict(rsd_file_paths, fine_grained=True):
    """Read ``.rsd`` files and build labelled EDU pairs grouped by cluster, genre and document.

    Each non-blank line of a file is split on tabs. Column 0 is read as the
    integer EDU id, column 1 as the EDU text, column 6 as the integer parent
    id and column 7 as the relation name. A pair is created for every row
    whose relation ends in ``'r'`` or ``'m'`` and whose parent id occurs in
    the same file. The pair is ``[tokens, label]`` where ``tokens`` is
    ``[marker] + child words + ['<sep>'] + parent words + ['<n>']`` and the
    leading marker is ``'<s>'`` for relations ending in ``'r'`` and ``'<n>'``
    for relations ending in ``'m'``.

    Genre and document name are taken from the file name, which is expected
    to look like ``<prefix>_<genre>_<document>.rsd``.

    Args:
        rsd_file_paths: Iterable of file path strings (``'/'`` separated).
        fine_grained: If true, the label is the relation without its
            two-character suffix; otherwise it is additionally cut at the
            last hyphen (except for labels starting with ``'same'``).

    Returns:
        A tuple ``(group_genre_file_dict, labels)``: the nested
        ``cluster -> genre -> document -> list of pairs`` dictionary and the
        set of all labels that were produced.

    Rows with too few columns or with a non-integer id/parent are reported
    and skipped; blank lines are skipped silently.
    """
    group_genre_file_dict = {
        'cluster0': {
            'bio': {},
            'fiction': {}
        },
        'cluster1': {
            'academic': {},
            'interview': {},
            'letter': {},
            'news': {},
            'speech': {},
            'textbook': {},
            'voyage': {}
        },
        'cluster2': {
            'conversation': {},
        },
        'cluster3': {
            'court': {},
            'essay': {},
            'podcast': {},
            'reddit': {},
            'vlog': {}
        },
        'cluster4': {
            'whow': {}
        }
    }

    # Reverse lookup derived from the literal above so the two cannot drift apart
    genre_to_cluster = {
        genre: cluster
        for cluster, genres in group_genre_file_dict.items()
        for genre in genres
    }

    # Marker placed in front of the child EDU, keyed by the relation's last character
    leading_marker = {'r': '<s>', 'm': '<n>'}

    labels = []

    for file_path in rsd_file_paths:

        ids = []
        texts = []
        parents = []
        relations = []

        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:

                # Blank lines carry no EDU
                if not line.strip():
                    continue

                row = line.rstrip('\n').split('\t')

                # Parse the whole row first so the four lists always stay aligned
                try:
                    edu_id = int(row[0])
                    text = row[1]
                    parent = int(row[6])
                    relation = row[7]

                except IndexError:
                    print(f"Skipping row with insufficient columns in file: {file_path}")
                    continue

                except ValueError:
                    print(f"Skipping row with non-integer id or parent in file: {file_path}")
                    continue

                ids.append(edu_id)
                texts.append(text)
                parents.append(parent)
                relations.append(relation)

        # Position of the first row carrying each id (constant-time parent lookup)
        first_position = {}
        for position, edu_id in enumerate(ids):
            first_position.setdefault(edu_id, position)

        edu_pairs = []

        for text, parent, relation in zip(texts, parents, relations):

            # relation[-1:] is '' for an empty relation, which has no marker
            marker = leading_marker.get(relation[-1:])
            if marker is None or parent not in first_position:
                continue

            parent_text = texts[first_position[parent]]
            edu_text = [marker] + text.split(' ') + ['<sep>'] + parent_text.split(' ') + ['<n>']

            label = relation[:-2] if fine_grained else _coarse_label(relation)

            edu_pairs.append([edu_text, label])
            labels.append(label)

        # rfind returns -1 when there is no '/', so this also covers bare file names
        file_name = file_path[file_path.rfind('/') + 1:]
        file_genre = file_name[file_name.find('_') + 1:file_name.rfind('_')]
        file_document = file_name[file_name.rfind('_') + 1:file_name.find('.')]

        cluster = genre_to_cluster.get(file_genre)

        if cluster is None:
            print(f"{file_document} of {file_genre} could not be assigned to the output dictionary!")
        else:
            group_genre_file_dict[cluster][file_genre][file_document] = edu_pairs

    return group_genre_file_dict, set(labels)

In [8]:
# Preprocess train, dev and test (in that order) once per label granularity;
# the returned label sets are not needed here and are discarded via `_`.

# Coarse labels
(
    (train_coarse_labelled_data, _),
    (dev_coarse_labelled_data, _),
    (test_coarse_labelled_data, _),
) = (
    rsd_file_paths_to_dict(split_paths, fine_grained=False)
    for split_paths in (train_rsd_file_paths, dev_rsd_file_paths, test_rsd_file_paths)
)

# Fine grained labels
(
    (train_fine_grained_labelled_data, _),
    (dev_fine_grained_labelled_data, _),
    (test_fine_grained_labelled_data, _),
) = (
    rsd_file_paths_to_dict(split_paths, fine_grained=True)
    for split_paths in (train_rsd_file_paths, dev_rsd_file_paths, test_rsd_file_paths)
)

## 3. Flatten data

In [13]:
def flatten_data(group_genre_file_dict):
    """Flatten the cluster -> genre -> document nesting into one list.

    The entries of every document are concatenated in the iteration order of
    the nested dictionaries, and the resulting flat list is returned.
    """
    return [
        edu_pair
        for genres in group_genre_file_dict.values()
        for documents in genres.values()
        for document_pairs in documents.values()
        for edu_pair in document_pairs
    ]

In [34]:
# Flatten every split into a plain list of [EDU pair, relation] entries

# Coarse labels
(
    train_coarse_labelled_data_flat,
    dev_coarse_labelled_data_flat,
    test_coarse_labelled_data_flat,
) = map(
    flatten_data,
    (train_coarse_labelled_data, dev_coarse_labelled_data, test_coarse_labelled_data),
)

# Fine grained labels
(
    train_fine_grained_labelled_data_flat,
    dev_fine_grained_labelled_data_flat,
    test_fine_grained_labelled_data_flat,
) = map(
    flatten_data,
    (
        train_fine_grained_labelled_data,
        dev_fine_grained_labelled_data,
        test_fine_grained_labelled_data,
    ),
)

## 4. Get support of labels in the flattened data

In [28]:
def get_relation_support(flattened_data, output_directory_file):
    """Count how often each relation occurs, save the counts as CSV and print them.

    Args:
        flattened_data: Iterable of two-element entries whose second element
            is the relation label.
        output_directory_file: Path of the CSV file to write. A missing
            parent directory is created first.

    Returns:
        None. The table (columns ``Relation`` and ``Support``, sorted by
        ``Relation``) is written to the CSV file and printed.
    """
    # Local import keeps this cell self-contained; Counter preserves first-seen
    # order, so the row index of the table matches what a FreqDist would give
    from collections import Counter

    # Create a frequency distribution
    relation_counts = Counter(relation for _, relation in flattened_data)

    # Convert the frequency distribution to a DataFrame
    df = pd.DataFrame(list(relation_counts.items()), columns=['Relation', 'Support'])

    # Sort the DataFrame alphabetically by the 'Relation' column
    df_sorted = df.sort_values(by='Relation')

    # to_csv does not create directories, so make sure the target folder exists
    if isinstance(output_directory_file, (str, os.PathLike)):
        parent_directory = os.path.dirname(os.fspath(output_directory_file))
        if parent_directory:
            os.makedirs(parent_directory, exist_ok=True)

    # Save the sorted DataFrame to a CSV file in the result folder
    df_sorted.to_csv(output_directory_file, index=False)

    print(df_sorted)
    

In [35]:
# Report and save the relation support of every flattened data set.
# Each entry is (heading to print, flattened data, CSV output path);
# coarse labels come first, fine grained labels second.
support_reports = [
    ("Train data and coarse labels:", train_coarse_labelled_data_flat,
     'results/train_coarse_labelled_support.csv'),
    ("Dev data and coarse labels:", dev_coarse_labelled_data_flat,
     'results/dev_coarse_labelled_support.csv'),
    ("Test data and coarse labels:", test_coarse_labelled_data_flat,
     'results/test_coarse_labelled_support.csv'),
    ("Train data and fine grained labels:", train_fine_grained_labelled_data_flat,
     'results/train_fine_grained_labelled_support.csv'),
    ("Dev data and fine grained labels:", dev_fine_grained_labelled_data_flat,
     'results/dev_fine_grained_labelled_support.csv'),
    ("Test data and fine grained labels:", test_fine_grained_labelled_data_flat,
     'results/test_fine_grained_labelled_support.csv'),
]

for heading, flat_data, csv_path in support_reports:
    print(heading)
    get_relation_support(flat_data, csv_path)
    print("")

Train data and coarse labels:
        Relation  Support
5    adversative     1541
13   attribution     1421
8         causal      954
4        context     1966
14   contingency      423
1    elaboration     4378
9     evaluation      904
7    explanation     1361
2          joint     4533
11          mode      407
0   organization     1715
6        purpose      787
10   restatement      756
3      same-unit     1222
12         topic      443

Dev data and coarse labels:
        Relation  Support
5    adversative      272
8    attribution      200
7         causal      171
3        context      270
14   contingency       65
4    elaboration      610
11    evaluation      150
2    explanation      235
1          joint      639
12          mode       67
0   organization      196
6        purpose      115
10   restatement      124
9      same-unit      179
13         topic       70

Test data and coarse labels:
        Relation  Support
10   adversative      220
12   attribution      189
5

## 5. Genre document and EDU pair count

Count the documents and EDU pairs per genre

In [20]:
# Genres covered by the statistics, in the order they appear in the tables
genre_names = (
    'academic', 'bio', 'conversation', 'court', 'essay', 'fiction',
    'interview', 'letter', 'news', 'podcast', 'reddit', 'speech',
    'textbook', 'vlog', 'voyage', 'whow',
)

# One zero-initialised counter pair per genre and split; the key names
# later become the column names of the result table
genre_stats_dict_train = {
    name: {'train: documents #': 0, 'train: relations #': 0} for name in genre_names
}

genre_stats_dict_dev = {
    name: {'dev: documents #': 0, 'dev: relations #': 0} for name in genre_names
}

genre_stats_dict_test = {
    name: {'test: documents #': 0, 'test: relations #': 0} for name in genre_names
}

# Count documents and EDU pairs per genre in the training data
for group, genres in train_coarse_labelled_data.items():
    for genre, documents in genres.items():
        for document, edu_pairs in documents.items():
            genre_stats_dict_train[genre]['train: documents #'] += 1
            genre_stats_dict_train[genre]['train: relations #'] += len(edu_pairs)

# Count documents and EDU pairs per genre in the dev data
for group, genres in dev_coarse_labelled_data.items():
    for genre, documents in genres.items():
        for document, edu_pairs in documents.items():
            genre_stats_dict_dev[genre]['dev: documents #'] += 1
            genre_stats_dict_dev[genre]['dev: relations #'] += len(edu_pairs)

# Count documents and EDU pairs per genre in the test data
for group, genres in test_coarse_labelled_data.items():
    for genre, documents in genres.items():
        for document, edu_pairs in documents.items():
            genre_stats_dict_test[genre]['test: documents #'] += 1
            genre_stats_dict_test[genre]['test: relations #'] += len(edu_pairs)


In [None]:
# Turn each statistics dictionary into a dataframe with one row per genre:
# genres become the index after transposing, then move into a 'genre' column
train_data_df, dev_data_df, test_data_df = (
    pd.DataFrame(genre_stats).T.rename_axis('genre').reset_index()
    for genre_stats in (genre_stats_dict_train, genre_stats_dict_dev, genre_stats_dict_test)
)

In [25]:
# Join the three per-split tables on the genre column (outer joins keep every genre)
temp_df = train_data_df.merge(dev_data_df, on='genre', how='outer')

data_document_relation_df = temp_df.merge(test_data_df, on='genre', how='outer')

# Write the combined table to the results folder
data_document_relation_df.to_csv("results/genre_document_relation_frequencies.csv", index=False)


In [24]:
# Show the merged per-genre document and relation counts as the cell output
data_document_relation_df

Unnamed: 0,genre,train: documents #,train: relations #,dev: documents #,dev: relations #,test: documents #,test: relations #
0,academic,14,1514,2,202,2,247
1,bio,16,1697,2,174,2,180
2,conversation,10,2075,2,430,2,340
3,court,4,656,1,128,1,94
4,essay,3,370,1,171,1,146
5,fiction,15,1969,2,224,2,262
6,interview,15,1998,2,186,2,207
7,letter,4,562,1,109,1,103
8,news,19,1356,2,202,2,198
9,podcast,3,517,1,186,1,109
