# Data Processing and Analysis 
### The first part of this notebook comes from QMSum (Zhong et al. 2021)
### The Data Analysis and Statistics section was created for the purpose of this dissertation

In [None]:
import builtins
import json

# read the dataset
# please enter the path of your data
def read_data(this_path, this_split):
    """Load one split of the dataset from a JSON Lines file.

    Parameters
    ----------
    this_path : str
        Prefix of the data files. It is concatenated directly with the split
        name, so it must already end with a path separator.
    this_split : str
        Name of the split; the file read is
        ``this_path + this_split + '.jsonl'``.

    Returns
    -------
    list
        One decoded JSON object per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data_path = this_path + this_split + '.jsonl'
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, which differs between operating systems.
    with builtins.open(data_path, encoding='utf-8') as f:
        # Blank lines carry no record; skipping them avoids a decode error.
        data = [json.loads(line) for line in f if line.strip()]
    print('Total {} meetings in the {} set.'.format(len(data), this_split))
    return data

# prefix of the jsonl files; read_data() appends the split name and '.jsonl' to it
path = "../Data/QMSum/data/ALL/jsonl/"
# split to load; the same name is reused further down when naming the output files
split = 'train'
# list with one decoded record per line of the split file
data = read_data(path, split)

In [None]:
#%pip install nltk
from nltk import word_tokenize
# tokenize a sentence
def tokenize(sent):
    """Lower-case ``sent``, tokenize it with NLTK and re-join the tokens.

    The input is handed to ``word_tokenize`` with ``preserve_line=True`` and
    the resulting tokens are returned as one string separated by single
    spaces.
    """
    lowered = sent.lower()
    pieces = word_tokenize(lowered, preserve_line=True)
    return ' '.join(pieces)

In [None]:
# filter some noises caused by speech recognition
def clean_data(text):
    """Remove transcription-noise markers and collapse spelled-out acronyms.

    Each (pattern, replacement) pair below is applied to ``text`` in order
    with ``str.replace``; the cleaned string is returned.
    """
    substitutions = (
        ('{ vocalsound } ', ''),
        ('{ disfmarker } ', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause } ', ''),
        ('{ nonvocalsound } ', ''),
        ('{ gap } ', ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement)
    return cleaned

In [None]:
# From QMSum

# process data for BART
# the input of the model here is the entire content of the meeting
bart_data = []
for meeting in data:
    # render every turn as "<speaker>: <tokenized content>" and join the turns
    transcript = ' '.join(
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    )
    # general queries are emitted before the specific ones for each meeting
    for list_key in ('general_query_list', 'specific_query_list'):
        for item in meeting[list_key]:
            question = tokenize(item['query'])
            bart_data.append({
                'src': clean_data('<s> ' + question + ' </s> ' + transcript + ' </s>'),
                'tgt': tokenize(item['answer']),
            })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data), split))
# show one example pair as a sanity check
print(bart_data[2])
# one JSON object per line
with open('data/bart_' + split + '.jsonl', 'w') as out_file:
    for pair in bart_data:
        out_file.write(json.dumps(pair) + '\n')

In [None]:
# From QMSum

# process data for BART
# the input of the model here is the gold span corresponding to each query
bart_data_gold = []
for meeting in data:
    # Render each turn once as "<speaker>: <tokenized content>". The rendered
    # turns are reused for the full transcript and for every gold span, so no
    # turn is tokenized again for each query that references it.
    turns = [
        turn['speaker'].lower() + ': ' + tokenize(turn['content'])
        for turn in meeting['meeting_transcripts']
    ]
    entire_src = ' '.join(turns)

    # general queries: the source is the entire meeting
    for item in meeting['general_query_list']:
        query = tokenize(item['query'])
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + entire_src + ' </s>'),
            'tgt': tokenize(item['answer']),
        })

    # specific queries: the source is the content of the gold spans only
    for item in meeting['specific_query_list']:
        query = tokenize(item['query'])
        gold_turns = []
        for span in item['relevant_text_span']:
            assert len(span) == 2
            st, ed = int(span[0]), int(span[1])
            # both span ends are inclusive turn indices
            gold_turns.extend(turns[k] for k in range(st, ed + 1))
        src = ' '.join(gold_turns)
        bart_data_gold.append({
            'src': clean_data('<s> ' + query + ' </s> ' + src + ' </s>'),
            'tgt': tokenize(item['answer']),
        })

print('Total {} query-summary pairs in the {} set'.format(len(bart_data_gold), split))
# show one example pair as a sanity check
print(bart_data_gold[2])
# NOTE(review): this produces e.g. 'bart_train._gold.jsonl'; the '.' before
# '_gold' looks like a typo, but the name is kept unchanged because code that
# reads this file is not visible here. Confirm before renaming.
with open('data/bart_' + split + '._gold.jsonl', 'w') as f:
    for pair in bart_data_gold:
        print(json.dumps(pair), file=f)

## Data Analysis and Statistics

In [None]:
# analyse data 

#useful imports 
import numpy as np
import os
import nltk
import pandas as pd

# function for analysis of a single meeting transcript
def analyse_meeting(data):
    """Compute basic statistics for a single meeting transcript.

    Parameters
    ----------
    data : dict
        Meeting record with a 'meeting_transcripts' list whose items provide
        'speaker' and 'content' strings.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'n_speakers', 'n_turns',
        'avg_turn_len', 'n_tokens' and 'n_types'. 'avg_turn_len' is NaN when
        the meeting has no turns.
    """
    turns = data['meeting_transcripts']
    n_turns = len(turns)
    speakers = set()
    turn_len = []
    all_content_tokens = []
    for turn in turns:
        speakers.add(turn['speaker'].lower())
        # tokenize() lower-cases and returns one space-separated string
        tokenized_text = tokenize(turn['content'])
        # turn length is measured on the uncleaned tokens
        turn_len.append(len(nltk.word_tokenize(tokenized_text)))
        # clean_data() uses str.replace, so it must run on the string form;
        # passing it the token list raised AttributeError
        all_content_tokens.extend(nltk.word_tokenize(clean_data(tokenized_text)))
    n_speakers = len(speakers)
    # np.mean([]) emits a RuntimeWarning; return NaN directly for empty meetings
    avg_turn_len = np.mean(turn_len) if turn_len else float('nan')
    n_tokens = len(all_content_tokens)
    n_types = len(set(all_content_tokens))
    return pd.DataFrame.from_dict({'n_speakers':[n_speakers], 'n_turns':[n_turns], 'avg_turn_len':[avg_turn_len], 
                                    'n_tokens':[n_tokens], 'n_types':[n_types]})




# analyse meeting in a directory (Academic, Committee, Product)
def analyse_meeting_type(type, directory=None):
    """Print per-meeting transcript statistics averaged over one collection.

    Parameters
    ----------
    type : str
        Name of the meeting collection (e.g. 'Academic', 'Committee',
        'Product', 'AVENI'); printed as the heading and used to derive the
        default directory.
    directory : str, optional
        Directory holding one JSON file per meeting. Defaults to
        '../Data/Aveni/all/' for type 'AVENI' and to
        '../Data/QMSum/data/<type>/all/' otherwise.

    Returns
    -------
    None
        The results are printed.
    """
    columns = ['n_speakers', 'n_turns', 'avg_turn_len', 'n_tokens', 'n_types']
    if directory is None:
        if type == 'AVENI':
            # the AVENI meetings are stored outside the QMSum tree
            directory = '../Data/Aveni/all/'
        else:
            directory = '../Data/QMSum/data/' + type + '/all/'

    indices = []
    rows = []
    # sorted() makes the row order (and the printed head) deterministic
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # skip sub-directories; only analysed files get an index label so
        # that the labels always match the number of rows
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as meeting_file:
            meeting = json.load(meeting_file)
        indices.append(filename)
        rows.append(analyse_meeting(meeting))

    # concatenate once instead of growing the frame inside the loop
    if rows:
        df = pd.concat(rows, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)
    df.index = indices

    print(type)
    print(df.head())
    print()
    # get average number of speakers per meeting
    print('average number of speakers per meeting: ', df['n_speakers'].mean())

    # get average number of turns per meeting 
    print('average number of turns per meeting: ', df['n_turns'].mean())

    # get average turn length (in tokens) 
    print('average turn length in tokens: ',  df['avg_turn_len'].mean())

    # get average number of tokens
    print('average number of tokens: ', df['n_tokens'].mean())

    # get average number of types
    print('average number of different tokens (types): ', df['n_types'].mean())
    print()


def analyse_summaries(type, directory=None):
    """Print query and summary-length statistics for one meeting collection.

    For every meeting file the minimum and maximum summary length (in
    tokens) of the general and of the specific queries are printed, followed
    by the averages over the whole collection.

    Parameters
    ----------
    type : str
        Name of the meeting collection; printed as the heading and used to
        derive the default directory.
    directory : str, optional
        Directory holding one JSON file per meeting. Defaults to
        '../Data/Aveni/all/' for type 'AVENI' and to
        '../Data/QMSum/data/<type>/all/' otherwise.

    Returns
    -------
    None
        The results are printed.
    """
    if directory is None:
        if type == 'AVENI':
            # the AVENI meetings are stored outside the QMSum tree
            directory = '../Data/Aveni/all/'
        else:
            directory = '../Data/QMSum/data/' + type + '/all/'

    indices = []
    records = []
    # sorted() makes the row order (and the printed head) deterministic
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # skip sub-directories; only analysed files get an index label so
        # that the labels always match the number of rows
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as meeting_file:
            meeting = json.load(meeting_file)

        gen_sum_len = [len(nltk.word_tokenize(tokenize(query['answer'])))
                       for query in meeting['general_query_list']]
        spec_sum_len = [len(nltk.word_tokenize(tokenize(query['answer'])))
                        for query in meeting['specific_query_list']]
        # min()/max() raise on an empty list, so a meeting without general
        # (or without specific) queries prints nothing for that group
        for lengths in (gen_sum_len, spec_sum_len):
            if lengths:
                print(min(lengths))
                print(max(lengths))

        all_sum_len = gen_sum_len + spec_sum_len
        # NaN (ignored by the column mean) when the meeting has no queries
        sum_len = np.mean(all_sum_len) if all_sum_len else float('nan')

        indices.append(filename)
        records.append({'n_queries': len(all_sum_len), 'sum_len': sum_len})

    # build the frame once instead of concatenating inside the loop
    df = pd.DataFrame(records, columns=['n_queries', 'sum_len'])
    df.index = indices

    print(type)
    print(df.head())
    print()
    # get average number of queries per meeting
    print('average number of queries per meeting: ', df['n_queries'].mean())

    # get average length of summary
    print('average length of summary: ', df['sum_len'].mean())





In [None]:
# transcript statistics for each meeting collection, in the original order
for meeting_type in ('Academic', 'Committee', 'Product', 'AVENI'):
    analyse_meeting_type(meeting_type)

# query / summary statistics for the AVENI collection
analyse_summaries('AVENI')