This notebook does the preprocessing on the data that is common to all the models. This is just basic processing including removal of non-latin characters and splitting the data by label. This notebook was originally run in kaggle, where the source data is held, and produces the file 'ted_training_pairs_NLTK2.csv' found in the Drive.

In [0]:
# importing pandas module  
import pandas as pd 

# reading csv file from url  
data = pd.read_csv("/kaggle/input/ted-talks/transcripts.csv")
   
# dropping null value columns to avoid errors 
data.dropna(inplace = True) 
  
# df display 
data

import re
import nltk

In [0]:
print(len(data))

2467


In [0]:
data.transcript.str.count("(Laughter)").sum()

10251

In [0]:
import random
utterances = pd.DataFrame(columns=['talk','text','label'])

for talk in range(len(data)):
    laughter_count = 0
    # Split by laughter
    laughter_split = data['transcript'][talk].split('(Laughter)')
    talk_neutrals = []
    # Ignore talks that did not have laughter (i.e. did not split)
    if len(laughter_split) > 1:
        for i, split in enumerate(laughter_split):
            # Ignore the final split as it is not followed by laughter
            if i < len(laughter_split) -1:
                # Split the text block that was followed by laughter into sentences
                sentences = nltk.sent_tokenize(laughter_split[i])
                if len(sentences) >= 2:
                    laughter_count += 1
                    # Take the sentence immediately preceding the laughter and add it to our training data
                    # Take all other sentences from this block and add to a neutral_sentences list
                    # There is sometimes applause and laughter together (Applause)(Laughter). 
                    # For these cases we take the sentence preceding the applause and assume it also caused the laughter.
                    if sentences[-1] == "(Applause)":
                        laughter_utterance = sentences[-2]
                        neutral_utterances = sentences[:-2]
                        talk_neutrals += neutral_utterances
                        utterances = utterances.append({'talk':data['url'][talk], 'text': laughter_utterance, 'label': 'laughter'}, ignore_index=True)
                    else:
                        laughter_utterance = sentences[-1]
                        neutral_utterances = sentences[:-1]
                        talk_neutrals += neutral_utterances
                        utterances = utterances.append({'talk':data['url'][talk], 'text': laughter_utterance, 'label': 'laughter'}, ignore_index=True)
                # If the text block prior to laughter only has two sentences, we don't apply the (applause)(sentence) logic
                elif len(sentences) == 2:
                    laughter_count += 1
                    laughter_utterance = sentences[-1]
                    neutral_utterance = sentences[:-1]
                    talk_neutrals += neutral_utterances
                    utterances = utterances.append({'talk':data['url'][talk], 'text': laughter_utterance, 'label': 'laughter'}, ignore_index=True)
        # If there is at least one neutral sentence in the talk, take a random sample of the neutral sentences. N = no. of laughter sentences.
        # Random sample is without replacement if there are sufficient neutral sentences in the talk, with replacement otherwise.
        #if len(talk_neutrals) > 0:
        if laughter_count > len(talk_neutrals):
            random_neutral = random.sample(talk_neutrals, laughter_count)
        else:
            random_neutral = random.choices(talk_neutrals, k=laughter_count)
        for neutral in random_neutral:
            utterances = utterances.append({'talk':data['url'][talk], 'text': neutral, 'label': 'neutral'}, ignore_index=True)


In [0]:
# Add spaces between punctuation
for i in range(len(utterances)):
    utterances['text'][i] = re.sub('([.,!?()])', r' \1 ', utterances['text'][i])
    utterances['text'][i] = re.sub('\s{2,}', ' ', utterances['text'][i])

In [0]:
# Identify duplicate neutral utterances caused by sampling with replacement where there are insufficient neutral sentences in a talk
duplicaterows = utterances[utterances.duplicated()]
print(duplicaterows)

                                                    talk  \
38     https://www.ted.com/talks/ken_robinson_says_sc...   
111    https://www.ted.com/talks/david_pogue_says_sim...   
118    https://www.ted.com/talks/david_pogue_says_sim...   
121    https://www.ted.com/talks/david_pogue_says_sim...   
229    https://www.ted.com/talks/julia_sweeney_on_let...   
...                                                  ...   
17258  https://www.ted.com/talks/david_whyte_a_lyrica...   
17283  https://www.ted.com/talks/laolu_senbanjo_the_s...   
17421  https://www.ted.com/talks/jun_wang_how_digital...   
17455  https://www.ted.com/talks/theo_e_j_wilson_a_bl...   
17464  https://www.ted.com/talks/theo_e_j_wilson_a_bl...   

                                                    text    label  
38     If you look at the interactions of a human bra...  neutral  
111                             It's in your gift bag .   neutral  
118    You might've seen this , this is Apple's new l...  neutral  
121    

In [0]:
utterances.groupby('label').count()

Unnamed: 0_level_0,talk,text
label,Unnamed: 1_level_1,Unnamed: 2_level_1
laughter,8733,8733
neutral,8733,8733


In [0]:
summary = utterances.groupby('talk').count()
print(summary)

                                                    text  label
talk                                                           
https://www.ted.com/talks/9_11_healing_the_moth...     2      2
https://www.ted.com/talks/a_j_jacobs_year_of_li...    22     22
https://www.ted.com/talks/a_robot_that_flies_li...     4      4
https://www.ted.com/talks/a_ted_speaker_s_worst...     2      2
https://www.ted.com/talks/a_whistleblower_you_h...    22     22
...                                                  ...    ...
https://www.ted.com/talks/zeresenay_alemseged_l...     8      8
https://www.ted.com/talks/zeynep_tufekci_how_th...     2      2
https://www.ted.com/talks/zeynep_tufekci_machin...    14     14
https://www.ted.com/talks/zimchallenge\n               6      6
https://www.ted.com/talks/zubaida_bai_a_simple_...     2      2

[1802 rows x 2 columns]


In [0]:
utterances.to_csv('ted_training_pairs_NLTK2.csv',index=False)