## Dataset Creation with Circa, PersonaChat and DailyDialogs

In this notebook we create a stratified training and test set. To do so, we sample data from three datasets: the PersonaChat dataset, DailyDialogs, and Google's Circa dataset (for binary yes/no questions and implicit answers).

### Datasets

In [1]:
import json
import re
import random
import pandas as pd
from pprint import pprint

# Name of the split to build. It is passed as `part` to every loader below and
# is also used as the stem of the output file ('{PART}.json').
PART = 'valid'

In [2]:
def load_personachat(path, part='train', k=1000, max_chars=200):
    """Sample (context, question, response) triplets from a PersonaChat JSON file.

    Args:
        path: Path to the PersonaChat JSON file.
        part: Top-level key of the JSON object to read (e.g. 'train' or 'valid').
        k: Maximum number of triplets to return.
        max_chars: Triplets whose space-joined text is this long or longer are dropped.

    Returns:
        A list of at most `k` tuples `(id, [context, question, response], None)`,
        drawn at random without replacement. The last slot is the label, which
        is always None for this corpus.
    """
    triplets = []

    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)[part]

    # Loop through utterances in dialogs
    for i, item in enumerate(data):
        # The history of the last utterance entry is taken as the dialog
        history = item['utterances'][-1]['history']
        personachat_id = 'personachat-{}-{}'.format(part, str(i).zfill(6))

        # Extract triplets
        for j in range(len(history) - 2):
            triplet = history[j:j + 3]

            # Verify that it is a question and a (non-question) response
            if not triplet[1].endswith('?') or '?' in triplet[2]:
                continue

            # Skip triplets with placeholder turns (e.g. __SILENCE__) or empty turns
            if '__' not in ' '.join(triplet) and '' not in triplet:
                triplets.append((personachat_id, triplet))

    # Limit length of triplets
    triplets = [(id_, tr, None) for id_, tr in triplets if len(' '.join(tr)) < max_chars]

    # random.sample raises ValueError when k exceeds the population size, so
    # return everything that is available when fewer than k triplets remain.
    return random.sample(triplets, min(k, len(triplets)))


def load_dailydialogs(path, part='train', k=1000, max_chars=200):
    """Sample (context, question, response) triplets from a DailyDialog text file.

    Each line of the file is one dialog whose turns are separated by '__eou__'.
    All triplets are shuffled with a fixed seed and split 50/50: `part='train'`
    selects the first half, any other value selects the second half.

    Args:
        path: Path to the dialogues text file.
        part: 'train' for the first half of the split, anything else for the second.
        k: Maximum number of triplets to return.
        max_chars: Triplets whose space-joined text is this long or longer are dropped.

    Returns:
        A list of at most `k` tuples `(id, [context, question, response], None)`,
        drawn at random without replacement from the selected half.
    """
    triplets = []
    with open(path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            history = [turn.strip() for turn in line.split('__eou__')]
            daily_id = 'daily_dialogs-{}-{}'.format(part, str(i).zfill(6))

            # Normalise the typographic apostrophe, then split punctuation off from tokens
            history = [h.replace("’", "'") for h in history]
            history = [' '.join(re.findall(r"[\w']+|[.,!?;]+", h)) for h in history]

            for j in range(len(history) - 2):
                triplet = history[j:j + 3]

                # Verify that it is a question and a (non-question) response
                if not triplet[1].endswith('?') or '?' in triplet[2]:
                    continue

                # Skip triplets with an empty turn (e.g. the text after the last '__eou__')
                if '' not in triplet:
                    triplets.append((daily_id, triplet))

    # Limit length of triplets
    triplets = [(id_, tr, None) for id_, tr in triplets if len(' '.join(tr)) < max_chars]

    # Shuffle deterministically and create 50/50 train-test sets
    random.Random(1).shuffle(triplets)
    n = len(triplets) // 2
    dataset = triplets[:n] if part == 'train' else triplets[n:]

    # random.sample raises ValueError when k exceeds the population size, so
    # return the whole half when it holds fewer than k triplets.
    return random.sample(dataset, min(k, len(dataset)))


def load_circa(path, part='train', k=1000, max_chars=200):
    """Sample (question, response) pairs with yes/no labels from the Circa TSV.

    Only rows whose lowercased 'goldstandard1' value is exactly 'yes' or 'no'
    are used. Each such row yields three variants of the response: the
    original answer, the bare label, and '<label> , <answer>'. All pairs are
    shuffled with a fixed seed and split 50/50: `part='train'` selects the
    first half, any other value selects the second half.

    NOTE(review): the three variants of one question are shuffled
    independently, so they can land in different halves of the split.

    Args:
        path: Path to the tab-separated Circa file.
        part: 'train' for the first half of the split, anything else for the second.
        k: Maximum number of pairs to return.
        max_chars: Pairs whose space-joined text is this long or longer are dropped.

    Returns:
        A list of at most `k` tuples `(id, [question, response], label)`,
        drawn at random without replacement from the selected half.
    """
    triplets = []
    df = pd.read_csv(path, sep='\t')
    for _, row in df.iterrows():
        # Assign ID to question and answer
        circa_id = 'circa-{}-{}'.format(part, str(row['id']).zfill(6))
        question = row['question-X'].lower()
        answer = row['answer-Y'].lower()
        label = str(row['goldstandard1']).lower()

        # Split punct off from tokens
        question = ' '.join(re.findall(r"[\w']+|[.,!?;]+", question))
        answer = ' '.join(re.findall(r"[\w']+|[.,!?;]+", answer))

        # Emit the implicit answer, the bare label and the elaborated label
        if label in ['yes', 'no']:
            triplets.append((circa_id, [question, answer], label))
            triplets.append((circa_id, [question, label], label))
            triplets.append((circa_id, [question, '{} , {}'.format(label, answer)], label))

    # Limit length of triplets
    triplets = [(id_, tr, label) for id_, tr, label in triplets if len(' '.join(tr)) < max_chars]

    # Shuffle deterministically and create 50/50 train-test sets
    random.Random(1).shuffle(triplets)
    n = len(triplets) // 2
    dataset = triplets[:n] if part == 'train' else triplets[n:]

    # random.sample raises ValueError when k exceeds the population size, so
    # return the whole half when it holds fewer than k pairs.
    return random.sample(dataset, min(k, len(dataset)))
            

In [3]:
# Load the PART split of each corpus, requesting the loaders' default k=1000 samples.
# NOTE(review): every loader finishes with random.sample on the global RNG, which is
# not seeded in this notebook, so the drawn samples differ between kernel runs.
personachat = load_personachat('originals/personachat_self_original.json', part=PART)
circa = load_circa('originals/circa-data.tsv', part=PART)
dailydialogs = load_dailydialogs('originals/dialogues_text.txt', part=PART)

In [4]:
# Report how many samples were drawn from each corpus (PersonaChat, Circa, DailyDialogs).
for sampled in (personachat, circa, dailydialogs):
    pprint(len(sampled))

1000
1000
1000


In [5]:
# Show one random example per corpus, separated by blank lines.
for position, sampled in enumerate((personachat, circa, dailydialogs)):
    if position > 0:
        print()
    pprint(random.choice(sampled))

('personachat-valid-000446',
 ['oh i remember horse back riding when i was a kid . now my kids love it .',
  'do you have any horses ? are they in high school ?',
  "i've one we keep at a stable in kentucky 10 , 9 and 7"],
 None)

('circa-valid-007564',
 ['did you read any of the best sellers ?', "i don't have time to read ."],
 'no')

('daily_dialogs-valid-012426',
 ['I stabbed his belly three times .',
  'Did you know hat your actions might cause serous injuries or death ?',
  "I knew , but I couldn't myself ."],
 None)


### Augmenting Circa with additional context

In [6]:
from collections import defaultdict
from tqdm import tqdm
import spacy

# spaCy pipeline used below to parse questions and responses (POS tags and
# dependency labels are read from the resulting tokens).
nlp = spacy.load('en_core_web_sm')

In [7]:
def load_personachat_as_context(path, part='train'):
    """Collect non-question PersonaChat utterances and a token -> positions index.

    Args:
        path: Path to the PersonaChat JSON file.
        part: Top-level key of the JSON object to read.

    Returns:
        A pair `(utterances, inv_index)`. `utterances` lists every history turn
        that contains no '?'. `inv_index` is a `defaultdict(set)` mapping each
        lowercased token to the positions in `utterances` where it occurs.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        dialogs = json.load(handle)[part]

    contexts = []
    token_positions = defaultdict(set)

    for dialog in dialogs:
        for turn in dialog['utterances'][-1]['history']:
            # Questions are not usable as context turns
            if '?' in turn:
                continue

            # The position of the turn is its index in the output list
            position = len(contexts)
            contexts.append(turn)

            for token in re.findall(r"[\w']+|[.,!?;]", turn.lower()):
                token_positions[token].add(position)

    return contexts, token_positions

In [8]:
# Build the pool of candidate context utterances and its token index for the PART split.
personachat_contexts, personachat_index = load_personachat_as_context('originals/personachat_self_original.json', part=PART)
# NOTE(review): the label says 'turn with "whittling"', but this prints the
# utterance at position 1 regardless of its content.
print('turn with "whittling":', personachat_contexts[1])
# The index is a defaultdict(set), so looking up a token that never occurred
# yields an empty set (and inserts that key).
print('turns with "whittling":', personachat_index['whittling'])

turn with "whittling": i just got done watching a horror movie
turns with "whittling": set()


In [9]:
# Number of non-question PersonaChat utterances available as candidate contexts.
print('Possible contexts:', len(personachat_contexts))

Possible contexts: 10127


In [10]:
# Dependency relations and POS tags that mark a token as a topic word
TOPIC_DEPS = ['nsubj', 'nsubjpass', 'dobj', 'pobj']
TOPIC_POS = ['NOUN', 'PROPN']

# Prepend a PersonaChat utterance as context to each Circa (question, answer)
# pair, chosen by overlap with the topic words of the question.
circa_augmented = []
for id_, (question, answer), label in tqdm(circa):
    # Identify spans of tokens in the question that determine its topic:
    # the full subtree of each topic head, plus every noun/proper noun on its own.
    doc = nlp(question)
    topics = [t for t in doc if t.dep_ in TOPIC_DEPS and t.pos_ in TOPIC_POS]
    topic_tokens = [' '.join([t.lower_ for t in span.subtree]) for span in topics]
    topic_tokens += [t.lower_ for t in doc if t.pos_ in TOPIC_POS]

    # Find utterances that contain at least one topic as an indexed token.
    # .get() avoids inserting missing keys into the defaultdict index.
    matches = set()
    for topic in topic_tokens:
        for i in personachat_index.get(topic, ()):
            matches.add(personachat_contexts[i])

    # Score matches by how many topic strings occur (as substrings) in the utterance
    if len(matches) > 2:
        # Iteration order of a set of strings depends on per-process hash
        # randomisation, and max() keeps the first of several equal scores.
        # Sorting first makes the tie-break reproducible across kernel runs.
        best_match = max(sorted(matches), key=lambda m: sum(t in m for t in topic_tokens))
        circa_augmented.append((id_, (best_match, question, answer), label))

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:06<00:00, 154.83it/s]


In [11]:
# Spot-check one randomly chosen augmented Circa example (context, question, answer).
pprint(random.choice(circa_augmented))

('circa-valid-019475',
 ('cool . . . when i have a break from my horse farm i like to watch movies',
  'do you like to watch movies ?',
  'yes'),
 'yes')


# Creating a stratified dataset

## Ellipsis

In [12]:
def ellipsis_subject(rdoc):
    """Detect a response that has a verbal root but no subject.

    Returns False as soon as any token carries a dependency label containing
    'subj' (nsubj, nsubjpass, ...). Otherwise returns the list of ROOT tokens
    tagged VERB or AUX, which is truthy only when such a root exists.
    """
    verbal_roots = []
    for token in rdoc:
        if 'subj' in token.dep_:
            return False
        if token.dep_ == 'ROOT' and token.pos_ in ['VERB', 'AUX']:
            verbal_roots.append(token)
    return verbal_roots

def ellipsis_predicate(rdoc):
    """Return True when no ROOT token of the response is tagged VERB or AUX."""
    return not any(
        token.dep_ == 'ROOT' and token.pos_ in ['VERB', 'AUX']
        for token in rdoc
    )

## Response types

In [13]:
# Response-initial keywords that signal an explicit confirmation / denial.
# NOTE(review): 'maybe' is in both lists, so a bare "maybe" counts as both a
# confirmation and a denial. Confirm this is intended.
CONFIRM = ['yes', 'sure', 'maybe', 'yeah', 'yea', 'yup']
DENY = ['no', 'maybe', 'nah', 'nope', 'neh']


def _is_bare_keyword(rdoc, keywords):
    """True when the response is one keyword, optionally followed by one punctuation token."""
    if len(rdoc) == 1:
        return rdoc[0].lower_ in keywords
    if len(rdoc) == 2:
        # 'PUNCT' is spaCy's coarse POS tag; the dependency label for
        # punctuation is lowercase 'punct', so comparing dep_ against 'PUNCT'
        # could never match. Check the POS tag instead.
        return rdoc[0].lower_ in keywords and rdoc[1].pos_ == 'PUNCT'
    return False


def _starts_with_keyword(rdoc, keywords):
    """True when the response is non-empty and its first token is a keyword."""
    return len(rdoc) > 0 and rdoc[0].lower_ in keywords


def is_confirm(rdoc):
    """True when the response is only a confirmation keyword (e.g. "yes" or "yes .")."""
    return _is_bare_keyword(rdoc, CONFIRM)


def is_deny(rdoc):
    """True when the response is only a denial keyword (e.g. "no" or "no .")."""
    return _is_bare_keyword(rdoc, DENY)


def is_confirm_with_elaboration(rdoc):
    """True when the response starts with a confirmation keyword and says more."""
    return not is_confirm(rdoc) and _starts_with_keyword(rdoc, CONFIRM)


def is_deny_elaboration(rdoc):
    """True when the response starts with a denial keyword and says more."""
    # Must exclude bare denials (is_deny), mirroring is_confirm_with_elaboration.
    return not is_deny(rdoc) and _starts_with_keyword(rdoc, DENY)


def is_implicit_confirm(label, rdoc):
    """True when the label is 'yes' but the response has no leading confirmation keyword."""
    return label == 'yes' and not is_confirm(rdoc) and not is_confirm_with_elaboration(rdoc)


def is_implicit_deny(label, rdoc):
    """True when the label is 'no' but the response has no leading denial keyword."""
    return label == 'no' and not is_deny(rdoc) and not is_deny_elaboration(rdoc)

## Question type

In [14]:
from dialog_tag import DialogTag

# Dialog-act tagger; its predict_tag() output is compared against the tag
# names in the question-type predicates below.
model = DialogTag('distilbert-base-uncased')

distilbert-base-uncased found in cache. Loading model...


Some layers from the model checkpoint at C:\Users\Uw naam/.dialog-tag/models\distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['dropout_59']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at C:\Users\Uw naam/.dialog-tag/models\distilbert-base-uncased and are newly initialized: ['dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and

In [15]:
def _tag_equals(tag, expected):
    """Compare a predicted dialog-act tag against one expected tag name."""
    return tag == expected


def is_yesno_question(tag):
    """True when the tag is 'Yes-No-Question'."""
    return _tag_equals(tag, 'Yes-No-Question')


def is_wh_question(tag):
    """True when the tag is 'Wh-Question'."""
    return _tag_equals(tag, 'Wh-Question')


def is_declarative_yesno_question(tag):
    """True when the tag is 'Declarative Yes-No-Question'."""
    return _tag_equals(tag, 'Declarative Yes-No-Question')


def is_open_question(tag):
    """True when the tag is 'Open-Question'."""
    return _tag_equals(tag, 'Open-Question')


def is_rhetorical_question(tag):
    """True when the tag is 'Rhetorical-Questions'."""
    return _tag_equals(tag, 'Rhetorical-Questions')


def is_declarative_wh_question(tag):
    """True when the tag is 'Declarative Wh-Question'."""
    return _tag_equals(tag, 'Declarative Wh-Question')

## Coreferring expressions

In [16]:
def has_reflexive_pronoun(rdoc):
    """Return the tokens whose lowercase form is one of the listed reflexive pronouns."""
    reflexives = ['myself', 'yourself', 'ourselves', 'himself', 'herself', 'themselves']
    found = []
    for token in rdoc:
        if token.lower_ in reflexives:
            found.append(token)
    return found

def has_possessive_pronoun(rdoc):
    """Return the tokens tagged 'PRP$' whose lowercase form is a listed possessive."""
    possessives = ['my', 'mine', 'your', 'our', 'his', 'her', 'its']
    found = []
    for token in rdoc:
        if token.tag_ != 'PRP$':
            continue
        if token.lower_ in possessives:
            found.append(token)
    return found

def has_demonstrative_dets(rdoc):
    """Return 'this'/'these'/'that' tokens whose next token is not tagged NOUN.

    The last token of the response is never examined, because it has no successor.
    """
    demonstratives = ['this', 'these', 'that']
    found = []
    for position, token in enumerate(rdoc[:-1]):
        if token.lower_ not in demonstratives:
            continue
        if rdoc[position + 1].pos_ == 'NOUN':
            continue
        found.append(token)
    return found

def has_singular_personal_pronoun(rdoc):
    """Return the tokens tagged 'PRP' whose lowercase form is a listed singular pronoun."""
    singulars = ['i', 'me', 'you', 'it', 'he', 'she']
    found = []
    for token in rdoc:
        if token.tag_ != 'PRP':
            continue
        if token.lower_ in singulars:
            found.append(token)
    return found

def has_plural_personal_pronoun(rdoc):
    """Return the tokens tagged 'PRP' whose lowercase form is 'we', 'they' or 'them'."""
    plurals = ['we', 'they', 'them']
    found = []
    for token in rdoc:
        if token.tag_ != 'PRP':
            continue
        if token.lower_ in plurals:
            found.append(token)
    return found

## Putting it all together

In [17]:
triplets = []

used_data = personachat + dailydialogs + circa_augmented

# (category name, predicate) pairs. They are evaluated in the listed order so
# the order of names inside each item's 'categories' list stays stable.

# Ellipsis: predicates over the parsed response
ellipsis_checks = [
    ('ellipsis_subj', ellipsis_subject),
    ('ellipsis_pred', ellipsis_predicate),
]

# Implicit responses: predicates over (label, parsed response)
implicit_checks = [
    ('act_impl_confirm', is_implicit_confirm),
    ('act_impl_deny', is_implicit_deny),
]

# Coreferring expressions and response dialog acts: predicates over the parsed response
response_checks = [
    ('reflexive_prons', has_reflexive_pronoun),
    ('possessive_prons', has_possessive_pronoun),
    ('demonstrative_dets', has_demonstrative_dets),
    ('singular_personal_prons', has_singular_personal_pronoun),
    ('plural_personal_prons', has_plural_personal_pronoun),
    ('act_confirm', is_confirm),
    ('act_deny', is_deny),
    ('act_confirm_elaborate', is_confirm_with_elaboration),
    ('act_deny_elaborate', is_deny_elaboration),
]

# Question dialog acts: predicates over the predicted tag of the question
question_checks = [
    ('yes_no_question', is_yesno_question),
    ('wh_question', is_wh_question),
    ('declarative_yes_no_question', is_declarative_yesno_question),
    ('open_question', is_open_question),
    ('rhetorical_question', is_rhetorical_question),
    ('declarative_wh_question', is_declarative_wh_question),
]

for id_, triplet, label in tqdm(used_data):
    # Only the response needs a spaCy parse; the question is characterised by
    # its dialog-act tag alone, so it is not parsed.
    rdoc = nlp(triplet[2])
    qtag = model.predict_tag(triplet[1])

    categories = [name for name, check in ellipsis_checks if check(rdoc)]
    categories += [name for name, check in implicit_checks if check(label, rdoc)]
    categories += [name for name, check in response_checks if check(rdoc)]
    categories += [name for name, check in question_checks if check(qtag)]

    # Sentence length (number of tokens in the response)
    categories.append('length_{}'.format(len(rdoc)))

    triplets.append({'id': id_, 'triplet': '<eos>'.join(triplet), 'categories': categories})


# Persist the annotated split as '<PART>.json'
with open('{}.json'.format(PART), 'w', encoding='utf-8') as file:
    json.dump(triplets, file)

  0%|                                                                                         | 0/2671 [00:00<?, ?it/s]

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


  0%|                                                                               | 1/2671 [00:05<4:13:18,  5.69s/it]



  0%|                                                                               | 2/2671 [00:07<2:28:25,  3.34s/it]



100%|██████████████████████████████████████████████████████████████████████████████| 2671/2671 [04:06<00:00, 10.86it/s]
