In [15]:
import json

In [16]:
# Load the raw dialog records. The encoding is pinned to UTF-8 because the
# contexts contain non-ASCII characters and the default of open() depends on
# the platform locale.
with open("data/train_full.json", encoding="utf-8") as f:
    dataset = json.load(f)

In [17]:
# Show the number of loaded records and the first three of them.
len(dataset), dataset[:3]

(2778,
 [{'context': 'The decline of the city reached its nadir with the War of Spanish Succession (1702–1709) that marked the end of the political and legal independence of the Kingdom of Valencia. During the War of the Spanish Succession, Valencia sided with Charles of Austria. On 24 January 1706, Charles Mordaunt, 3rd Earl of Peterborough, 1st Earl of Monmouth, led a handful of English cavalrymen into the city after riding south from Barcelona, capturing the nearby fortress at Sagunt, and bluffing the Spanish Bourbon army into withdrawal.',
   'dialogId': -315877751,
   'evaluation': [{'breadth': 2,
     'engagement': 1,
     'quality': 4,
     'userId': 'Bob'},
    {'breadth': 1, 'engagement': 1, 'quality': 1, 'userId': 'Alice'}],
   'thread': [{'evaluation': 0,
     'text': 'Hi! As for me, I thought Spain got united much earlier',
     'userId': 'Bob'}],
   'users': [{'id': 'Alice', 'userType': 'Human'},
    {'id': 'Bob', 'userType': 'Human'}]},
  {'context': 'Eduardo Bonilla-Silv

In [18]:
# Keep only the dialogs that pass all three rejection rules below.
filtered = []
for record in dataset:
    quality_first = record['evaluation'][0]['quality']
    quality_second = record['evaluation'][1]['quality']
    turn_count = len(record['thread'])

    # Rule 1: fewer than four turns but rated above 3 by either evaluation.
    if turn_count < 4 and (quality_first > 3 or quality_second > 3):
        continue
    # Rule 2: both participants are humans.
    if record['users'][0]['userType'] == 'Human' and record['users'][1]['userType'] == 'Human':
        continue
    # Rule 3: the dialog has no turns at all.
    if turn_count == 0:
        continue

    filtered.append(record)

In [19]:
# Number of dialogs kept by the filtering loop.
len(filtered)

2195

In [54]:
def get_label(val):
    """Bucket a quality score into one of three class ids.

    Returns 0 for scores below 3, 1 for a score of exactly 3 and 2 for
    scores above 3. A value that satisfies none of these comparisons
    (for example NaN) falls through and yields None.
    """
    if val > 3:
        return 2
    if val == 3:
        return 1
    if val < 3:
        return 0

In [55]:
from nltk import word_tokenize

In [56]:
# Turn every filtered record into
#   * a tokenized dialog: a list of (speaker_tag, tokens) pairs wrapped in
#     <SOD> / <EOD> marker entries, and
#   * a class label derived from one of the two quality scores.
dialogs = []
labels = []
for d in filtered:
    # Decide which participant id is treated as the human and which as the bot.
    # Any combination other than (Human, Bot) is treated as (Bot, Human).
    first_user, second_user = d['users'][0], d['users'][1]
    if first_user['userType'] == 'Human' and second_user['userType'] == 'Bot':
        user, bot = first_user['id'], second_user['id']
    else:
        user, bot = second_user['id'], first_user['id']

    # Take the quality score from the evaluation entry whose userId equals the
    # bot id; if the first entry does not match, the second entry is used.
    if d['evaluation'][0]['userId'] == bot:
        label = get_label(d['evaluation'][0]['quality'])
    else:
        label = get_label(d['evaluation'][1]['quality'])

    dialog = [('<SOD>', ['<SOD>'])]
    for r in d['thread']:
        words = [w.lower() for w in word_tokenize(r['text'])]
        # Every turn not written by `user` is tagged as a bot turn.
        speaker = 'user' if r['userId'] == user else 'bot'
        dialog.append((speaker, words))
    dialog.append(('<EOD>', ['<EOD>']))

    dialogs.append(dialog)
    labels.append(label)

In [57]:
# Preview the first five tokenized dialogs and check that dialogs and labels
# have the same length.
dialogs[:5], len(dialogs), len(labels)

([[('<SOD>', ['<SOD>']),
   ('user', ['hi']),
   ('bot',
    ['who', 'uses', 'the', 'four', 'stages', 'of', 'civil', 'society', '?']),
   ('bot',
    ['ehh',
     'its',
     'incorrect',
     '.',
     'hint',
     ':',
     'first',
     '3',
     'answer',
     'letters',
     'is',
     '``',
     'fer',
     "''"]),
   ('user', ['what', 'is', 'your', 'name', '?']),
   ('bot', ['what']),
   ('bot', ['please', ',', 'speak', 'with', 'me', '.']),
   ('bot',
    ['please',
     ',',
     'speak',
     'with',
     'me',
     '.',
     'it',
     'gives',
     'me',
     'energy',
     'to',
     'live']),
   ('<EOD>', ['<EOD>'])],
  [('<SOD>', ['<SOD>']),
   ('bot',
    ['hello',
     'my',
     'friend',
     '.',
     'i',
     'hope',
     'you',
     'will',
     'enjoy',
     'this',
     'conversation',
     '.',
     'wait',
     'a',
     'second',
     'for',
     'my',
     'factoid',
     'question',
     '!']),
   ('bot', ['what', 'day', 'is', 'considered', 'a', 'crucifixio

In [58]:
# Integer code for each speaker tag found in a tokenized dialog, including the
# '<SOD>' / '<EOD>' marker entries that wrap every dialog.
user_bot_ix = {'user': 0, 'bot': 1, '<SOD>': 2, '<EOD>': 3}

In [59]:
def make_word_ix(dialogs):
    """Build a token -> integer index mapping over all dialogs.

    Args:
        dialogs: iterable of dialogs, where each dialog is an iterable of
            (speaker_tag, tokens) pairs; only the token list (element 1)
            is read.

    Returns:
        dict mapping every distinct token to a unique id in
        range(len(vocabulary)). Ids are assigned in order of first
        occurrence, so the mapping is identical on every run for the same
        input. Numbering from set iteration order would change between
        interpreter sessions because string hashing is randomized.
    """
    word_ix = {}
    for d in dialogs:
        for sent in d:
            for w in sent[1]:
                if w not in word_ix:
                    # The next free id equals the current vocabulary size.
                    word_ix[w] = len(word_ix)
    return word_ix

In [60]:
# Index every token that occurs in the tokenized dialogs and show the
# vocabulary size.
word_ix = make_word_ix(dialogs)
len(word_ix)

10035

In [61]:
# Encode each dialog as a list of sentences. A sentence becomes a pair of
# equally long id lists, [token_ids, speaker_ids], wrapped in one extra list.
# Sentences with no tokens are dropped.
dialogs_vecs = []
for dialog in dialogs:
    encoded_dialog = []
    for sent in dialog:
        token_ids = [word_ix[token] for token in sent[1]]
        if not token_ids:
            continue
        # The speaker code is repeated once per token.
        speaker_ids = [user_bot_ix[sent[0]]] * len(token_ids)
        encoded_dialog.append([[token_ids, speaker_ids]])
    dialogs_vecs.append(encoded_dialog)

In [62]:
import pickle
# Persist the encoded dialogs together with their labels as one two-element
# list.
with open('data/dilogs_and_labels.pickle', 'wb') as pickle_file:
    pickle.dump([dialogs_vecs, labels], pickle_file)

In [109]:
# NOTE(review): `rows` is not defined in any cell shown in this notebook. It
# presumably comes from a cell that was deleted or run out of order; confirm
# before relying on Restart & Run All.
rows[0]['user']

['Hi', 'What is your name?']

In [110]:
# Flatten every row into two space-separated token strings, one per speaker.
# Each token is lower-cased and prefixed with 'user_' / 'bot_' so the two
# speakers' words stay distinct.
#
# Tokens from ALL utterances of a speaker are joined. Assigning the string
# inside the utterance loop would keep only the last utterance, and would
# carry over the previous row's text when a speaker has no utterances.
rows_filtered = []
for entry in rows:
    user_tokens = [
        'user_' + w.lower()
        for utterance in entry['user']
        for w in word_tokenize(utterance)
    ]
    bot_tokens = [
        'bot_' + w.lower()
        for utterance in entry['bot']
        for w in word_tokenize(utterance)
    ]
    rows_filtered.append({'user': " ".join(user_tokens), 'bot': " ".join(bot_tokens)})

In [114]:
from collections import Counter
# Class balance of the labels.
# NOTE(review): the saved output shows string labels such as '__label__0',
# while get_label returns the ints 0/1/2, so `labels` was presumably rebuilt
# in a cell that is not shown here; confirm.
Counter(labels)

Counter({'__label__0': 1719, '__label__1': 347, '__label__2': 222})

In [116]:
# Write one training line per row: "<label> <user tokens> <bot tokens>".
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default; the tokens come from free-form chat text and may be
# non-ASCII.
with open('data.txt', 'w', encoding='utf-8') as f:
    for ind, entry in enumerate(rows_filtered):
        # labels is indexed in parallel with rows_filtered.
        print("{} {} {}".format(labels[ind], entry['user'], entry['bot']), file=f)

In [119]:
# Predicted labels: one per line, surrounding whitespace removed.
with open('fasttext/predicted.txt', 'r') as pred_file:
    pred = [row.strip() for row in pred_file]
# Gold labels: the first space-separated field of every test line.
with open('fasttext/test.txt', 'r') as test_file:
    true = [row.split(' ')[0].strip() for row in test_file]
pred[:10], true[:10]

(['__label__1',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0'],
 ['__label__0',
  '__label__0',
  '__label__1',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__0',
  '__label__1',
  '__label__0'])

In [122]:
from sklearn.metrics import classification_report

In [124]:
# Per-class precision / recall / F1 of the labels read from
# fasttext/predicted.txt against the gold labels read from fasttext/test.txt.
print(classification_report(true, pred))

             precision    recall  f1-score   support

 __label__0       0.75      0.88      0.81       173
 __label__1       0.11      0.06      0.08        33
 __label__2       0.00      0.00      0.00        22

avg / total       0.59      0.68      0.63       228

