<a href="https://colab.research.google.com/github/theAkhileshRai/NLP/blob/master/DataSetMaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from __future__ import division, print_function
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import json
from collections import defaultdict
from nltk import wordpunct_tokenize
from scipy.stats import pearsonr, spearmanr

# New Section

# New Section

In [0]:
url = '/content/sample_data/train_full.json'

with open(url) as f:
    dataset = json.load(f)

In [0]:
human_human, human_bot = np.zeros((len(dataset), )), np.zeros((len(dataset), ))
for i, d in enumerate(dataset):
    human, bot = 0, 0
    for u in d['users']:
        if u['userType'] == 'Human':
            human += 1
        elif u['userType'] == 'Bot':
            bot += 1
        else:
            print('Unknown user type: {}'.format(u['userType']))
    if human == 2:
        human_human[i] = 1
    elif human == 1 and bot == 1:
        human_bot[i] = 1
    else:
        print('Unknown combination of users: human = {}, bot = {}'.format(human, bot))

In [0]:
silent_user = []
long_dialogue = []
empty_dialogue = []
for i, d in enumerate(dataset):
    user_utt = defaultdict(int)
    user_map = {}
    if len(d['thread']) == 0:
        empty_dialogue.append(i)
        continue
    for th in d['thread']:
        user_utt[th['userId']] += 1
    for u in d['users']:
        if u['userType'] == 'Human':
            user_map[u['id']] = 'human'
        elif u['userType'] == 'Bot':
            user_map[u['id']] = 'bot'
        else:
            print('Unknown user type: {}'.format(u['userType']))
    for u in user_map:
        if u not in user_utt:
            silent_user.append(i)
            break
    ok = False
    for u in user_map:
        if user_utt[u] > 2:
            if ok:
                long_dialogue.append(i)
            else:
                ok = True

In [60]:
print('\t\t\tTotal\tHuman-to-bot\tHuman-to-human')

# Total
hh_dialog = [d for i, d in enumerate(dataset) if human_human[i] == 1]
hb_dialog = [d for i, d in enumerate(dataset) if human_bot[i] == 1]
print('All dialogues\t\t{}\t\t{}\t\t{}'.format(len(dataset), len(hh_dialog), len(hb_dialog)))

# Empty
hh_empty = [d for i, d in enumerate(empty_dialogue) if human_human[i] == 1]
hb_emtpy = [d for i, d in enumerate(empty_dialogue) if human_bot[i] == 1]
print('Empty dialogues\t\t{}\t\t{}\t\t{}'.format(len(empty_dialogue), len(hh_empty), len(hb_emtpy)))

# One-sided
hh_silent = [d for i, d in enumerate(silent_user) if human_human[i] == 1]
hb_silent = [d for i, d in enumerate(silent_user) if human_bot[i] == 1]
print('One-sided dialogues\t{}\t\t{}\t\t{}'.format(len(silent_user), len(hh_silent), len(hb_silent)))

# Long dialogues
hh_long = [d for i, d in enumerate(long_dialogue) if human_human[i] == 1]
hb_long = [d for i, d in enumerate(long_dialogue) if human_bot[i] == 1]
print('Long dialogues\t\t{}\t\t{}\t\t{}'.format(len(long_dialogue), len(hh_long), len(hb_long)))

			Total	Human-to-bot	Human-to-human
All dialogues		2778		441		2337
Empty dialogues		119		66		53
One-sided dialogues	560		229		331
Long dialogues		1719		368		1351


In [0]:
from pandas.io.json import json_normalize
import pandas as pd

df_hb_dialog = pd.DataFrame.from_dict(json_normalize(hb_dialog), orient='columns')

In [62]:
df_hb_dialog['thread'].head()

0    [{'evaluation': 0, 'text': 'Hi', 'userId': 'Al...
1    [{'evaluation': 2, 'text': 'Hello my friend . ...
2                                                   []
3                                                   []
4    [{'evaluation': 0, 'text': 'Hi, what this text...
Name: thread, dtype: object

In [0]:
human_utt, bot_utt = [],[]
for i, d in enumerate(hb_dialog):
    users = {}
    for u in d['users']:
        if u['userType'] == 'Human':
            users[u['id']] = 'human'
        elif u['userType'] == 'Bot':
            users[u['id']] = 'bot'
    for th in d['thread']:
        if th['text'] != 0:
            try:
                if users[th['userId']] == 'human':
                    human_utt.append(th['text'])
                elif users[th['userId']] == 'bot':
                    bot_utt.append(th['text'])
            except KeyError:
                print('Unknown user {} in dialogue {}'.format(th['userId'], i))


In [64]:
len(bot_utt)

12602

In [65]:
len(human_utt)

12482

In [66]:
import pandas as pd
import numpy as np
human_utt = np.array(human_utt)
bot_utt = np.array(bot_utt)
df_train_bot = pd.DataFrame({'Bot': bot_utt})
df_train_bot.head()

Unnamed: 0,Bot
0,Who uses the four stages of civil society ?
1,Ehh its incorrect. Hint: first 3 answer letter...
2,What
3,"Please, speak with me."
4,"Please, speak with me. It gives me energy to live"


In [67]:
df_train_human = pd.DataFrame({'Human': human_utt})
df_train_human.head()

Unnamed: 0,Human
0,Hi
1,What is your name?
2,a Thursday or Wednesday
3,friday
4,13


In [75]:
df_train = pd.concat([df_train_human,df_train_bot],ignore_index=True,sort=False)
df_train.head()

Unnamed: 0,Human,Bot
0,Hi,
1,What is your name?,
2,a Thursday or Wednesday,
3,friday,
4,13,


In [0]:
df_train.to_csv('HumanBot.csv')

In [81]:
df_final = pd.read_excel("/content/sample_data/HumanBotInteraction.xlsm")
df_final.head()

Unnamed: 0,Index,Human,Bot
0,0,Hi,What?
1,1,What is your name?,I am bot
2,2,Bye,It was nice chatting with you! talk to you soo...
3,3,"No, i don't think so",Do you know Hashim in person?\n
4,4,Dynamics in the early mulsim world always stri...,before dying Prof.Muhammed(PBUH) named Muhamma...


In [82]:
df_final['Bot'].head()

0                                                What?
1                                             I am bot
2    It was nice chatting with you! talk to you soo...
3                      Do you know Hashim in person?\n
4    before dying Prof.Muhammed(PBUH) named Muhamma...
Name: Bot, dtype: object