## Spoken Conversational Search Dataset Analysis

In [2]:
data_path = "/home/shared/cm_data/SCSdataset.csv"

import pandas as pd
df = pd.read_csv(data_path)
print(df.head(5))

# set up connection to the MongoDB: sudo service mongod start (27017 is the default port)

from pymongo import MongoClient
import json

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html
    '''

    def __init__(self, db_name):
        # spin up database
        self.mongo_client = MongoClient()
        self.db = self.mongo_client[db_name]
    
    def count_all_docs(self, collection_name):
        count = self.db[collection_name].count_documents({})
        print ("%d dialogues in %s" % (count, collection_name)) 

 
db_name = 'cm'
col_name = 'scs'
# connect to MongoDB
mongo = Mongo_Connector(db_name)
mongo.count_all_docs(col_name)
collection = mongo.db['scs']

  Start.time Stop.time                           Query Query.complexity  \
0    00:02.7   00:11.6  Where does cinnamon come from?         remember   
1    00:40.9   00:51.9  Where does cinnamon come from?         remember   
2    00:52.6   01:01.5  Where does cinnamon come from?         remember   
3    01:02.3   01:11.8  Where does cinnamon come from?         remember   
4    01:12.5   01:28.6  Where does cinnamon come from?         remember   

         Role                                             Action  \
0      A_User                      A:Initial information request   
1  B_Receiver                        B:SERP without modification   
2      A_User  A:Access source + Information request within d...   
3  B_Receiver                    B:Within-Document search result   
4      A_User  A:Access source + Information request within d...   

                                          Transcript  Query.counter  \
0  In which countries... in which European countr...              1  

In [11]:
# show all distinct actions
print("%d utterances" % len(df.Action))
complex_action_types = set(df.Action)
print("with %d distinct complex labels" % len(complex_action_types))
# print(action_types)

# encode labels
import string
alphabet = list(string.printable)
special_symbols = ['O', '<', '>']  # keep one symbol reserved for padding
alphabet = [x for x in alphabet if (x not in special_symbols) and x.isnumeric() == False]
print("Alphabet length: %d symbols"%len(alphabet))

labels2ids = {}
ids2labels = {'O': 'x', '<': 'START', '>': 'END'}
i = 0
for a in complex_action_types:
    for a_type in a[2:].split(' + '):
        if a_type not in labels2ids:
            labels2ids[a_type] = alphabet[i]
            ids2labels[alphabet[i]] = a_type
            i += 1
print("%d label types" % len(labels2ids))
# print(labels2ids.keys())
for k in alphabet:
    if k in ids2labels:
        print(k, ids2labels[k])

1044 utterances
with 135 distinct complex labels
Alphabet length: 87 symbols
83 label types
a Within SERP search result
b Query rephrase
c SERP overview without modification
d SERP without modification
e Scanning document with modification
f Access source (implicit)
g Access source feedback-request
h Performance feedback
i Recommendations
j Automated repetitive search
k Within-Document search result
l Asking what they are looking for
m Creating bigger picture
n between-document navigation
o Enquiry for further information
p Requests more details about information request
q SERP Card
r Scanning document without modification BUT with interpretation of photos
s Query refinement offer
t Provides information about the search engine
u Feedback on what is happening
v Information request
w Repeats the query back
x Previously seen results
y Scanning document without modification
z Relevance judgement
A Query embellishment
B Results?
C Google query expansion suggestion
D Within-Document search r

In [12]:
# 1. get traces of functional labels from MongoDB
cursor = collection.find({})

# interate over conversations to collect traces and utterance labels
traces = []
utterances = []
for doc in cursor:
    _id = doc['_id']
    # record trace as the sequence of labels
    labels_trace = '<'
    roles, sentiments = '<', '<'
    print(doc['query'])
    for turn in doc['turns']:
        labels = turn['labels']
        print(labels)
        print(turn['text'])
        for label in labels:
#             print(label)
            labels_trace += labels2ids[label]
            roles += turn['role'][0]
            sentiments += turn['sentiment']
            utterances.append(turn['role'][0]+labels2ids[label]+turn['sentiment'])
    # save in mongo
    fingerprint = {'labels': ''.join(labels_trace)+'>', 'roles': roles+'>', 'sentiments': sentiments+'>'}
    doc['traces'] = fingerprint
    doc['length'] = len(fingerprint['roles'])
    collection.update_one({'_id': _id}, {"$set": doc}, upsert=True)
    traces.append(fingerprint)
#     break
print("%d traces collected"%len(traces))
print("Sample trace: %s"%traces[17])
for key, value in traces[17].items():
    print(value)
# print("Sample trace: %s"%[ids2labels[label_id] for label_id in traces[15]])
print(utterances[0])

Where does cinnamon come from?
['Initial information request']
In which countries... in which European countries do they grow cinnamon?
['SERP without modification']
There's a bunch of results have come but first one is Wikipedia uhm which just talks about cinnamon but the next one talks about the top five uhm cinnamon producing countries
['Access source', 'Information request within document']
OK so in the top five in the second search result is one of those top five countries a European country?
['Within-Document search result']
I'll just go in and have a look uhm no so we've got Indonesia, China, Vietnam, Sri Lanka and Madagascar so I can back
['Access source', 'Information request within document']
Can you just check in the Wikipedia, so open the Wikipedia page and maybe just scan uhm about where cinnamon is grown or where it has been migrated to
['Within-Document search result', 'Relevance judgement']
Basically the definition of the word comes up and it says it is derivative from 

Outsource job India
['Initial information request']
What US jobs have been outsourced to India
['Query refinement offer']
Can I put uhm United States?
['Confirms']
Yeah
['SERP without modification']
[inaudible segment] Uhm job outsourcing to india benefits outsourcing services home this is from outsource2india dot com uhm... being situated on the other side of the globe from North America India has the advantage of blahblahblah. why outsource to India a recent survey eighty percent of Europe and US outsourcing firms ranked India as their number [inaudible segment] there is a how to [inaudible segment] outsourcing jobs uhm affects the US economy
['Access source']
Uhm OK can you put bring up that
['Checks navigational command']
This one? The outsourcing affects the US economy
['Confirms']
Yeah
['Scanning document without modification']
Yeah so how outsourcing jobs affects the US economy this is about money from a US economy expert
['Information request within document']
Alright if you go

per capita alcohol consumption
['Initial information request']
Please compare first word I'd like compare average international uhm and I'd like in inverted commas the words alcohol consumption... close inverted commas and then in open inverted commas per capita that's PER capita CAPITA close inverted commas... and that's if you could return the searches for that please
['Asks to repeat']
Sorry what, can you repeat that sentence
['Query repeat']
Compare average open inverted commas oh sorry compare average international open inverted commas alcohol consumption close inverted commas open inverted commas per capita close inverted commas
['SERP without modification']
OK so the first result is Wikipedia and it's a list of countries by alcohol consumption per capita uhm and then the second source is also Wikipedia and that's uhm beer consumption per capita and then we have a pdf document from the World Health Organisation on global status report on alcohol and health uhm and we have a CNBC 

per capita alcohol consumption
['Initial information request']
OK so... uhm in general I sort of want to try and find out the average consumption of alcohol uhm by by the country state for local level, so maybe just start of with type in uhm alcohol consumption by country... tell me if anything relates to statistics uhm about alcohol consumption uhm between countries
['SERP without modification']
So I have one document about global consumption of alcohol by the World Bank
['Access source']
Yeah do you want to click into it and see what it says
['SERP without modification']
One sec [long pause] so I have list of countries by alcohol consumption per capita
['Performance feedback']
Yeah yeah that's perfect
['Scanning document without modification']
This is by Wikipedia [long pause] so this is uhm by twenty ten World Health Organisation... the data uhm the first one is Belarus the first country
['Information request within document']
How much do they consume?
['Within-Document search resul

Outsource job India
['Initial information request']
Search for... US unemployment plus outsourcing
['Query refinement offer']
So you would like to ask about the US unemployment rate or range or number or
['Intent clarification']
Uhm... well uhm not the rate or anything like that uhm it's just the concept of unemployment unemployed people
['Definition clarification']
And what is outsourcing?
['Definition explanation']
Outsourcing is where... uhm you're sending jobs offshore or... uhm you're subcontracting jobs out to like a third party instead of hiring employees... to send out
['Query refinement offer']
You like to find the relation between unemployment and outsourcing like causality effect
['Intent clarification', 'Information request within SERP']
Uhm not necessarily as strong as a causa uhm a causality but... can you search uhm is there anything in your search results that indicate uhm some content about outsourcing jobs specifically to India
['SERP without modification', 'Asking ab

Marine vegetation
['Initial information request']
Health benefits of marine vegetation
['SERP without modification']
Searching health benefits of marine vegetation, so the first article that comes up reap the health benefits of tiny marine plants by FoodTriends, second article sea vegetables the world's healthiest foods, third article [inaudible segment] the benefits of marine phytoplankton by Wellness Mama
['Asks to repeat first search result']
Uhm benefits of marine plankton uhm what was the first one again?
['SERP without modification']
Reap the health benefits of tiny marine plants by FoodTriends
['Info about document']
FoodTrients, is that a commercial site?
['Source information', 'Interpretation']
It's uhm it's a dot com so, let's have a look [long pause] I believe it would be a commercial site it seems like there's advertising on the site
['Interpretation', 'Asks to repeat Nth search result']
Yeah OK so they're trying to sell products uhm so what was the second one? Was that was

New Caledonia
['Initial information request']
Can you please search New Caledonia and also the word language?
['Asks to repeat']
Can you please uhm repeat the question?
['Query repeat', 'Offers to spell']
Uhm New Caledonia, would you like me to spell it?
['Confirms']
Uhm yeah please
['Query repeat']
OK, so the word new and then Caledonia is C A L
['SERP without modification']
OK got it [long pause] It's official language is French
['Confirms', 'Information Request']
Yeah uhm [long pause] can I ask any question? Are there any other languages coming up as being spoken there?
['Scanning document without modification']
There are the native speakers use Bislama
['Confirms']
OK
['Scanning document without modification']
It's an Anglo Malaysian language
['Enough information']
OK that's probably enough information
Water Iraq
['Initial information request']
Can you please search Iraq water resources [long pause] and then the word Turkey as well
['Query refinement offer']
Is it about Iraq's wate

cinnamon
['Initial information request']
Where does cinnamon come from?
['Asks to repeat']
Sorry say that again
['Query repeat']
Where does cinnamon come from?
['Requests spelling']
How do you spell cin
['Spells']
Uhm CINNACON
['SERP Card']
Well the first result I get most of the world's Ceylon cinnamon so that is cinnamon that comes from Sri Lanka... and then we've got another type I think Cassia that's spelled CASSIA if that's the right pronunciation Cassia cinnamon which comes from Indonesia
['Asks to repeat']
Indonesia
['SERP Card']
Yes, the rest, so the other types of cinnamon that comes from China Vietnam and Burma
Airport security
['Initial information request']
New security measures at airports
['Query refinement offer']
So new security measures at airport, which airport?
['Query embellishment']
Uhm just airports in general
['Google query expansion suggestion']
Well it's come up with new security measures at airports, after September eleven one of those?
['Info about SERP overv

Alcohol consumption
['Initial information request']
OK so I need you to look up uhm... Australian alcohol consumption per capita
['Confirms']
Yeah
['Results?']
So what's the first result that you get do you get a
['SERP without modification']
I have like a list of countries alright OK hmmm... so I have like some few articles about saying consumption, there's a list of countries, there is
['Access source']
Uhm, can you give me the list of countries?
['Query refinement offer']
So, are you looking for Australia then?
['Intent clarification']
Uhm I want to see what we are compared to other countries first
['Scanning document without modification']
OK so... so they have some a table regarding pure alcohol consumption among adults
['Confirms']
Yeah
['Scanning document without modification']
In litres per capita per year and there the data is for year two thousand ten
['Confirms']
Yeah
['Scanning document without modification']
So they have consumptions of beer, wine, spirits and others
['Inf

Alcohol consumption
['Initial information request']
Can you start with alcohol consumption uhm in different countries
['SERP without modification', 'Enquiry for further information']
I have scholar articles as well as I've got alcohol consumption per capita, would you like to see more of that?
['Confirms']
Yeah yeah that's the one
['Scanning document with modification']
Uhm I've got a list of every country and it comes with as well a breakdown of what type of alcohol it is
['Information request within document']
Yeah uhm can you read out more about that does it start with something
['Within-Document search result', 'Requests more details about information request']
Uhm yeah for every country there are a hundred is there a particular country you are interested in?
['Intent clarification']
Uhm, all the countries all the states and all groups of people
['Scanning document without modification']
Belarus has the highest consumption
['Utter']
Well so they are saying
['Scanning document witho

In [13]:
# show a sampe dialogue
sample_doc = collection.find_one()
print(sample_doc['traces'])

{'labels': '<Vd/&k/&kzvd/dM/&de&>', 'roles': '<ABAABAABBABABBAABBA>', 'sentiments': '<OOOOOOOOOOOOOOOOOOO>'}


In [15]:
# show all traces
for doc in collection.find():
    print(doc['traces']['labels'][1:-1])

Vd/&k/&kzvd/dM/&de&
VsHdMK:M%dM/:L%I!~"Mvs%d/!y|&kMv#QsLsHd/y&k&yvsvsH#Hs%d/:%y&klH&%v#%xd]Iu
Vs%d/G%y&y#%yzvw%d/eLylHe&kv#AdM/y&kJkMMkMMz%G%d
VdA#A%_%A"Ad*"h*dfG%o*dh/e u&k
V#Qd/e&kh`k&k&kmn"f%&ks%s%d
Vq
Vy&k%y#X%y%%vdMH%Hpvd/e#X&khsHy&%&k%"#%Lvd#%/yhiyhvsAy/&y
Vq#Xvhehvy,e#%vy,u%d#%gk#%vy#Xve#XP&k{\%svwBu#%Iavd/y#XyvyM&%%evC#q
Vd/dhy&k%pH%K:#%#%&%&ygyMy&k&kh`k&k&k#%&k&k&kvs#sHyh&y,eM?#w,u#wAB%BuBe/wA
V%B#Q"/G%.d%G%&%%
VqIe&kvd'&k&k&k&k
V%B"Ia%%Ia
VsH}^sHId'/d'HsHd*X/y#X#%&kd/yhK:HsH
V#Abqvd%yo#X&k
VsHdsHdG%y,es%dsM(d/&G%z#zK:&e%eV#A#Ad'#%%d/G%y&eo&yo%y&k%(s%"p&pHug%*y&%Uy%(#QwA#Qwx%d~d/:%o%y
Vd*dK:MM[dh/G&k&!%eM`EvqZqhq/G%yLG%(wdSvqo%qIa%
Vq
Vds%sHs%sHs%kd#X%"vsH#v#%bu%vs%deHk#ke%eMe%u$u%
VqvqvsAbHyveMHy&d%
Vb%s%PveMMevsHesHe
V#QR%Qd%(y%yY
VsHy&yMeh
VT#T,y
V"veve
V"Hev"#HrvyMve-yM
Ve#%He
V#Q+@q#q
VsACZ"He&ezMM%%eM
V#As%Ce&#&yoHbAbQy&eM
VsHsH%Bu%dvu%y%y%y%%v%ve;&u&eMvkM)%e
V%Bd/sHy%y%y&#&k&y%y%y%y&#%kv%H#Ik%y%%K%%e%yv#%.k%e&y#%%y&#Hy%%&k%e&k
V#Aq#"%%v#%dvd%d
V#Qd%dHy%yFGtGv%%WH"I=%sHd

In [186]:
# show smallest traces
for doc in collection.find():
    if(doc['length'] < 15):
        for turn in doc['turns']:
#             _len += len(turn['labels'])
#             if _len > index and _len <= index + len(pattern):
            print(turn['labels'])
        print('\n')

['Initial information request']
['SERP Card']


['Initial information request']
['Confirms']
['Results?']
['SERP with modification']
['Information request within SERP']
['Within SERP search result']
['Confirms']
['Confirms']
['Information request within SERP']
['Within SERP search result']


['Initial information request']
['SERP Card']


['Initial information request']
['Query refinement offer']
['Intent clarification']
['Scanning document without modification']
['Information request within document']
['Scanning document without modification']
['Interpretation']
['Scanning document with modification']
['Performance feedback']


['Initial information request']
['Image overview on SERP']
['Asks to repeat']
['Image overview on SERP']
['Is there more information']
['Scanning document without modification']


['Initial information request']
['SERP with modification']
['Information request']
['Scanning document with modification']
['Information request']
['Scanning document with modificatio

In [116]:
# what are the utterances per role?
u_by_roles = {}
for u in utterances:
    role = u[0]
    label = u[1]
    if role not in u_by_roles:
        u_by_roles[role] = defaultdict(int)
    u_by_roles[role][label] += 1

for role in u_by_roles:
    print("role %s utterances %d" % (role, sum(u_by_roles[role].values())))
    for label_id, count in positive_by_roles[role].items():
        print("label %s count %d" % (ids2labels[label_id], count))
    print('\n')

NameError: name 'defaultdict' is not defined

In [117]:
# are there labels shared between different roles? Yes more than half
print(len(set(u_by_roles['A'].values())))
print(len(set(u_by_roles['B'].values())))
print(len(set(u_by_roles['A'].values()) & set(u_by_roles['B'].values()))/len(set(u_by_roles['B'].values()))) 

KeyError: 'A'

In [118]:
# which utterances have positive sentiment?
positive_utterances = []
from collections import defaultdict
positive_by_roles = {}
for u in utterances:
    if u[2] == 'P':
        role = u[0]
        label = u[1]
        if role not in positive_by_roles:
            positive_by_roles[role] = defaultdict(int)
        positive_by_roles[role][label] += 1
        # add unique type
        if u[:2] not in positive_utterances:
            positive_utterances.append(u[:2])
print("%d positive utterance types"%len(positive_utterances))
print(positive_utterances)
print(positive_by_roles)
# who (which role) is using more positive utterances
for role in positive_by_roles:
    print('\n')
    print("role %s positive %d" % (role, sum(positive_by_roles[role].values())))
    for label_id, count in positive_by_roles[role].items():
        print("label %s count %d" % (ids2labels[label_id], count))

0 positive utterance types
[]
{}


In [119]:
# find sample utterance by label
cursor = mongo.db.find()
for doc in cursor:
    for turn in doc['turns']:
        if 'P' in turn['sentiment']:
            print (turn['text'])
#             break

TypeError: 'Collection' object is not callable. If you meant to call the 'find' method on a 'Database' object it is failing because no such method exists.

In [120]:
# build a tree
# https://stackoverflow.com/questions/34964878/python-generate-a-dictionarytree-from-a-list-of-tuples/35049729

forest = []

for u in utterances:
    print(u)
    break

AzO


In [162]:
# 2. extract sequences frequent across multiple traces
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings

from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))


def allngram(seq: str, minn=1, maxn=None) -> Iterator[str]:
    lengths = range(minn, maxn) if maxn else range(minn, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def frequent_ngrams(strings, min_support=None, topn=5, drop_substrings=False):
    
    # 1.split traces into ngrams
    seqs_ngrams = map(allngram, strings)

    # 2.count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))
    return counts.most_common(topn)

    # 3.filter frequent substrings
    # set frequency threshold if not specified
    if not min_support:
        most_frequent_s = [s for s, count in counts.most_common(topn)]
        # maximum frequency
#         most_frequent1 = counts.most_common(1)[0]
#         min_support = most_frequent1[1]
    else:
#         print(min_support)
        most_frequent={string: count for string, count in counts.items() if count >= min_support}
    #     print(most_frequent)
        most_frequent_s = list(most_frequent.keys())
    
    # 4.drop substrings
    if drop_substrings:
        most_frequent_s.sort(key=len, reverse=True)
        #     print(most_frequent_s)
        lfss = [most_frequent_s[0]]
        for s in most_frequent_s[1:]:
            overlap = False
            for lfs in lfss:
                if s in lfs:
                    overlap = True
                    break
            if not overlap:
                lfss.append(s)
    # result: longest frequent substrings with counts
    return lfss, [counts[s] for s in lfss]


# print(frequent_ngrams(traces, topn=50))
# print(len(traces))
labels_only = [trace['labels'] for trace in traces]
# print(labels_only[0])
patterns = frequent_ngrams(labels_only, min_support=2000, topn=100000000)
print(len(patterns))
for pattern in patterns:
    sequence, count = pattern
    # all starting patterns
    if sequence[0] == '<' or sequence[-1] == '>':
#     if len(sequence) > 4:
        print(pattern)
        labels = [ids2labels[label_id] for label_id in sequence]
        print(labels, count/37)

28889
('<z', 37)
['START', 'Initial information request'] 1.0
('>', 37)
['END'] 1.0
('<', 37)
['START'] 1.0
('<zj', 9)
['START', 'Initial information request', 'Asks to repeat'] 0.24324324324324326
('<zG', 8)
['START', 'Initial information request', 'Query refinement offer'] 0.21621621621621623
('<zc', 6)
['START', 'Initial information request', 'SERP without modification'] 0.16216216216216217
('<zGN', 6)
['START', 'Initial information request', 'Query refinement offer', 'Intent clarification'] 0.16216216216216217
('<zjp', 6)
['START', 'Initial information request', 'Asks to repeat', 'Query repeat'] 0.16216216216216217
('b>', 5)
['Within-Document search result', 'END'] 0.13513513513513514
('ab>', 5)
['Information request within document', 'Within-Document search result', 'END'] 0.13513513513513514
('<zS', 5)
['START', 'Initial information request', 'SERP Card'] 0.13513513513513514
('7>', 5)
['Confirms', 'END'] 0.13513513513513514
('S>', 4)
['SERP Card', 'END'] 0.10810810810810811
('d>'

['START', 'Initial information request', 'Query rephrase', 'Confirms', 'Query refinement offer', 'Confirms', 'Multi-document summary', 'Information request', 'Scanning document with modification', 'Interpretation', 'Interpretation', 'Scanning document with modification', 'Information request', 'Query refinement offer', 'Intent clarification', 'Scanning document with modification', 'Query refinement offer', 'Intent clarification'] 0.02702702702702703
('*md33dmGNdGNd>', 1)
['Multi-document summary', 'Information request', 'Scanning document with modification', 'Interpretation', 'Interpretation', 'Scanning document with modification', 'Information request', 'Query refinement offer', 'Intent clarification', 'Scanning document with modification', 'Query refinement offer', 'Intent clarification', 'Scanning document with modification', 'END'] 0.02702702702702703
('p=7pc7Zf7f2>', 1)
['Query repeat', 'Offers to spell', 'Confirms', 'Query repeat', 'SERP without modification', 'Confirms', 'Inform

In [178]:
# search: find sample utterance by label pattern
pattern = "<zjpc"
cursor = collection.find({})
for doc in cursor:
    if pattern in doc['traces']['labels']:
        index = doc['traces']['labels'].find(pattern)
        _len = 0
        for turn in doc['turns']:
#             _len += len(turn['labels'])
#             if _len > index and _len <= index + len(pattern):
            print(turn['labels'])
            print("%s: %s"%(turn['role'], turn['text']))
        break

['Initial information request']
A_User: Please compare first word I'd like compare average international uhm and I'd like in inverted commas the words alcohol consumption... close inverted commas and then in open inverted commas per capita that's PER capita CAPITA close inverted commas... and that's if you could return the searches for that please
['Asks to repeat']
B_Receiver: Sorry what, can you repeat that sentence
['Query repeat']
A_User: Compare average open inverted commas oh sorry compare average international open inverted commas alcohol consumption close inverted commas open inverted commas per capita close inverted commas
['SERP without modification']
B_Receiver: OK so the first result is Wikipedia and it's a list of countries by alcohol consumption per capita uhm and then the second source is also Wikipedia and that's uhm beer consumption per capita and then we have a pdf document from the World Health Organisation on global status report on alcohol and health uhm and we hav

In [105]:
# analyse vocabulary usage
import string
from collections import Counter
import urllib.request
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
englishStemmer = SnowballStemmer("english")

def load_word_list(lang='en'):
    url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-%s/master/stopwords-%s.txt' % (lang, lang)
    stopwords = urllib.request.urlopen(url).read().decode('UTF-8').split()
    print("Loaded %s stopwords, e.g. %s" % (lang, ", ".join(stopwords[:2])))
    return set(stopwords)


# load stopwords
en_stoplist = load_word_list('en')

cursor = collection.find({})
vocabulary = Counter()
for doc in cursor:
    _id = doc['_id']
    # record trace as the sequence of labels
    labels_trace = ''
    roles, sentiments = '', ''
#     print(doc['query'])
    for turn in doc['turns']:
        labels = turn['labels']
#         print(labels)
        # pre-processing: strip punctuation, lower-case, split on white spaces, remove stopwords, lemmatize with WordNet
        # or stem with Snowball stemmer
        words = turn['text'].translate(str.maketrans('', '', string.punctuation)).lower().split()
#         words = [englishStemmer.stem(word) for word in words if word not in en_stoplist]
        words = [wordnet_lemmatizer.lemmatize(word) for word in words if word not in en_stoplist]
        vocabulary.update(words)
#         print(words)
#     print(vocabulary.most_common(20))
#     break
print(vocabulary.most_common(20)) 

Loaded en stopwords, e.g. 'll, 'tis
[('uhm', 567), ('yeah', 226), ('cinnamon', 94), ('security', 81), ('airport', 74), ('tyre', 74), ('alcohol', 71), ('job', 65), ('pause', 60), ('consumption', 59), ('article', 55), ('search', 54), ('india', 54), ('country', 47), ('river', 46), ('dot', 44), ('biscuit', 43), ('outsourcing', 41), ('water', 41), ('type', 39)]


In [None]:
# get collection specific counts for IDF metric


In [101]:
# annotate dialogues with words
cursor = collection.find({})

# interate over conversations to collect traces and utterance labels
traces = []
for doc in cursor:
    _id = doc['_id']
    # record trace as the sequence of labels
    labels_traces = ['', '', '']
    roles, sentiments = '', ''
    vocabulary_traces = ['', '']

#     print(doc['query'])
    for j, turn in enumerate(doc['turns']):
        labels = turn['labels']
        words = turn['text'].translate(str.maketrans('', '', string.punctuation)).lower().split()
#         words = [englishStemmer.stem(word) for word in words if word not in en_stoplist]
        words = [wordnet_lemmatizer.lemmatize(word) for word in words if word not in en_stoplist]
        vocabulary.update(words)
        print(vocabulary.most_common(20))
#         print(words)
#         print(words)
#         if len(labels) > 2:
#             print(j)
        if 'cinnamon' in words:
            vocabulary_traces[0] += 'c'
        else:
            vocabulary_traces[0] += 'O'
        if 'biscuit' in words:
            vocabulary_traces[1] += 'b'
        else:
            vocabulary_traces[1] += 'O'
            
#         break
        for i, label in enumerate(labels):
#             print(label)
            if i >= len(labels_traces):
                labels_traces.append('O'*(len(labels_traces[0])-1))  # pad to previous length
            labels_traces[i] += labels2ids[label]
        
        # pad remaining traces
        if len(labels) < len(labels_traces):
            for i in range(len(labels), len(labels_traces)):
                labels_traces[i] += 'O'
        roles += turn['role'][0]
        sentiments += turn['sentiment']
    # save in mongo
    for trace in labels_traces:
#         print(trace)
        assert len(trace) == len(roles)
    fingerprint = {'labels': labels_traces, 'roles': roles, 'sentiments': sentiments, 'semantics': vocabulary_traces}
    doc['traces'] = fingerprint
#     collection.update_one({'_id': _id}, {"$set": doc}, upsert=True)
    traces.append(fingerprint)
#     break
print("%d traces collected"%len(traces))
# print("Sample trace: %s"%traces[15])
print("\n")
for key, value in traces[0].items():
    if type(value) == list:
        for label_trace in value:
            print(label_trace)
    else:
        print(value)

# for label_trace in traces[15]['labels']:
#     print(label_trace)
#     print(len(label_trace))
#     print([ids2labels[label_id] for label_id in label_trace[-1]])
# print(utterances[0])

[('uhm', 567), ('yeah', 226), ('cinnamon', 95), ('security', 81), ('airport', 74), ('tyre', 74), ('alcohol', 71), ('job', 65), ('pause', 60), ('consumption', 59), ('article', 55), ('search', 54), ('india', 54), ('country', 49), ('river', 46), ('dot', 44), ('biscuit', 43), ('outsourcing', 41), ('water', 41), ('type', 39)]
[('uhm', 568), ('yeah', 226), ('cinnamon', 95), ('security', 82), ('airport', 75), ('tyre', 74), ('alcohol', 71), ('job', 65), ('pause', 60), ('consumption', 59), ('article', 55), ('search', 54), ('india', 54), ('country', 49), ('river', 46), ('dot', 44), ('biscuit', 43), ('outsourcing', 41), ('water', 41), ('type', 40)]
[('uhm', 568), ('yeah', 226), ('cinnamon', 95), ('security', 82), ('airport', 75), ('tyre', 74), ('alcohol', 71), ('job', 66), ('pause', 60), ('consumption', 59), ('india', 55), ('article', 55), ('search', 54), ('country', 49), ('river', 46), ('dot', 44), ('biscuit', 43), ('outsourcing', 41), ('water', 41), ('type', 40)]
[('uhm', 569), ('yeah', 226), (