# Process Dialogue Datasets using MongoDB

* DSTC6 https://dbd-challenge.github.io/dbdc3/index.html
* MSDialog dataset https://ciir.cs.umass.edu/downloads/msdialog/

In [69]:
# Set up the connection to MongoDB. Start the server first with `sudo service mongod start`; 27017 is the default port.
import io
import json

from pymongo import MongoClient

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html

    The instance keeps two handles:
      * mongo_client -- the MongoClient, created with pymongo's default arguments
      * db           -- the database named `db_name` on that client
    '''

    def __init__(self, db_name):
        '''
        Create a client with pymongo's defaults and select the database `db_name`.
        '''
        # spin up database
        self.mongo_client = MongoClient()
        self.db = self.mongo_client[db_name]
        # NOTE: MongoClient connects lazily, so this message only confirms that the
        # client object was created; no round-trip to the server has happened yet.
        print("Connection success.")

    def count_all_docs(self, collection_name):
        '''
        Print the number of documents currently stored in `collection_name`.
        '''
        count = self.db[collection_name].count_documents({})
        print ("%d dialogues in %s" % (count, collection_name))

    def bulk_load(self, collection_name, json_file_path, chunk_size=1000):
        '''
        Imports a big dataset from a single JSON file.

        The file must contain one JSON object that maps dialogue ids to dialogue
        documents. Each id is written into its document as `_id` and the
        documents are inserted into `collection_name` in batches. The resulting
        document count of the collection is printed at the end.

        Args:
            collection_name: name of the target collection in `self.db`.
            json_file_path: path of the UTF-8 encoded JSON file to import.
            chunk_size: number of documents per `insert_many` call. A value
                below 1 never triggers an intermediate flush, so all documents
                are then inserted in one final batch.
        '''
        # JSON is UTF-8 text; decode it explicitly instead of relying on the
        # platform's locale encoding, which breaks non-ASCII dialogues on some systems.
        with io.open(json_file_path, encoding='utf-8') as f:
            all_docs = json.load(f)

        n_docs = len(all_docs)
        print("Loading %d dialogues"%n_docs)

        collection = self.db[collection_name]

        # iterate over the docs, flushing every time a full chunk has been collected
        chunk = []
        for _id, dialogue in all_docs.items():
            dialogue['_id'] = _id
            chunk.append(dialogue)
            if len(chunk) == chunk_size:
                collection.insert_many(chunk)
                chunk = []
        # insert the last (possibly partial) chunk
        if chunk:
            collection.insert_many(chunk)

        # show the doc counter
        self.count_all_docs(collection_name)


# Open the 'cm' database; the resulting `mongo` wrapper is what the cells below query.
db_name = 'cm'
mongo = Mongo_Connector(db_name)

Connection success.


## MSDialog

In [71]:
# Decide from the collection itself whether MSDialog still has to be imported,
# so the cell can be re-run on any database without hand-editing a flag.
msdialog_path = "/home/shared/cm_data/MSDialog/MSDialog-Complete.json"
loaded = mongo.db['msdialog'].count_documents({}) > 0
if not loaded:
    # load MSDialog dataset (bulk_load prints the resulting document count)
    mongo.bulk_load('msdialog', msdialog_path)
else:
    mongo.count_all_docs('msdialog')

35536 dialogues in msdialog


In [76]:
# show a sample dialogue
sample_doc = mongo.db['msdialog'].find_one()
if sample_doc is None:
    # find_one() returns None when the collection holds no documents
    print ("No dialogues found in msdialog")
else:
    # one line per turn: who spoke, then what was said
    for turn in sample_doc['utterances']:
        print ("%s: %s"% (turn['actor_type'], turn['utterance']))

User: I have Windows 10, version 1511, OS Build 10586.1106. For the past year I have tried to upgrade from this without success. Upgrades download OK but on installing only get to 85 - 93 % and then on restart install previous version of windows (the 1511 version) I have Windows update assistant installed. Any help or advice on this would be most welcome. Amber
Agent: >> I have Windows 10, version 1511, OS Build 10586.1106  Please answer-by-number, preferably without quoting my post:  1. When (approx. date) did you purchase the computer?  2. Did the computer come with Win10 preinstalled, did you do a clean install of Win10, or did you upgrade a Win7 computer or a Win8.1 computer [<=PICK ONE!] to Win10?  3. Who manufactured the computer (e.g., Dell; HP; Acer; Lenovo; ASUS)?  4. Has a Norton application or a McAfee application EVER been installed on the computer since you bought it?  5. Have you ever run the Norton Removal Tool and/or the McAfee Consumer Products Removal Tool?  6. Is it 

## DSTC6

In [74]:
# The first 100 dialogues from /home/shared/cm_data/IRIS_json_data were inserted
# outside this notebook with the shell script utils/mongo_import.sh, so this cell
# only reports how many documents the 'dstc6' collection holds.
mongo.count_all_docs('dstc6')

100 dialogues in dstc6


In [75]:
# show a sample dialogue
sample_doc = mongo.db['dstc6'].find_one()
if sample_doc is None:
    # find_one() returns None when the collection holds no documents
    print ("No dialogues found in dstc6")
else:
    # one line per turn: speaker label, then the utterance text
    for turn in sample_doc['turns']:
        print ("%s: %s"% (turn['speaker'], turn['utterance']))

U: Hi Alex
S: Hi.
U: can you say something different?
S: How different?
U: very different please
S: What you reading?
U: harry potter
S: 'Mr. Potter...
U: yes, have you read it?
S: No. Not yet. Soon.
U: ok, good luck with it
S: You're right. It's OK. I just want it to be OK.
U: not necessarily, it can be wrong too
S: No. No way. Not a chance. I'd rather turn myself in than die.
U: you are an smart dude
S: But why fake your death? Why go through all of it?
U: because it is funny
S: He looks like a carrot, honey.
U: what!!!!
S: Zordon... he's gone.
