# Process Dialogue Datasets using MongoDB

* DSTC6 https://dbd-challenge.github.io/dbdc3/index.html
* MSDialog dataset https://ciir.cs.umass.edu/downloads/msdialog/

In [1]:
# set up connection to the MongoDB: sudo service mongod start (27017 is the default port)
from pymongo import MongoClient
import json

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html
    '''

    def __init__(self, db_name):
        # spin up database
        self.mongo_client = MongoClient()
        self.db = self.mongo_client[db_name]
        print("Connection success.")
    
    def count_all_docs(self, collection_name):
        count = self.db[collection_name].count_documents({})
        print ("%d dialogues in %s" % (count, collection_name))
    
    def bulk_load(self, collection_name, json_file_path, chunk_size=1000):
        '''
        Imports a big dataset from a single JSON file
        '''
        # load JSON
        with open(json_file_path) as f:
            all_docs = json.loads(f.read())
        
        n_docs = len(all_docs)
        print("Loading %d dialogues"%n_docs)
                
        # iterate over the docs
        chunk = []
        for _id, dialogue in all_docs.items():
            dialogue['_id'] = _id
            chunk.append(dialogue)
            if len(chunk) == chunk_size:
                # insert chunk   
                self.db[collection_name].insert_many(chunk)
                chunk = []
        # insert the last chunk
        if chunk:
            self.db[collection_name].insert_many(chunk)

        # show the doc counter
        self.count_all_docs(collection_name)


db_name = 'cm'
mongo = Mongo_Connector(db_name)

Connection success.


## MSDialog

In [2]:
loaded = True
if not loaded:
    # load MSDialog dataset
    msdialog_path = "/home/shared/cm_data/MSDialog/MSDialog-Complete.json"
    mongo.bulk_load('msdialog', msdialog_path)
else:
    mongo.count_all_docs('msdialog')

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused

In [None]:
# show a sampe dialogue
sample_doc = mongo.db['msdialog'].find_one()
for turn in sample_doc['utterances']:
    print ("%s: %s"% (turn['actor_type'], turn['utterance']))

## DSTC6

In [None]:
# first 100 dialogues from /home/shared/cm_data/IRIS_json_data were inserted using the shell script utils/mongo_import.sh
mongo.count_all_docs('dstc6')

In [None]:
# show a sampe dialogue
sample_doc = mongo.db['dstc6'].find_one()
for turn in sample_doc['turns']:
    print ("%s: %s"% (turn['speaker'], turn['utterance']))