# QRFA conformance checking
Extract the model from the MSDialog conversation transcripts annotated with intents https://ciir.cs.umass.edu/downloads/msdialog/ and compare it with the QRFA model.

In [15]:
# set up connection to the MongoDB: sudo service mongod start (27017 is the default port)
from pymongo import MongoClient

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html
    '''

    def __init__(self, db_name, col_name):
        # spin up database
        self.mongo_client = MongoClient()
        self.col_name = col_name
        self.db = self.mongo_client[db_name][self.col_name]

    def count_all_docs(self):
        count = self.db.count_documents({})
        print ("%d dialogues in %s" % (count, self.col_name)) 
        
db_name = 'cm'
col_name = 'msdialog_intent'
# connect to MongoDB
mongo = Mongo_Connector(db_name, col_name)
mongo.count_all_docs()

# define mapping from 12 msdialog_intent labels to QRFA
Q = ['OQ', 'RQ', 'CQ', 'FQ']
R = ['IR']
F = ['PF', 'NF']
A = ['PA']
# skipped FD GG JK O

2199 dialogues in msdialog_intent


In [51]:
# 1. get traces of functional labels from MongoDB
cursor = mongo.db.find()
# interate over conversations and collect traces
traces = []
for doc in cursor:
    # record trace as the sequence of labels
    trace = '<'
    for turn in doc['utterances']:
        # map labels to QRFA annotation schema
        labels = turn['tags'].split() 
        qrfa = [l[-1] for l in labels if l[-1] in 'QRFA']
        if qrfa:
            # consider only the first matched label
            label = qrfa[0]
        else:
            label = '*'
        # skip duplicate state self-loops
        if not trace or label != trace[-1]:
            trace += label
    if trace:
        traces.append(trace+'>')
print("%d traces collected"%len(traces))
print("Sample trace: %s"%traces[15:18])

2199 traces collected
Sample trace: ['<QAF>', '<QAQAF*>', '<QAQAQ>']


In [54]:
# 2. extract sequences frequent across multiple traces
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings

from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))


def allngram(seq: str, minn=1, maxn=None) -> Iterator[str]:
    lengths = range(minn, maxn) if maxn else range(minn, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def frequent_ngrams(strings, min_support=None, topn=5):
    
    # 1.split traces into ngrams
    seqs_ngrams = map(allngram, strings)

    # 2.count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))

    # 3.filter frequent substrings
    # set frequency threshold if not specified
    if not min_support:
        most_frequent_s = [s for s, count in counts.most_common(topn)]
        # maximum frequency
#         most_frequent1 = counts.most_common(1)[0]
#         min_support = most_frequent1[1]
    else:
#         print(min_support)
        most_frequent={string: count for string, count in counts.items() if count >= min_support}
    #     print(most_frequent)
        most_frequent_s = list(most_frequent.keys())

    # 4.drop substrings
    most_frequent_s.sort(key=len, reverse=True)
#     print(most_frequent_s)
    lfss = [most_frequent_s[0]]
    for s in most_frequent_s[1:]:
        overlap = False
        for lfs in lfss:
            if s in lfs:
                overlap = True
                break
        if not overlap:
            lfss.append(s)
    # result: longest frequent substrings with counts
    return lfss, [counts[s] for s in lfss]


print(frequent_ngrams(traces, topn=50))
# print(frequent_ngrams(traces, min_support=200))
# @Anton check correctness + complexity analysis? is it a map-reduce scenario?

(['<QAQA', 'QAQA>', '<QAF', 'QAF>', 'QAQ>', '<QA*', 'QAFA', 'AF*>', 'QA*>', 'QAF*', '<QR', 'FA>', '*A', 'Q*', 'QF', 'RQ'], [322, 212, 494, 277, 225, 204, 130, 127, 125, 123, 287, 133, 264, 167, 148, 135])


In [43]:
# @Anton alternative algorithm: suffix array -> LCP array
def suffix_array_oneliner(s):
    return [(suffix, rank) for suffix, rank in sorted((s[i:], i) for i in range(len(s)))]

suffix_array_oneliner('!'.join(traces))
# TODO longest common prefix (LCP) array

[('!AF!QA!QRA!QOQRQAQ!QAQ!QARQA!QAFA!QAQA!QAFAFO!QAOAFA!QAQ!QAQR!QA!QAFA!QAFO!QAF!QAQA!QROQFO!QROA!QAQ!QAO!QAFQ!QAQA!QAF!QAF!QAOA!QA!QAQ!QAFQ!QAQA!QAQA!QAQ!QAQA!QA!QAQ!Q!QA!QAQA!QAOA!QAQA!QROQOAFO!QAQAF!QAFRFOA!QAFAFORA!QA!QAR!QAQA!QAQ!QAFA!QAOQ!QA!QAF!QAO!QAFO!QAQAQ!QAQ!QRORF!QAF!QOFO!QAO!QAQA!QAFO!QAQAQA!QAQO!QAQA!QAFA!QAQ!QAFO!QRFAQA!QAF!QARO!QAF!QAQOF!QAFO!QAQAFA!QAQA!QA!QA!QAFAOA!QAFOQAQA!QRQA!QAQAF!QAQA!QAQA!QROQA!QRA!QAQ!QAO!QAFOA!QAFO!QAQAOQAFO!FAQA!QRFO!QAFAQA!QA!QAFO!QAFAF!QFA!ROAO!QAFQA!QO!QAFO!AFO!QAQF!QAQAFO!QAQA!QAFO!QAOFOF!QAQ!QRQA!QAOA!QAF!QFAQ!QAQA!QFOAOF!QA!AFO!QAF!QAFAOAFAOF!QAQR!QA!QA!QAQOQ!QA!QFA!QAQA!QOF!QRFAOF!QOFA!QA!QAQO!QRQRQ!QAQAQAFO!QA!QAF!QAFA!QAQA!QAQA!QA!QOQ!QAQAFO!QA!QAQAQA!QFQAF!QFAFAF!QAQ!QR!QFAFA!QAFO!QAQF!QAO!QAFO!QFO!QAFO!QAF!QA!QAFQF!QFOAF!QA!QAFOQA!QO!QAO!QRFO!QAF!QRAQA!QAOA!QAF!QAQOA!QO!QAFOQ!QARA!QARAF!QAFAOAF!QA!QAOA!QAQ!QAQA!QAQA!QAQA!QAQA!QRAFOFA!QOAOAO!QAFO!QRQ!AOAQAOA!QA!QOR!QF!QROA!QA!QAFAFA!QAQOA!QAQ!QA!QA!QA!QAOA!QA!QOA!QA!QAQ!QAQA!QRFOA

In [None]:
# 3. aggregate frequent sequences
# https://en.wikipedia.org/wiki/Knapsack_problem LP or greedy optimization
# Constraints: (1) length - at most n components (2) completeness - each component must begin with a start symbol and end wi
# (2) at most k loops per component


In [None]:
# 4. compare the models