# QRFA conformance checking
Extract a process model from the MSDialog conversation transcripts annotated with user intents (https://ciir.cs.umass.edu/downloads/msdialog/) and compare it with the QRFA model.

In [None]:
# set up connection to the MongoDB: sudo service mongod start (27017 is the default port)
from pymongo import MongoClient

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html

    Attributes:
        mongo_client: client created by calling MongoClient() without arguments.
        col_name: name of the wrapped collection.
        db: handle obtained as mongo_client[db_name][col_name], i.e. the
            collection itself rather than the database, despite the name.
    '''

    def __init__(self, db_name, col_name):
        # spin up database
        self.mongo_client = MongoClient()
        self.col_name = col_name
        self.db = self.mongo_client[db_name][self.col_name]

    def count_all_docs(self):
        '''
        Print the number of documents in the collection and return it.

        Returns:
            The value reported by count_documents({}) for the collection.
        '''
        count = self.db.count_documents({})
        print("%d dialogues in %s" % (count, self.col_name))
        # returned as well as printed so callers can reuse the number
        return count
        
# database and collection that hold the intent-annotated dialogues
db_name, col_name = 'cm', 'msdialog_intent'

# connect to MongoDB and report the size of the collection
mongo = Mongo_Connector(db_name=db_name, col_name=col_name)
mongo.count_all_docs()

# mapping from the 12 msdialog_intent labels to the four QRFA classes;
# the labels FD, GG, JK and O are skipped
Q = 'OQ RQ CQ FQ'.split()
R = 'IR'.split()
F = 'PF NF'.split()
A = 'PA'.split()

In [None]:
# 1. get traces of functional labels from MongoDB
# Every conversation is encoded as one string: '<', then one symbol per turn
# (Q, R, F or A, or '*' when the turn carries no QRFA label), then '>'.
cursor = mongo.db.find()
# iterate over conversations and collect traces
traces = []
for doc in cursor:
    # record trace as the sequence of labels
    symbols = []
    for turn in doc['utterances']:
        # map labels to QRFA annotation schema: the class is the last
        # character of a label (e.g. OQ -> Q, IR -> R, PF -> F, PA -> A)
        qrfa = [tag[-1] for tag in turn['tags'].split() if tag[-1] in 'QRFA']
        # consider only the first matched label
        label = qrfa[0] if qrfa else '*'
        # skip duplicate state self-loops
        if not symbols or label != symbols[-1]:
            symbols.append(label)
    # skip conversations without turns; previously the emptiness check ran
    # on a string that already held '<', so it could never be false
    if symbols:
        traces.append('<' + ''.join(symbols) + '>')
print("%d traces collected" % len(traces))
print("Sample trace: %s" % traces[15:18])

In [None]:
# 2. extract sequences frequent across multiple traces
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings

from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    """Lazily produce every contiguous slice of *seq* of length *n*, left to right."""
    # one window per valid start position; the range is empty when n > len(seq)
    starts = range(len(seq) - n + 1)
    return (seq[start:start + n] for start in starts)


def allngram(seq: str, minn=1, maxn=None) -> Iterator[str]:
    """
    Collect the distinct n-grams of *seq* for every length in [minn, upper).

    *upper* is *maxn* when it is truthy and len(seq) otherwise; the bound is
    exclusive, so with the defaults the full sequence itself is not included.
    The result is a set (each n-gram appears once), even though the
    annotation says Iterator.
    """
    upper = maxn if maxn else len(seq)
    distinct = set()
    for size in range(minn, upper):
        distinct.update(ngram(seq, size))
    return distinct


def frequent_ngrams(strings, min_support=None, topn=5):
    """
    Find the n-grams shared by the largest number of input strings.

    Each string contributes every distinct n-gram returned by ``allngram``
    exactly once, so the frequency of an n-gram is the number of strings
    that produced it, not its total number of occurrences.

    Args:
        strings: iterable of strings (traces) to mine.
        min_support: when given, return every n-gram whose frequency is at
            least this value; ``topn`` is ignored in that case.
        topn: number of most frequent n-grams to return when
            ``min_support`` is None.

    Returns:
        A pair ``(patterns, frequencies)`` of parallel lists, ordered by
        decreasing frequency in both modes.
    """
    # 1. split traces into ngrams
    seqs_ngrams = map(allngram, strings)
    # 2. count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))

    # 3. filter frequent substrings
    if min_support is None:
        ranked = counts.most_common(topn)
    else:
        # rank first, then filter, so the result is ordered like the topn mode
        ranked = [(s, count) for s, count in counts.most_common() if count >= min_support]

    return [s for s, _ in ranked], [count for _, count in ranked]
    # TODO step 4 (not implemented): drop patterns that are substrings of a
    # longer frequent pattern, adding their counts to the longer pattern.

# mine the 100 most frequent n-grams over all collected traces and show them
patterns, counts = frequent_ngrams(strings=traces, topn=100)
print((patterns, counts))
# print(frequent_ngrams(traces, min_support=200))
# @Anton check correctness + complexity analysis? is it a map-reduce scenario?

In [26]:
# group repeating chars into loops with () symbols
from collections import defaultdict

def frequent_loops(traces, topn=500):
    '''
    collect frequent patterns with loops

    Takes the `topn` most frequent n-grams of `traces` (via frequent_ngrams)
    and rewrites each of them character by character: a character that is
    already present in the rewritten pattern is not appended again, and when
    the span from its earlier position to the end of the rewritten pattern is
    longer than one symbol, that span is marked as a loop with () symbols.
    Frequencies of n-grams that rewrite to the same pattern are summed.

    Side effect: prints the mapping from loop strings to their numeric ids.

    Returns:
        (patterns, counts): parallel lists ordered by decreasing summed
        frequency; `patterns` holds the numeric form of each rewritten
        pattern, in which a loop is written as its id instead of "(...)".
    '''
    # get frequent ngram patterns
    patterns, counts = frequent_ngrams(traces, topn=topn)
    
    # rewritten pattern (with parentheses) -> summed frequency
    loop_patterns = Counter()
    # rewritten pattern (with parentheses) -> its numeric form
    loop_patterns_num = {}
    # loop string -> frequency; NOTE(review): only written, never used in the result
    loops = defaultdict(int)
    # loop string -> numeric id, assigned in order of first appearance starting at 1
    loop_ids = {}
    n_loops = 0
    for i, pattern in enumerate(patterns):
        loop_pattern, loop_pattern_num = "", ""
        for c in pattern:
            if c not in loop_pattern:
                # unseen symbol: extend both forms of the pattern
                loop_pattern += c
                loop_pattern_num += c
            else:
                # repeated symbol: the candidate loop runs from its first
                # position to the end, ignoring () markers already inserted
                loop_start_idx = loop_pattern.index(c)
                loop = ''.join([c for c in loop_pattern[loop_start_idx:] if c not in '()'])
                # a span of a single symbol (self-loop) is ignored, so the
                # repeated symbol is simply dropped
                if len(loop) > 1:
                    loop_set = loop
    #                 loop_set = "".join(set(loop))
                    if loop_set not in loops:
                        # first time this loop is seen: give it the next id
                        n_loops += 1
                        # NOTE(review): the frequency is added only on first
                        # sight; later occurrences of the same loop are not
                        # accumulated - confirm whether that is intended
                        loops[loop_set] += counts[i]
                        loop_ids[loop_set] = n_loops
                    # prefix before the loop (earlier () markers stripped) + "(loop)"
                    loop_pattern = ''.join([c for c in loop_pattern[:loop_start_idx] if c not in '()']) + "(%s)" % loop
                    # NOTE(review): this slices the loop_pattern reassigned on the
                    # previous line, not loop_pattern_num - order matters here;
                    # confirm the resulting prefix is the intended one
                    loop_pattern_num = ''.join([c for c in loop_pattern[:loop_start_idx] if c not in '()']) + str(loop_ids[loop_set])
    #     if not numeric and loop_pattern != pattern:
    #         print(pattern)
    #         print (loop_pattern)
    #         print('\n')
    #     if loop_pattern_num:
        # n-grams that rewrite to the same pattern share one entry
        loop_patterns_num[loop_pattern] = loop_pattern_num
        loop_patterns[loop_pattern] += counts[i]
#     print((list(loop_patterns.keys()), list(loop_patterns.values())))
#     print(loops)
    print(loop_ids)
    # report patterns in their numeric form, most frequent first
    patterns_w_loop_ids = [loop_patterns_num[p] if p in loop_patterns_num else p for p, c in loop_patterns.most_common()]
    # rebinds `counts` to the summed frequencies, in the same order
    counts = [c for p, c in loop_patterns.most_common()]
    return patterns_w_loop_ids, counts

method_mining= ['ABBBCCDEFGHIJKLMMM', 'ANNOPQQRSHKTMTU', 'CDBCEVWXNQPIYIBBYZJKKXHKKS[KL[I[V[LT\\]]^X_`U', 'PNNNGabSTcT', 'KVEdFBI[eFPSSKHKYTEKX_[`', 'fXKJDWVVgBBPNNNSSSSKJKEWLLLMT]^MUU', 'ABShYLPiQQKKVMMgU', 'QDXQBFPNGVSSYYKKTKT]MU', 'AAjAOIPS[KLLfL', 'kSKMU', 'FPSSKhK', 'BJKTKM', 'AZDlKEOQPGYKMTU', 'CVDBPSHGISRJKKTXljMM[]`U', 'jdAmdZgmVZAADDOQYYFKKGCBjVKKFKccK^lEETTT^LXAnK_]`o_fXXWK[LFWM]M[MUZU', 'ODSPGKY', 'VdSICVVdDPSGK[EYKKET[ETM', 'AKOoBXfjVPHSAJKKTX^[LM]IUUp', 'CVFPSB[[[', 'CV`fj`VFSeeYSPBY`KW[[`MW', 'ADXBkAJEKBVVSKE[Wo[mDMSc', 'OCJ`A`fkVXfE[jVI`_J[n[[MqU', 'CSNHBrQHVFPSKYJKKj]MTU', 'AfKJCXVkQBPSSSHKKTLlMlM]TU', 'CYeeYSHBfKI][[[', 'fd`ZQAHCVBIFPSNH^KHTMU', 'fd`ZQAHCVBIFPFKccTMU', 'VVd[DZIABBTTKcLLEI[TM[MT', 'VAaONCCJsPSmAbSK[LMMcTUUrq', 'HHNNJKITKT[M', 'mXBXOGCFSSJmAEbKKIccMf_]TTU', 'ADA[`WX``NQQYBtuQFaPSK^^\\\\L[KLTLLMM]]M]q[', 'fSmABVd`kDPNIGNaiClFSYYKKBaIL[ML`EEll[M[i]TKMU', 'AACKVFHDBQPSKSKKATK^m[MMU', 'fOdADCSQQCEdVkFBNPSH[JYHKTTm][TToccE[]M[MTTMUUU', 'HOVHKHPIK[TT^_', 'meFSFQSS[KKTTTTB[``fX]]\\MUTT[^KUU', 'dfACBVPGHITKAjM]lM_TKU', 'FIFFPeSSKSKKTKMU', 'ABBBOFCHJQd`[Mc\\TU', 'mQAOooHFPGFC[dEKKVSYKYL[DrSIWT]XU', 'VNNQOCFPHYBHHK[[TKU', 'AlZmDaCCFSJKccTTYTU', 'CNQFVVPGKSKT[MM', 'fVkWXSS[KAmJBKTc[jK', 'KCdDPSKMU', 'DfYDSGSbeKKMU', 'mVVVdDCXBZJOPGKTLMM[TUKUUp', 'FPcebbSccTM]', 'NZSeFNPCCKKM^', 'ABDCGSJlKL', 'eHHvPAbebbHHKKTKccTT]MT', 'KQDOaSCNPHSHYKU', 'NZZFPSYAKAAjKXLKj[MM]TU', 'AAZAADBBosjYaFFPPGSYIKKTLT[MjXMM]MTrABB[', 'kPeKkKcccK', 'PDKKKTTTcc', 'ePFIIIGKIGISSGHKKTM', 'AfGYHg[jJVff`XXdfTqU', 'IFFePBGrSHHHKKlKK]YU', 'fsVXdWaeeSHK[[]XMTU', 'eOAaPNASSKKKcccKvvvLLM', 'ZHYCADJjeSPHbSHSTTcKTK[KMT', 'SHH]KSKcTTTwM', 'dVDXFPGDVSYYKKK]M', 'VAAPHKK', 'VFHKdPSSSHK[XTMr', 'CDZVVeeeeekDWSSjH`aPJLSHKK^sKLLX]^IT^MUQqT^X', 'bDjDQGIeSHSSSKKTTcUE', 'aaCPSGNNGSHKL]K', 'ZCPGKTKU', 'VQIPGSSKLLLTMIU', 'HHo`CKljMcZYp', 'ABYABsjYaFPPGSbYBIKKTLT[MjI]XMIKrrAI[[', 'kkOaSKToMXq', 'VjfAQqaCSKCIGSHHKTj[[TTp', 'fDkVZAOCIYDPbKYcTMMKT', 'HVFFFNeSPHSHSSKKTTK[[pKTK^_X_', 'CePIIGYYHHSKKM^U', 
'VV]kXCEVdaFDFFPGSKSKhI^KKU]UM^^SYd', 'VDPHeebbcGKKT', 'VXfjBPGHHBI[sN`_X[j[UU', 'jFePSKKMU', 'ACYDDPGSQIIHxKKKKT][TTMXpD', 'ADASZeeeHSPHGSHKlMEMIrT', 'eeCaeePHSSHKKKMMMU', 'yHDrrrrePNbbPJKKcTc]', 'VDPSGSK[TTMTUXUD', 'HVDsVkNQIVPISGHHHK[BsMTXpD', 'CPBIGHHHKcMRM', 'FeeNPPKcTIM', 'ZDjSSaZFPvKKl[M[rrrrUKXp']
# wrap every trace in the start/end markers, updating the list in place
method_mining[:] = ['<' + trace + '>' for trace in method_mining]
print(method_mining[0])

a, b = frequent_loops(method_mining, topn=250)
# print((a, b))

# render patterns and counts as comma-terminated strings
ptns = ''.join(pattern + ',' for pattern in a)
cnts = ''.join(str(count) + ',' for count in b)

print(ptns)
print('-------------------')
print(cnts)


<ABBBCCDEFGHIJKLMMM>
{'KT': 1, 'KS': 2, 'M[': 3}
K,<,>,S,T,M,P,H,U,V,KT,[,C,A,B,U>,F,SK,D,I,G,Y,c,HK,],e,X,N,Q,L,J,PS,f,j,<A,O,SH,Z,d,E,a,FP,KM,Kc,YK,`,MU,b,JK,^,PG,cT,LM,TM,TK,k,[M,[T,TU,MT,l,m,<V,AB,M>,MU>,K[,GS,KL,W,<C,r,SY,1,TU>,_,PH,SHK,<f,]M,M],o,HS,p,Vd,T>,M[,AD,GK,DP,q,s,KU,cK,GH,KS,[K,PN,Tc,BI,FPS,KMU>,KMU,CV,T[,SG,VF,eP,SYK,<AB,MTU,NQ,LT,I[,KY,T],K>,VD,K^,SP,p>,[>,FS,PSK,TL,<H,IG,2,eS,PGS,MTU>,S[K,PI,[L,S[,KX,DB,\,L[,bS,KV,[`,BP,g,KE,VS,<F,PGK,VP,PHS,[j,]T,KTL,Vk,KI,IF,d`,T[M,<m,KA,NP,KU>,HK[,<e,Kl,CD,R,HKT,CE,WX,]^,QP,X_,[KL,BY,NG,eF,EK,<K,KH,fX,h,DX,Aj,AO,<k,TKM,GY,lK,jM,TX,<CV,IS,GI,[],jV,ZA,T^,3,DPS,
-------------------
127,92,92,88,82,76,69,55,53,52,51,50,45,43,43,42,41,38,37,36,36,36,35,34,33,33,32,30,29,29,23,23,22,21,21,19,18,18,18,17,17,17,17,17,17,16,16,16,15,14,14,14,14,13,13,13,13,13,12,12,12,12,12,12,11,11,11,11,11,10,10,10,10,10,9,9,9,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4

In [None]:
# dump patterns
import csv
# a, b = frequent_ngrams(traces, topn=500)
a, b = frequent_loops(traces, topn=150)
# newline='' hands line-ending control to the csv writer, as the csv module
# documentation requires for files passed to csv.writer
with open("sample_frequent_loops.csv", 'w', newline='') as csvfile:
    results_writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # first row: patterns, second row: their frequencies
    results_writer.writerow(a)
    results_writer.writerow(b)

In [None]:
# @Anton alternative algorithm: suffix array -> LCP array
def suffix_array_oneliner(s):
    """Return the (suffix, start index) pairs of *s* as a list sorted by suffix."""
    # one pair per start position; tuples sort by suffix first
    return sorted((s[start:], start) for start in range(len(s)))

# preview the first two entries of the suffix array built over all traces joined with '!'
suffix_array_oneliner('!'.join(traces))[:2]
# TODO longest common prefix (LCP) array

In [None]:
# 3. aggregate frequent sequences
# https://en.wikipedia.org/wiki/Knapsack_problem LP or greedy optimization
# Constraints: (1) size - at most n patterns; (2) length (tree depth) - at most k symbols per pattern;
# (3) completeness - each component must begin with a start symbol and end with the end symbol;
# (4) at most k loops per component


In [None]:
# 4. compare the models

In [None]:
# Use algorithms: 
