Count frequent subsequences that contain only unique symbols

In [13]:
# 1. load traces
import requests
# data_path = "https://raw.githubusercontent.com/svakulenk0/conversation_mining/master/data/"
# log_name = "1_dstc1.stringenc.txt"
# Location of the trace log; every line of the file is treated as one trace.
data_path = "https://raw.githubusercontent.com/svakulenk0/conversation_mining/master/data/scs/"
log_name = "original_scs.stringenc.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as traces.
response = requests.get(data_path + log_name, timeout=30)
response.raise_for_status()
log_string = response.text
# Wrap each trace in "<" and ">" so that its start and end become symbols of
# their own. Blank lines (e.g. the one after a trailing newline) are skipped,
# otherwise they would be counted as an empty "<>" trace.
traces = ["<%s>" % t for t in log_string.split('\n') if t]
print("%d traces" % len(traces))
if traces:
    print(traces[0])

37 traces
<zcTabTabgmcTc3Tacda>


In [14]:
# 2. extract sequences frequent across multiple traces
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings

from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    '''
    Lazily produce every contiguous window of length n in seq, left to right.

    Nothing is produced when n is larger than len(seq).
    '''
    window_count = len(seq) - n + 1
    return (seq[start:start + n] for start in range(window_count))


def allngram(seq: str, minn=1, maxn=None) -> set:
    '''
    Collect the distinct contiguous substrings (n-grams) of seq.

    minn: shortest n-gram length to include.
    maxn: exclusive upper bound on the n-gram length. When it is not given
          (None or 0) every length up to and including len(seq) is used, so
          the whole sequence is part of the result.

    Returns a set: an n-gram that occurs several times in seq appears once.
    '''
    # The default bound is len(seq) + 1 because range() excludes its stop
    # value; stopping at len(seq) would leave out the full sequence and
    # return nothing at all for a one-symbol sequence.
    stop = maxn if maxn else len(seq) + 1
    return {
        seq[start:start + n]
        for n in range(minn, stop)
        for start in range(len(seq) - n + 1)
    }


def frequent_ngrams(strings, min_support=None, topn=5):
    '''
    Find the n-grams shared by the most strings.

    strings: iterable of strings (traces) to mine.
    min_support: if given (and non-zero), keep every n-gram found in at least
                 this many strings; topn is ignored and the result is not
                 sorted by frequency.
    topn: used only when min_support is not given; keep the topn most
          frequent n-grams, most frequent first.

    Returns (ngrams, counts): two parallel lists, where counts[k] is the
    number of strings that contain ngrams[k].
    '''
    # allngram() yields each distinct n-gram of a string once, so adding up
    # the per-string sets counts strings, not occurrences.
    support = Counter()
    for trace in strings:
        support.update(allngram(trace))

    if min_support:
        selected = [gram for gram, seen in support.items() if seen >= min_support]
    else:
        selected = [gram for gram, _ in support.most_common(topn)]

    return selected, [support[gram] for gram in selected]
    
    # 4.drop substrings
    #     most_frequent_s.sort(key=len, reverse=True)
    #     #     print(most_frequent_s)
    #     lfss = []
    #     for s in most_frequent_s:
    #         overlap = False
    #         for lfs in lfss:
    #             if s in lfs:
    #                 overlap = True
    #                 counts[lfs] += counts[s]
    #                 break
    #         if not overlap:
    #             lfss.append(s)
    #     # result: longest frequent substrings with counts
    #     return lfss, [counts[s] for s in lfss]

# Mine the 100 n-grams that are found in the most traces and show them
# together with the number of traces each one occurs in.
patterns, counts = frequent_ngrams(traces, topn=100)
print((patterns, counts))
# Alternative: select by a support threshold instead of a fixed top-n.
# print(frequent_ngrams(traces, min_support=200))

(['>', 'z', '<z', '<', '7', 'd', 'c', 'j', 'a', 'f', 'm', 'N', 'b', 'G', '3', 'ab', '4', 'T', 'GN', '7f', '7c', 'p', 'j7', 'fa', '5', 'cT', 'G7', 'jp', 'P', 'C', 'zj', '77', '<zj', 'S', '7d', 'da', 'aba', 'af', 'zG', '"', '<zG', 'G7c', 'ba', 'k', 'Nf', 'f7', '6', 'd3', 'mG', '7m', 'fab', 'K', 'H', 'zc', '<zc', 'mc', 'mj', 'l', 'a7', 'D', 'fm', 'zGN', 'NG', '<zGN', 'E', '1', 'j5', 'zjp', '<zjp', '7a', '7G', 'b7', 'md', 'c7', 'f7f', 'GNc', '7fa', 'D$', '$', 'Tf', 'Nj', 'Nc', 'fj', 'Nd', '17', 'dab', 'bm', 'b>', 'ab>', 'bab', '7ab', '<zS', 'zS', 'jK', 'GNf', 'fC', 'ab7', 'mf', 'dm', 'f3'], [37, 37, 37, 37, 28, 26, 25, 24, 23, 22, 21, 21, 20, 20, 18, 17, 15, 13, 13, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])


In [15]:
# group repeating chars into loops with () symbols
from collections import defaultdict

def _strip_loop_markers(text):
    '''Return text without the "(" and ")" characters used to mark loops.'''
    return ''.join(ch for ch in text if ch not in '()')


def _collapse_loops(pattern, loop_ids):
    '''
    Rewrite one pattern so that a repeated stretch is shown as a loop.

    The pattern is read symbol by symbol. When a symbol was already seen,
    everything from its first occurrence onwards is treated as the body of a
    loop. New loop bodies are registered in loop_ids (body -> 1-based id), so
    loop_ids is updated in place.

    Returns (marked, numbered): the pattern with the loop body in
    parentheses, and the same pattern with the loop replaced by its id.
    '''
    marked, numbered = "", ""
    for symbol in pattern:
        if symbol not in marked:
            marked += symbol
            numbered += symbol
            continue
        start = marked.index(symbol)
        loop = _strip_loop_markers(marked[start:])
        if len(loop) <= 1:
            # Repeating only the last symbol is not a loop; drop the repeat.
            continue
        if loop not in loop_ids:
            loop_ids[loop] = len(loop_ids) + 1
        # Take the prefix once, BEFORE marked is overwritten. Slicing the
        # rewritten string instead picks up symbols of the new loop whenever
        # the prefix held the markers of an earlier loop.
        prefix = _strip_loop_markers(marked[:start])
        marked = prefix + "(%s)" % loop
        numbered = prefix + str(loop_ids[loop])
    return marked, numbered


def frequent_loops(traces, topn=500):
    '''
    collect frequent patterns with loops

    The topn most frequent n-grams of traces are taken from
    frequent_ngrams(); in each of them a repeated stretch is collapsed into a
    loop, and n-grams that collapse to the same pattern have their counts
    added up.

    Prints the loop encoding (loop body -> loop id) as a side effect.

    Returns (patterns, counts): two parallel lists ordered by decreasing
    count. In the patterns a loop is written as its id; the id is plain
    decimal digits, so it cannot be told apart from digit symbols that occur
    in the traces themselves.
    '''
    # get frequent ngram patterns
    patterns, counts = frequent_ngrams(traces, topn=topn)

    loop_counts = Counter()
    numbered_by_marked = {}
    loop_ids = {}
    for pattern, count in zip(patterns, counts):
        marked, numbered = _collapse_loops(pattern, loop_ids)
        numbered_by_marked[marked] = numbered
        loop_counts[marked] += count

    # show loop encoding
    print(loop_ids)
    ranked = loop_counts.most_common()
    patterns_w_loop_ids = [numbered_by_marked[marked] for marked, _ in ranked]
    return patterns_w_loop_ids, [count for _, count in ranked]


# Mine the 250 most frequent n-grams and collapse their repetitions into loops.
a, b = frequent_loops(traces, topn=250)


# Show the extracted patterns on one line and their counts on the next;
# every item is followed by a comma.
ptns = ''.join(pattern + ',' for pattern in a)
cnts = ''.join(str(count) + ',' for count in b)

print(ptns)
print(cnts)

{'ab': 1, 'f7': 2, 'ba': 3, '7G': 4, '7f': 5, 'NG': 6, 'GN': 7, 'aj': 8, 'cT': 9, '7mj': 10, '7cT1': 11}
7,>,z,<z,<,d,c,j,a,f,3,m,N,b,G,ab,4,T,GN,7f,7d,1,7c,p,j7,fa,5,f7,3,cT,G7,jp,P,C,zj,7m,<zj,S,da,af,zG,",<zG,G7c,ba,k,Nf,6,d3,mG,fab,K,H,zc,<zc,mc,mj,l,a7,D,fm,zGN,NG,<zGN,E,1,j5,zjp,<zjp,7a,7G,b7,md,c7,2,5,GNc,7fa,D$,$,Tf,Nj,Nc,fj,Nd,17,dab,bm,b>,ab>,7ab,<zS,zS,jK,GNf,fC,ab7,mf,dm,f3,76,7>,ad,g,c3,mj7,f1,5c,T17,T1,abm,Td,cC,),S>,Nfa,e,74,fab7,G5,o,zj5,d7,cG,7Z,Z,4m,d>,Tc,3T,Ta,mcT,c3T,Gl,s,mGN,7cT,<zGNc,b3,zGNc,3m,cTf,7P,7fab,jN,c>,T17f,cTd,7c>,17f,c),pc,bC,V,4,abC,mG5,K7,b7f,N7,dj,GNfa,fjK,f>,cj,7N,-,6",P-,y,bj,:,*,fod,od,Sj,"7,fo,eN,z7,<z7,z76,<z76,cw,w,6,fk,5H,<zj5,k7,dG,7d3,7,3d,HN,2,3>,4N,aj,ja,8,m7,7cm,cm,gm,ac,cTa,cd,9,zcT,<zcT,mG7c,T$7,mGNj,$7,baf,jpG,l7,X,Tfa,10,7mj,Gm,T$,1f,pG,$3,cT$,s4,G7cT,GNj,Tfab,37,%,mG7,D$3,t,cT$7,tN,NG7,11fa,
38,37,37,37,37,26,25,24,23,22,22,21,21,20,20,17,15,13,13,12,12,12,11,11,11,11,11,11,11,10,10,10,10,10,10,10,9,9,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,6

In [9]:
# dump patterns
import csv
# a, b = frequent_ngrams(traces, topn=500)
# Mine the 150 most frequent n-grams with loops collapsed and write them as
# two CSV rows: the patterns first, then their counts.
a, b = frequent_loops(traces, topn=150)
# newline='' hands line-ending control to the csv module; without it an extra
# blank line is written after every row on platforms that translate "\n".
with open("sample_frequent_loops.csv", 'w', newline='') as csvfile:
    results_writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    results_writer.writerow(a)
    results_writer.writerow(b)

{'BA': 1, 'AB': 2}
