Count frequent subsequences that contain only unique symbols

In [5]:
# 1. load traces
import requests
# data_path = "https://raw.githubusercontent.com/svakulenk0/conversation_mining/master/data/"
# log_name = "1_dstc1.stringenc.txt"
data_path = "https://raw.githubusercontent.com/svakulenk0/conversation_mining/master/data/scs/"
log_name = "roles_scs.stringenc.txt"
log_string = requests.get(data_path+log_name).text
traces = ["<%s>" % t for t in log_string.split('\n')]
print("%d traces" % len(traces))
print(traces[0])

37 traces
<ABAABAABBABABBAABBA>


In [4]:
# 2. extract sequences frequent across multiple traces
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings

from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))


def allngram(seq: str, minn=1, maxn=None) -> Iterator[str]:
    lengths = range(minn, maxn) if maxn else range(minn, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def frequent_ngrams(strings, min_support=None, topn=5):
    
    # 1.split traces into ngrams
    seqs_ngrams = map(allngram, strings)
    # 2.count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))
    
#     return counts.most_common(topn)
    # 3.filter frequent substrings
    # set frequency threshold if not specified
    if not min_support:
        most_frequent_s = [s for s, count in counts.most_common(topn)]
        # maximum frequency
#         most_frequent1 = counts.most_common(1)[0]
#         min_support = most_frequent1[1]
    else:
#         print(min_support)
        most_frequent={string: count for string, count in counts.items() if count >= min_support}
    #     print(most_frequent)
        most_frequent_s = list(most_frequent.keys())
    
    return most_frequent_s, [counts[s] for s in most_frequent_s]
    
    # 4.drop substrings
    #     most_frequent_s.sort(key=len, reverse=True)
    #     #     print(most_frequent_s)
    #     lfss = []
    #     for s in most_frequent_s:
    #         overlap = False
    #         for lfs in lfss:
    #             if s in lfs:
    #                 overlap = True
    #                 counts[lfs] += counts[s]
    #                 break
    #         if not overlap:
    #             lfss.append(s)
    #     # result: longest frequent substrings with counts
    #     return lfss, [counts[s] for s in lfss]

patterns, counts = frequent_ngrams(traces, topn=100)
print((patterns, counts))
# print(frequent_ngrams(traces, min_support=200))

(['A', 'B', 'AB', 'BA', 'BABA', 'BAB', 'ABAB', 'ABA', 'BABAB', 'ABABA', 'ABABAB', 'BABABA', 'BABABAB', 'ABABABA', 'ABABABAB', 'BABABABA', 'ABB', 'BBA', 'BB', 'BABB', 'ABABB', 'ABBA', 'BABABABAB', 'ABABABABA', 'BBABA', 'BBABAB', 'BABBA', 'ABABBA', 'BBAB', 'BABABB', 'ABBAB', 'ABBABAB', 'ABBABA', 'BABABBA', 'BABBABA', 'BABABBAB', 'ABABBAB', 'BABBABAB', 'BABABBABA', 'BABBAB', 'ABABBABA', 'BABABBABAB', 'ABABABABAB', 'ABABABB', 'ABABBABAB', 'AA', 'BAA', 'BABABABABA', 'ABABABBABA', 'ABABABBAB', 'ABABABBA', 'BAAB', 'AAB', 'BABABABABAB', 'ABABABABB', 'ABBABABA', 'BBABABA', 'BABABABB', 'ABABABBABAB', 'BABBABABA', 'AABA', 'ABAA', 'BAABA', 'BABABBABABA', 'ABABABABABAB', 'ABABBABABA', 'ABABABABABA', 'ABAAB', 'ABAABA', 'BABBABABAB', 'BAABABA', 'ABABABABABABAB', 'ABABABABBA', 'ABBABABAB', 'BABABABBA', 'BBABABAB', 'BABABABBAB', 'AABAB', 'BABABABBABA', 'BABABABABABABA', 'ABABABABBAB', 'BAABAB', 'BABABABABABAB', 'BABABABABABA', 'AABABA', 'ABABABABABABA', 'ABABABABBABA', 'ABABAA', 'BABAA', 'ABABBABABAB',

In [8]:
# group repeating chars into loops with () symbols
from collections import defaultdict

def frequent_loops(traces, topn=500):
    '''
    collect frequent patterns with loops
    '''
    # get frequent ngram patterns
    patterns, counts = frequent_ngrams(traces, topn=topn)
    
    loop_patterns = Counter()
    loop_patterns_num = {}
    loops = defaultdict(int)
    loop_ids = {}
    n_loops = 0
    for i, pattern in enumerate(patterns):
        loop_pattern, loop_pattern_num = "", ""
        for c in pattern:
            if c not in loop_pattern:
                loop_pattern += c
                loop_pattern_num += c
            else:
                loop_start_idx = loop_pattern.index(c)
                loop = ''.join([c for c in loop_pattern[loop_start_idx:] if c not in '()'])
                if len(loop) > 1:
                    loop_set = loop
    #                 loop_set = "".join(set(loop))
                    if loop_set not in loops:
                        n_loops += 1
                        loops[loop_set] += counts[i]
                        loop_ids[loop_set] = n_loops
                    loop_pattern = ''.join([c for c in loop_pattern[:loop_start_idx] if c not in '()']) + "(%s)" % loop
                    loop_pattern_num = ''.join([c for c in loop_pattern[:loop_start_idx] if c not in '()']) + str(loop_ids[loop_set])
    #     if not numeric and loop_pattern != pattern:
    #         print(pattern)
    #         print (loop_pattern)
    #         print('\n')
    #     if loop_pattern_num:
        loop_patterns_num[loop_pattern] = loop_pattern_num
        loop_patterns[loop_pattern] += counts[i]
#     print((list(loop_patterns.keys()), list(loop_patterns.values())))
#     print(loops)

    # show loop encoding
    print(loop_ids)
    patterns_w_loop_ids = [loop_patterns_num[p] if p in loop_patterns_num else p for p, c in loop_patterns.most_common()]
    counts = [c for p, c in loop_patterns.most_common()]
    return patterns_w_loop_ids, counts


a, b = frequent_loops(traces, topn=250)


# show extracted patterns
ptns = ''
cnts = ''
for i in a:
    ptns += i + ','
for i in b:
    cnts += str(i) + ','
    
print (ptns)
print (cnts)  

{'BA': 1, 'AB': 2}
1,2,<2,AB,BA,B,A,1>,2>,>,<A,<,<AB,B>,AB>,A>,BA>,
1477,1455,217,78,77,61,55,43,38,37,37,37,37,26,23,11,10,


In [9]:
# dump patterns
import csv
# a, b = frequent_ngrams(traces, topn=500)
a, b = frequent_loops(traces, topn=150)
with open("sample_frequent_loops.csv", 'w') as csvfile:
    results_writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    results_writer.writerow(a)
    results_writer.writerow(b)

{'BA': 1, 'AB': 2}
