# Frequent Sequence Mining

## Algorithm
### Suffix Array

In [3]:
test_string = 'banana\nanaK'
test_strings = test_string.split('\n')
print(test_strings)

['banana', 'anaK']


In [4]:
# suffix array
# https://louisabraham.github.io/notebooks/suffix_arrays.html

def suffix_array_oneliner(s):
    return [(suffix, rank) for suffix, rank in sorted((s[i:], i) for i in range(len(s)))]

def suffix_array_wo_sort(s):
    return [suffix for suffix in (s[i:] for i in range(len(s)))]

suffix_array_wo_sort(test_string)

['banana\nanaK',
 'anana\nanaK',
 'nana\nanaK',
 'ana\nanaK',
 'na\nanaK',
 'a\nanaK',
 '\nanaK',
 'anaK',
 'naK',
 'aK',
 'K']

### Frequent n-grams

k-common substrings problem (generalisation of Longest_common_substring)  
https://en.wikipedia.org/wiki/Longest_common_substring_problem

Find the most frequent (longest) substring O(n log n): suffix array -> longest common prefix (LCP) array (**suffix tree**)
https://mediatum.ub.tum.de/doc/1094574/1094574.pdf  
https://cs.stackexchange.com/questions/9555/computing-the-longest-common-substring-of-two-strings-using-suffix-arrays
http://www.roman10.net/2012/03/16/suffix-array-part-3-longest-common-substring-lcs/

In [6]:
# https://stackoverflow.com/questions/40556491/how-to-find-the-longest-common-substring-of-multiple-strings
from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))


def allngram(seq: str, minn=1, maxn=None) -> Iterator[str]:
    lengths = range(minn, maxn) if maxn else range(minn, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def frequent_ngrams(strings, min_support=None, topn=5):
    # 1.split traces into ngrams
    seqs_ngrams = map(allngram, strings)

    # 2.count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))

    # 3.filter frequent substrings
    # set frequency threshold if not specified
    if not min_support:
        most_frequent_s = [s for s, count in counts.most_common(topn)]
        # maximum frequency
#         most_frequent1 = counts.most_common(1)[0]
#         min_support = most_frequent1[1]
    else:
#         print(min_support)
        most_frequent={string: count for string, count in counts.items() if count >= min_support}
    #     print(most_frequent)
        most_frequent_s = list(most_frequent.keys())

    # 4.drop substrings
    most_frequent_s.sort(key=len, reverse=True)
#     print(most_frequent_s)
    lfss = [most_frequent_s[0]]
    for s in most_frequent_s[1:]:
        overlap = False
        for lfs in lfss:
            if s in lfs:
                overlap = True
                break
        if not overlap:
            lfss.append(s)
    # result: longest frequent substrings with counts
    return lfss, [counts[s] for s in lfss]


print(frequent_ngrams(test_strings))

(['ana'], [2])


## Mine Conversation Log

In [23]:
# get conversation log
import requests
data_path = "../data/scs/"
traces_path = "original_scs.stringenc.txt"
log_string = requests.get(data_path+traces_path).text
# print(len(log_string))
# experiment on a log snippet
log_snip = log_string[:200]
print(log_snip)

traces = log_string.split('\n')
print('\n')
print("%d traces"%len(traces))
print(traces[0])

MissingSchema: Invalid URL '../data/scs/original_scs.stringenc.txt': No schema supplied. Perhaps you meant http://../data/scs/original_scs.stringenc.txt?

In [19]:
# dictionary: https://github.com/svakulenk0/conversation_mining/blob/master/data/1_dstc1.stringenc.dict.txt
print(frequent_ngrams(traces, topn=5))
# print(frequent_ngrams(traces, min_support=10000))  # TODO min support as a fraction of the number of traces
# TODO plot frequency distribution

TypeError: frequent_ngrams() got an unexpected keyword argument 'topn'

In [1]:
# TODO look up sample traces


In [27]:
# collect ngrams
from functools import partial, reduce
from itertools import chain
from typing import Iterator

from collections import Counter


def ngram(seq: str, n: int) -> Iterator[str]:
    return (seq[i: i+n] for i in range(0, len(seq)-n+1))

def allngram(seq: str, minn=4, maxn=5) -> Iterator[str]:
    lengths = range(minn, maxn) if maxn else range(minn, len(seq))
    ngrams = map(partial(ngram, seq), lengths)
    return set(chain.from_iterable(ngrams))


def frequent_ngrams(strings, min_support=2):
    # 1.split traces into ngrams
    seqs_ngrams = map(allngram, strings)

    # 2.count ngram frequencies
    counts = Counter(chain.from_iterable(seqs_ngrams))
    most_frequent={string: count for string, count in counts.items() if count >= min_support}
    sorted_patterns_keys = sorted(most_frequent, reverse=True, key=most_frequent.get)
    
    # load vocabulary to decode patterns
    vocabulary = {'<': 'START', '>': 'END'}
    # look-up default location
    vocabulary_path = traces_path.split('.')[0] + '.vocabulary.txt'
    with open(data_path+vocabulary_path) as v:
        for line in v.readlines():
#             print(line)
            symbol = line[0]
            label = line.strip('\n')[2:]
            vocabulary[symbol] = label
    
    for pattern in sorted_patterns_keys:
        print(' -> '.join([vocabulary[s] for s in pattern]), most_frequent[pattern])
#     return sorted_patterns_keys


frequent_ngrams(traces)

Scanning document without modification -> Information request within document -> Within-Document search result -> Information request within document 4
Information request within document -> Within-Document search result -> Information request within document -> Within-Document search result 4
Scanning document without modification -> Information request within document -> Within-Document search result -> Confirms 4
Initial information request -> Query refinement offer -> Intent clarification -> SERP without modification 3
Confirms -> Scanning document without modification -> Information request within document -> Within-Document search result 3
Access source -> Checks navigational command -> Confirms -> Scanning document without modification 3
Within-Document search result -> Information request within document -> Within-Document search result -> Information request within document 3
Confirms -> Scanning document without modification -> Confirms -> Confirms 3
Query refinement offer ->