### Implement a **contiguous sequential pattern mining** algorithm and apply it to text data to mine potential phrase candidates

In [59]:
from collections import Counter

def get_word_freq(documents):
    """Count word occurrences and record every word position per document.

    Documents are tokenized with ``str.split()`` (any run of whitespace is one
    separator), which is the tokenization ``frequent_pattern_mining`` applies
    to the same documents. Splitting on a single space instead would turn
    double spaces and empty lines into ``''`` "words" and shift the positions
    away from the tokens the miner actually sees.

    Input
    -----

    documents : list(str)
        the input corpus, one whitespace-separated document per entry

    Output
    ------

    total_words : int
        number of word tokens across all documents
    word_freq : collections.Counter
        raw frequency of each word in the corpus
    active_indices : list(list(int))
        for each document, the positions ``0 .. len(words) - 1`` of its words
    """
    total_words = 0
    word_freq = Counter()
    active_indices = []
    for doc in documents:
        words = doc.split()
        word_freq.update(words)
        total_words += len(words)
        # Every word position starts out active; the miner prunes them later.
        active_indices.append(list(range(len(words))))

    return total_words, word_freq, active_indices

def frequent_pattern_mining(documents, min_support, max_phrase_size, word_freq, active_indices):
    """
    Performs frequent pattern mining to collect aggregate counts for all contiguous phrases in the
    input document that satisfy a certain minimum support threshold.

    Phrases are grown one word at a time: an n-word phrase starting at position
    ``i`` is counted only when the (n-1)-word phrases starting at ``i`` and at
    ``i + 1`` both meet ``min_support``. Documents with no surviving start
    position are dropped from later rounds.

    Input
    -----

    documents : list(str)
        the input corpus
    min_support : int
        minimum support threshold which must be satisfied by each phrase.
    max_phrase_size : int
        maximum allowed phrase size (inclusive); values below 2 return only
        the frequent single words
    word_freq : dict-like
        raw frequency of each word in the input corpus (not modified)
    active_indices : list(list(int))
        for each document, the sorted word positions still considered as
        phrase starts

    Output
    ------

    hash_counter : collections.Counter
        maps each phrase (words joined by a single space) whose count is at
        least ``min_support`` to that count. The empty-string key ``''`` may
        also be present: it tallies gaps between active positions and is not
        a phrase, so callers should discard it.
    """
    # Counter(...) copies the input and also accepts a plain dict, so phrases
    # that were never counted read as 0 instead of raising KeyError.
    hash_counter = Counter(word_freq)
    n = 2

    # Each round counts phrases of exactly n words; stop once no document has
    # candidates left or the size limit has been reached.
    while len(documents) > 0 and n <= max_phrase_size:
        temp_documents = []
        new_active_indices = []
        for d_i, doc in enumerate(documents):
            # Tokenize once per document rather than once per index.
            words = doc.split()

            # Keep the start positions whose (n-1)-word phrase is frequent.
            new_word_indices = [
                index
                for index in active_indices[d_i]
                if index + n - 2 < len(words)
                and hash_counter[" ".join(words[index:index + n - 1])] >= min_support
            ]

            # Drop the document when it has no frequent (n-1)-word phrase left.
            if not new_word_indices:
                continue

            new_active_indices.append(new_word_indices)
            temp_documents.append(doc)
            for i, following in zip(new_word_indices, new_word_indices[1:]):
                if following == i + 1:
                    # Both overlapping (n-1)-word phrases are frequent, so the
                    # n-word phrase starting at i is a candidate.
                    phrase = " ".join(words[i:i + n])
                else:
                    # A gap yields no phrase. It is tallied under '' so that
                    # existing callers that pop '' from the result keep working.
                    phrase = ""
                hash_counter[phrase] += 1

        documents = temp_documents
        active_indices = new_active_indices
        n += 1

    # Filter on items() rather than elements(), which would iterate once per
    # occurrence of every phrase.
    hash_counter = Counter(
        {phrase: count for phrase, count in hash_counter.items() if count >= min_support}
    )

    return hash_counter

In [70]:
# Load the review corpus: one document per line of the text file.
with open('reviews_sample.txt', 'r') as review_file:
    raw_text = review_file.read()
data = raw_text.split('\n')
data[:5]

['hoagie institution walking doe seem like throwback year ago old fashioned menu board booth large selection food speciality italian hoagie voted best area year year usually order burger patty obviously cooked frozen ingredient fresh overall good alternative subway road',
 'excellent food superb customer service miss mario machine used still great place steeped tradition',
 'yes place little dated opened weekend staff always pleasant fast make order always spot fresh veggie hoggies food also daily special ice cream really good banana split piled topping win pennysaver award ever year see',
 'food great best thing wing wing simply fantastic wet cajun best popular also like seasoned salt wing wing night monday wednesday night whole wing dining area nice family friendly bar nice well place truly yinzer dream pittsburgh dad would love place',
 'checked place past monday wing night heard wing great decided finally time check wing whole wing crispy nice change pace got wet cajun sauce garlic

In [93]:
docs = data
min_support = 100
max_phrase_size = 5

total_words, word_freq, active_indices = get_word_freq(docs)

vocab_size = len(word_freq)

hash_counter = frequent_pattern_mining(docs, min_support, max_phrase_size, word_freq, active_indices)
# The empty string is not a real phrase; drop it if present. The default
# keeps this cell from raising KeyError when '' did not reach min_support.
hash_counter.pop('', None)
len(hash_counter)

1176

In [94]:
# Write one pattern per line as "<support>:<word>;<word>;...", highest support first.
with open('patterns.txt', 'w') as out_file:
    ranked_patterns = sorted(hash_counter.items(), key=lambda entry: entry[1], reverse=True)
    pattern_lines = []
    for (k, v) in ranked_patterns:
        pattern_lines.append(f'{v}:{";".join(k.split())}')
    out_file.write('\n'.join(pattern_lines))