## Read

In [291]:
import pandas as pd
import numpy as np
from collections import Counter

In [292]:
# Load the corpus: one review per line, stored in a single column labelled 0.
# The file is read line by line instead of via pd.read_table(..., sep='\n'),
# because current pandas rejects the line terminator as a separator.
# Lines without any non-whitespace content are skipped.
with open('reviews_sample.txt', encoding='utf-8') as review_file:
    reviews = pd.DataFrame(
        [line.rstrip('\n') for line in review_file if line.strip()],
        columns=[0],
    )
reviews.shape

(10000, 1)

In [293]:
# Relative minimum support, as a fraction of the number of reviews.
sup = 0.01
# Absolute threshold: the fraction above scaled by the row count of `reviews`
# and truncated to an integer.
min_sup = int(sup * len(reviews))

In [294]:
# Turn every review string (column 0) into its list of space-separated tokens.
review_list = reviews[0].apply(lambda text: text.split(' '))

## Length 1 patterns

In [295]:
from collections import defaultdict

# Position index: token -> list of [sid, eid] pairs, where sid is the position
# of the review in `review_list` and eid is the position of the token inside
# that review.
db = defaultdict(list)

# The loop variables are deliberately NOT called `reviews`/`review`: reusing
# `reviews` here would overwrite the DataFrame loaded earlier and break any
# cell that is re-run afterwards.
for sid, tokens in enumerate(review_list):
    for eid, token in enumerate(tokens):
        db[token].append([sid, eid])

In [296]:
# Frequent single tokens: token -> DataFrame indexed by `sid` with one `eid`
# column, holding one row per occurrence of the token.
db1 = defaultdict(pd.DataFrame)
for key, items in db.items():
    # Support = number of distinct reviews (sids) that contain the token.
    support = len({sid for sid, _ in items})
    if support > min_sup:
        # Build the frame directly; concatenating with an empty default frame
        # adds nothing to the result.
        db1[key] = pd.DataFrame(items, columns=['sid', 'eid']).set_index('sid')

## Length 2 patterns

In [198]:
def close(data, column1, column2):
    """Keep the rows of `data` whose two position columns are adjacent.

    A row is kept when the absolute difference between `data[column1]` and
    `data[column2]` is strictly greater than 0 and strictly less than 2.

    Parameters
    ----------
    data : DataFrame containing both columns.
    column1, column2 : labels of the columns to compare.

    Returns
    -------
    The subset of `data` selected by that condition.
    """
    gap = (data[column1] - data[column2]).abs()
    adjacent = (gap > 0) & (gap < 2)
    return data[adjacent]

In [281]:
%%time
# 23min (without index)
# 23min (with index)

# Frequent adjacent pairs, order not yet resolved:
# "key1;key2" -> rows where the two tokens sit next to each other in a review.
db2 = defaultdict(pd.DataFrame)
# Shrinking copy of db1: once a token has been used as key1 it is removed, so
# every unordered pair (including a token with itself) is visited only once.
db1copy = db1.copy()

for key1 in db1:
    items1 = db1[key1]
    for key2 in db1copy:
        # Pair up every occurrence of key1 with every occurrence of key2 that
        # lies in the same review (join on the sid index) ...
        joined = items1.merge(db1copy[key2], left_index=True, right_index=True, how='inner')
        # ... and keep only the pairs whose positions are contiguous.
        joined = close(joined, 'eid_x', 'eid_y')

        # Support = number of distinct reviews left after the adjacency filter.
        if joined.index.nunique() > min_sup:
            db2[key1 + ";" + key2] = joined

    # key1 has now been paired with every remaining token.
    del db1copy[key1]

Wall time: 23min 20s


In [282]:
# Show the "key1;key2" labels of the adjacent pairs that passed the support check.
db2.keys()

dict_keys(['like;place', 'like;really', 'like;would', 'like;look', 'like;looked', 'like;feel', 'like;felt', 'like;tasted', 'year;ago', 'selection;beer', 'food;good', 'food;service', 'food;great', 'food;whole', 'food;quality', 'best;one', 'cooked;perfectly', 'good;service', 'good;place', 'good;also', 'good;really', 'good;thing', 'good;pretty', 'good;price', 'good;get', 'customer;service', 'service;great', 'service;friendly', 'great;place', 'great;price', 'place;pittsburgh', 'place;love', 'place;get', 'place;one', 'staff;friendly', 'staff;wait', 'make;sure', 'ice;cream', 'really;nice', 'thing;one', 'night;last', 'night;saturday', 'pittsburgh;restaurant', 'would;definitely', 'would;recommend', 'time;first', 'time;last', 'time;every', 'time;long', 'time;next', 'better;much', 'definitely;back', 'come;back', 'pretty;much', 'lot;parking', 'hot;dog', 'fish;sandwich', 'fry;french', 'highly;recommend', 'price;reasonable', 'going;back', 'hour;happy', 'even;though', 'strip;district', 'across;stree

In [284]:
# Resolve the direction of each adjacent pair: "first;second" -> rows of that order.
db2split = defaultdict()
for key, items in db2.items():
    # True where the token named first in the key also appears first in the review.
    x_first = items['eid_x'] < items['eid_y']
    forward = items[x_first]
    backward = items[~x_first]

    # Order as written in the key.
    if forward.index.nunique() > min_sup:
        db2split[key] = forward

    # Opposite order: stored under the swapped key. The frame keeps its
    # original columns, so eid_x still refers to the token named first in `key`.
    if backward.index.nunique() > min_sup:
        parts = key.split(';')
        db2split[parts[1] + ';' + parts[0]] = backward

## Length 3 patterns

In [286]:
%%time
# Frequent contiguous triples: "a;b c" -> DataFrame indexed by `sid` with one
# `eid` column holding the position of the first token of each occurrence.
db3 = defaultdict(pd.DataFrame)

# Start position of every ordered pair occurrence. In db2split the token that
# comes first may sit in either eid_x or eid_y, so take the smaller of the two.
pair_starts = {
    key: (
        items[['eid_x', 'eid_y']].min(axis=1)
        .rename('eid')
        .rename_axis('sid')
        .reset_index()
        .drop_duplicates()
    )
    for key, items in db2split.items()
}

for key1, starts1 in pair_starts.items():
    key1last = key1.split(';')[1]
    for key2, starts2 in pair_starts.items():
        key2first = key2.split(';')[0]
        key2last = key2.split(';')[1]

        # Only "a;b" followed by "b;c" can form "a b c".
        if key1last != key2first:
            continue

        # The shared middle token must be the very same occurrence: "b c" has
        # to start exactly one position after "a b" in the same review.
        # Joining on sid alone would also accept pairs from unrelated places.
        shifted = starts2.assign(eid=starts2['eid'] - 1)
        temp = starts1.merge(shifted, on=['sid', 'eid'], how='inner').set_index('sid')

        # Support = number of distinct reviews, not the number of joined rows.
        support = temp.index.nunique()
        if support > min_sup:
            db3[key1 + " " + key2last] = temp

Wall time: 76.8 ms


## Save

In [287]:
# Gather the patterns of every length in one mapping; on a key clash the
# later level overwrites the earlier one.
# NOTE(review): db2 holds the unordered pairs, while db3 is derived from the
# ordered db2split - confirm that db2 (not db2split) is the intended source.
results = db1.copy()
for level in (db2, db3):
    results.update(level)

In [297]:
# Write one "support:pattern" line per frequent single token.
with open("patterns.txt", 'w') as out:
    # length 1
    for key, items in db1.items():
        # Each frame has one row per occurrence, so counting rows would report
        # occurrences. Support is the number of distinct reviews (sid index
        # values), the same quantity the frequency filter was applied to.
        support = items.index.nunique()
        out.write(str(support) + ":" + ';'.join(key.split()) + '\n')


In [432]:
a = 'one sees clearly only with the heart anything essential is invisible to the eyes'
b = 'let my soul smile through my heart and my heart smile through my eyes that I may scatter rich smiles in sad hearts'

a = a.split()
b = b.split()

unique_set = set(a).union(set(b))
matA = pd.DataFrame(None, index = unique_set).T
matB = pd.DataFrame(None, index = unique_set).T

In [433]:
# Give each frame a row labelled 0 with the value 0 in every column; the
# counting cell that follows fills in the per-token counts.
matA.loc[0,:]=0
matB.loc[0,:]=0

In [434]:
# Fill row 0 with the number of times each vocabulary token occurs in the
# sentence. The counts are assigned rather than accumulated with `+= 1`, so
# running this cell again yields the same values instead of doubling them.
counts_a = Counter(a)
for column in matA:
    matA.loc[0, column] = counts_a[column]

counts_b = Counter(b)
for column in matB:
    matB.loc[0, column] = counts_b[column]

In [435]:
# Cosine similarity of the two count vectors: dot product over the product of norms.
numerator = np.dot(matA, matB.T)
denominator = np.linalg.norm(matA) * np.linalg.norm(matB)
numerator / denominator

array([[0.11713032]])

In [436]:
# Five two-dimensional observations, one [first, second] pair per row.
x = [
    [6.9, 3.1],
    [6.7, 3.1],
    [6.9, 3.1],
    [5.8, 2.7],
    [6.8, 3.2],
]

In [437]:
# Transpose the rows of x into per-coordinate sequences, then wrap the first
# two as arrays.
columns = list(zip(*x))
x1 = np.array(columns[0])
x2 = np.array(columns[1])

In [438]:
# Correlation-coefficient matrix of x1 and x2 (off-diagonal entry = their correlation).
np.corrcoef(x1, x2)

array([[1.        , 0.95256461],
       [0.95256461, 1.        ]])