## Read

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the review file into a one-column DataFrame (column label 0);
# header=None keeps the first line of the file as data, not as column names.
# NOTE(review): sep='\n' is presumably meant to give one review per row;
# recent pandas releases may reject a newline separator -- confirm against
# the installed pandas version.
reviews =pd.read_table('reviews_sample.txt', header=None, sep='\n')
reviews.shape

(10000, 1)

In [3]:
# Minimum support.
# `sup` is the relative threshold (a fraction of the rows in `reviews`);
# `min_sup` is that threshold expressed as an absolute row count,
# truncated to an integer.
sup = 0.01
min_sup = int(sup * reviews.shape[0])

In [4]:
# Tokenise every entry of column 0 on single spaces: each element of
# `review_list` is the list of tokens of one review.
review_list = reviews[0].apply(lambda text: text.split(' '))

## Length 1 patterns

In [6]:
from collections import defaultdict

# Inverted index: item -> list of [sid, eid] occurrences, where sid is the
# position of the review inside `review_list` and eid is the position of
# the item inside that review.
db = defaultdict(list)

# The loop variables are deliberately not called `reviews` / `review`:
# reusing `reviews` here rebinds the DataFrame loaded earlier to a plain
# token list, which breaks any re-run of the cells that use `reviews.shape`
# or `reviews[0]`.
for sid, tokens in enumerate(review_list):
    for eid, token in enumerate(tokens):
        db[token].append([sid, eid])

In [49]:
# Number of distinct items in the inverted index.
len(db)

22104

In [47]:
# Sanity check: number of distinct reviews (sids) in which 'year' occurs.
len({sid for sid, _ in db.get('year')})

1085

In [55]:
%%time
# Frequent length-1 patterns: item -> occurrence table indexed by sid with a
# single 'eid' column (one row per occurrence of the item).
db1 = defaultdict(pd.DataFrame)
for key, items in db.items():
    # Support is the number of distinct reviews (sids) containing the item,
    # not the number of occurrences.
    support = len({sid for sid, _ in items})
    if support >= min_sup:
        # Build the table directly: concatenating onto the empty frame that
        # the defaultdict would create for a new key contributes nothing.
        db1[key] = pd.DataFrame(items, columns=['sid', 'eid']).set_index('sid')

Wall time: 3.83 s


In [9]:
# Export the frequent single items, one "support:item" line per pattern.
# (Author's original note on this export: 93/100.)
with open('patterns.txt', 'w') as out:
    for item, occurrences in db1.items():
        # distinct sids in the index = number of reviews containing the item
        n_reviews = len(np.unique(occurrences.index))
        out.write(f"{n_reviews}:{item}\n")

## Length 2 patterns

In [57]:
# Adjacency filter used when extending patterns.
def close(data, column1, column2):
    """Return the rows of ``data`` whose two columns differ by more than 0 and less than 2.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both columns.
    column1, column2 : str
        Labels of the two numeric columns to compare.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index and columns kept) where
        ``0 < |data[column1] - data[column2]| < 2``.
    """
    gap = (data[column1] - data[column2]).abs()
    return data[(gap > 0) & (gap < 2)]

In [60]:
%%time
# 23min (without index)
# 21min (with index)

# Candidate length-2 patterns: "item1;item2" -> table of adjacent occurrences.
db2 = defaultdict(pd.DataFrame)
# Shrinking copy of db1, so that every unordered pair of frequent items
# (including an item paired with itself) is examined exactly once.
db1copy = db1.copy()

for key1, items1 in db1.items():
    for key2, items2 in db1copy.items():

        # Join the two occurrence tables on sid: one row per combination of
        # an occurrence of key1 (eid_x) and an occurrence of key2 (eid_y)
        # inside the same review.
        temp = items1.merge(items2, left_index=True, right_index=True, how='inner', suffixes=('_x', '_y'))

        # Fewer joined rows than min_sup can never reach the support
        # threshold, so skip the pair. This used to be `pass`, which fell
        # through and ran the adjacency filter for every pair anyway.
        if temp.shape[0] < min_sup:
            continue

        # keep only the rows where the two items are contiguous
        temp = close(temp, 'eid_x', 'eid_y')

        # support check: distinct reviews that contain an adjacent pair
        support = len(np.unique(temp.index))
        if support >= min_sup:
            db2[key1+";"+key2] = temp

    # key1 has now been paired with every remaining item
    db1copy.pop(key1)

Wall time: 40min 5s


In [61]:
# Split every unordered length-2 candidate into its two ordered patterns.
# In each stored table `eid_x` is the position of the first item of the key
# and `eid_y` the position of the second one.
# (defaultdict() without a factory behaves like a plain dict.)
db2split = defaultdict()
for key, items in db2.items():

    # rows in which the first item of the key comes first
    idx = items['eid_x']<items['eid_y']

    # case when x comes first
    forward = items[idx]
    support = len(np.unique(forward.index))
    if support >= min_sup:
        db2split[key] = forward

    # case when y comes first
    backward = items[~idx]
    support = len(np.unique(backward.index))
    if support >= min_sup:
        keys = key.split(';')
        # Swap the column labels on the filtered copy only. Assigning to
        # `items.columns` relabelled the frame stored in db2 in place, so
        # re-running this cell worked on already-swapped data.
        backward = backward.rename(columns={'eid_x': 'eid_y', 'eid_y': 'eid_x'})
        db2split[keys[1]+';'+keys[0]] = backward

## Length 3 patterns

In [62]:
%%time
# Length-3 patterns, built by chaining two ordered length-2 patterns that
# share their middle item: "a;b" + "b;c" -> key "a;b c".
db3 = defaultdict(pd.DataFrame)

for key1, items1 in db2split.items():
    # last item of the left pattern (does not change in the inner loop)
    key1last = key1.split(';')[1]

    for key2, items2 in db2split.items():
        key2parts = key2.split(';')
        key2first = key2parts[0]
        key2last = key2parts[1]

        # only chain when the left pattern ends with the item the right starts with
        if key1last != key2first:
            continue

        # Join on sid. Both tables carry eid_x / eid_y, so the merge suffixes
        # them: eid_x_x, eid_y_x (left pattern) and eid_x_y, eid_y_y (right).
        temp = items1.merge(items2, left_index=True, right_index=True, how='inner')

        # The shared middle item must be the very same occurrence; otherwise
        # "a b" and "b c" found at unrelated places in one review would be
        # counted as the sequence "a b c".
        temp = temp[temp['eid_y_x'] == temp['eid_x_y']]

        # Minimum support check on distinct reviews. The row count that was
        # tested before over-counts reviews with repeated occurrences.
        support = len(np.unique(temp.index))
        if support >= min_sup:
            db3[key1+" "+key2last] = temp

Wall time: 71.8 ms


## Save

977 Length-1 <br>
63 Length-2

In [63]:
# Number of length-1 patterns, then number of ordered length-2 patterns.
print(len(db1), len(db2split), sep='\n')

977
63


In [64]:
# Assemble the export table: start from a copy of the single-item patterns
# and overlay the ordered two-item patterns on top of it.
results = db1.copy()
for pattern, occurrences in db2split.items():
    results[pattern] = occurrences
# results.update(db3)

In [65]:
with open("patterns.txt", 'w') as out:
    # every pattern held in `results`, one "support:pattern" line each
    for pattern, occurrences in results.items():
        n_reviews = len(np.unique(occurrences.index))
        # whitespace-separated parts of a key are re-joined with ';'
        label = ';'.join(pattern.split())
        out.write(f"{n_reviews}:{label}\n")


In [None]:
# Two example sentences, tokenised on whitespace.
a = 'one sees clearly only with the heart anything essential is invisible to the eyes'
b = 'let my soul smile through my heart and my heart smile through my eyes that I may scatter rich smiles in sad hearts'

a = a.split()
b = b.split()

# Shared vocabulary of both sentences.
unique_set = set(a).union(set(b))

# One empty frame per sentence with one column per vocabulary word.
# The vocabulary is passed as a sorted list: a set has no defined order, so
# the column order would differ between runs, and pandas may refuse a set
# as an index altogether.
matA = pd.DataFrame(None, index = sorted(unique_set)).T
matB = pd.DataFrame(None, index = sorted(unique_set)).T

In [None]:
# Add a row labelled 0 to each frame (they have no rows yet) and set every
# column of that row to 0; the row is used below as the per-word counter.
matA.loc[0,:]=0
matB.loc[0,:]=0

In [None]:
# Fill row 0 of each frame with the number of times every vocabulary word
# (column label) occurs in the matching sentence.
for word in matA:
    matA.loc[0, word] += a.count(word)

for word in matB:
    matB.loc[0, word] += b.count(word)

In [None]:
# Dot product of the two count rows divided by the product of their norms
# (the cosine-similarity formula).
np.dot(matA, matB.T)/(np.linalg.norm(matA)*np.linalg.norm(matB))

In [None]:
# Five observations with two features each, used for the correlation example.
x = [
    [6.9, 3.1],
    [6.7, 3.1],
    [6.9, 3.1],
    [5.8, 2.7],
    [6.8, 3.2],
]

In [None]:
# Transpose the observations: x1 holds the first feature of every row,
# x2 the second.
x1, x2 = (np.array(feature) for feature in zip(*x))

In [None]:
# Correlation-coefficient matrix of the two features (each feature is one
# row of the stacked input).
np.corrcoef([x1, x2])