In [85]:
import numpy as np
import pandas as pd
from itertools import permutations 
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Markdown
import os

# Import data

In [124]:
dir_path = "Data/"
sample_size = 100
tab_data=[]
for i in range(sample_size):
    with open(dir_path+os.listdir(dir_path)[i], encoding="ansi") as f:
        tab_data.append(f.read())

In [125]:
tab_data

['Finalists in the Apertura play-offs, Toluca had drawn their first two Clausura games but got off to a good start when Edgar Benitez put them ahead in the 16th minute.\nMatias Britos levelled 20 minutes later but Lucas Silva netted 14 minutes from the end to ensure the visitors took all three points.\n  \tFranco Arizala scored 13 minutes from time to ensure Jaguares claimed their first point with a 1-1 draw against Monterrey, who had opened the scoring through Aldo De Nigris (14).\n Hosts Jaguares also had Jorge Rodriguez sent off in the closing moments.',
 'City manager Roberto Mancini has consistently said his fellow Italian is not for sale throughout this month\'s transfer window but that has not quashed rumors linking him with the San Siro giants.\n  \tMilan have made their liking for the 22-year-old clear but have previously baulked at City\'s reported Â£28million valuation. Now fresh reports have emerged claiming negotiations between the clubs have begun but City\'s public messa

# Shingling function

In [126]:
def shingling (str_doc, k):
    res = set()
    for i in range (len(str_doc)-k+1):
        sample = str_doc[i:i+k]
        hashed_sample = hash(sample)
        res.add(hashed_sample)
    return res

In [127]:
k=10
tab_sh=[]
for data in tab_data:
    tab_sh.append(shingling(data,k))

In [128]:
tab_sh[0]

{-9206830668607761288,
 -9126466551611935689,
 -9089349694041961750,
 -9070547020759371852,
 -9043602735473345611,
 -9035681849588621673,
 -8983971306174954453,
 -8966356396933534088,
 -8964851310694733854,
 -8964477232653982541,
 -8934947744587087096,
 -8862063286634415586,
 -8818867058044565126,
 -8812064166291564234,
 -8735428191776511619,
 -8723162882401428090,
 -8673822434552833885,
 -8666215530971947219,
 -8648644409647847894,
 -8645305317801305824,
 -8639379473353040066,
 -8629943305198140945,
 -8605109518593115614,
 -8581003584725572024,
 -8570009158030747233,
 -8449171638324598337,
 -8446777644536931603,
 -8446574903092927548,
 -8443641248032091918,
 -8407176197860569822,
 -8393742953471444122,
 -8388822464880725033,
 -8319435482096300657,
 -8253120733249289573,
 -8033067884145008846,
 -8005150750297973189,
 -7998094343041155137,
 -7946423363815847779,
 -7917381989815695930,
 -7911933662472242847,
 -7884759844325826071,
 -7846215398105748191,
 -7802460325914083836,
 -779295344

# Compare Sets

In [129]:
def CompareSets(set1,set2):
    size_1, size_2, size_union = len(set1), len(set2), len(set1.union(set2)) # Cardinal (set1), Card(set2), Card(set1 u set2)
    size_inter = size_1 + size_2 - size_union # Card(set1 n set 2) = Card(set1) + Card (set2) - Card(set1 u set 2)
    return size_inter/size_union

In [131]:
CompareSets(tab_sh[70],tab_sh[1]),CompareSets(tab_sh[1],tab_sh[1])

(0.01418052904281429, 1.0)

# MinHashing

In [132]:
union_sh =set()
for sh in tab_sh:
    union_sh = union_sh.union(sh) #creation of a global set of all the values 
dict_sh={}
for i,hash_val in enumerate(list(union_sh)):
    dict_sh[hash_val]=i  # creation of a dictionary in order to link each shingle to a unique row in the matrix

In [133]:
matrix = np.zeros((len(union_sh), sample_size)) #creation of a well-sized matrix
for i,sh in enumerate(tab_sh):
    for hash_val in sh:
        matrix[dict_sh[hash_val]][i]=1 # fulling of the matrix

In [134]:
n = 100
tab_permutations =[]  #generation of n permutations of [0,1,....,N_sh] => equivalent to n hash functions
for i in range(n):
    tab_permutations.append(np.arange(len(union_sh)))
    np.random.shuffle(tab_permutations[i])

In [135]:
matrix_hashed = np.zeros((sample_size,n))
for i in range(n):
    for j in range(sample_size):
        ind = 0
        while (matrix[tab_permutations[i][ind]][j]==0):
            ind+=1
        matrix_hashed[j][i]=ind
matrix_hashed

array([[ 181.,   76.,  706., ...,  128.,  145.,   29.],
       [ 178.,   93.,   27., ...,   51.,   39.,   78.],
       [1454.,  693.,   43., ...,  102.,  572.,  606.],
       ...,
       [  46.,   47.,   48., ...,    2.,  135.,   33.],
       [ 202.,    8.,   61., ...,  125.,   46.,   89.],
       [  28.,   97.,   30., ...,   59.,   27.,   10.]])

# CompareSignatures

In [136]:
def CompareSignatures(mat_hashed,i,j):
    '''Comparison between row i and j of the MinHashed matrix'''
    similar = 0
    for k,a in enumerate(mat_hashed[i]):
        if (a==mat_hashed[j][k]):
            similar+=1
    return similar/len(mat_hashed[j])

In [138]:
for i in range(100):
    for j in range(i):
        print(i,j,CompareSignatures(matrix_hashed,i,j))

1 0 0.0
2 0 0.0
2 1 0.0
3 0 0.0
3 1 0.0
3 2 0.0
4 0 0.01
4 1 0.0
4 2 0.0
4 3 0.0
5 0 0.01
5 1 0.0
5 2 0.0
5 3 0.0
5 4 0.0
6 0 0.0
6 1 0.0
6 2 0.0
6 3 0.01
6 4 0.01
6 5 0.01
7 0 0.0
7 1 0.0
7 2 0.0
7 3 0.0
7 4 0.0
7 5 0.0
7 6 0.0
8 0 0.0
8 1 0.0
8 2 0.0
8 3 0.0
8 4 0.0
8 5 0.0
8 6 0.0
8 7 0.01
9 0 0.0
9 1 0.0
9 2 0.01
9 3 0.0
9 4 0.0
9 5 0.0
9 6 0.0
9 7 0.0
9 8 0.0
10 0 0.01
10 1 0.0
10 2 0.0
10 3 0.0
10 4 0.0
10 5 0.0
10 6 0.0
10 7 0.0
10 8 0.0
10 9 0.0
11 0 0.0
11 1 0.0
11 2 0.0
11 3 0.0
11 4 0.0
11 5 0.0
11 6 0.0
11 7 0.0
11 8 0.0
11 9 0.0
11 10 0.01
12 0 0.0
12 1 0.03
12 2 0.0
12 3 0.0
12 4 0.0
12 5 0.0
12 6 0.0
12 7 0.0
12 8 0.0
12 9 0.01
12 10 0.0
12 11 0.0
13 0 0.01
13 1 0.0
13 2 0.0
13 3 0.01
13 4 0.0
13 5 0.01
13 6 0.0
13 7 0.0
13 8 0.0
13 9 0.0
13 10 0.02
13 11 0.0
13 12 0.01
14 0 0.0
14 1 0.0
14 2 0.0
14 3 0.01
14 4 0.0
14 5 0.0
14 6 0.0
14 7 0.0
14 8 0.0
14 9 0.0
14 10 0.0
14 11 0.0
14 12 0.01
14 13 0.02
15 0 0.0
15 1 0.0
15 2 0.0
15 3 0.0
15 4 0.0
15 5 0.0
15 6 0.0
15 7 0.0

57 19 0.0
57 20 0.02
57 21 0.0
57 22 0.01
57 23 0.0
57 24 0.0
57 25 0.0
57 26 0.01
57 27 0.0
57 28 0.0
57 29 0.01
57 30 0.0
57 31 0.0
57 32 0.0
57 33 0.0
57 34 0.0
57 35 0.0
57 36 0.01
57 37 0.0
57 38 0.0
57 39 0.0
57 40 0.0
57 41 0.0
57 42 0.01
57 43 0.02
57 44 0.0
57 45 0.0
57 46 0.0
57 47 0.0
57 48 0.0
57 49 0.0
57 50 0.0
57 51 0.0
57 52 0.02
57 53 0.0
57 54 0.0
57 55 0.0
57 56 0.0
58 0 0.0
58 1 0.0
58 2 0.0
58 3 0.0
58 4 0.0
58 5 0.0
58 6 0.01
58 7 0.0
58 8 0.01
58 9 0.0
58 10 0.0
58 11 0.0
58 12 0.0
58 13 0.0
58 14 0.0
58 15 0.0
58 16 0.02
58 17 0.0
58 18 0.01
58 19 0.0
58 20 0.01
58 21 0.01
58 22 0.0
58 23 0.0
58 24 0.0
58 25 0.0
58 26 0.0
58 27 0.0
58 28 0.01
58 29 0.0
58 30 0.0
58 31 0.0
58 32 0.0
58 33 0.0
58 34 0.01
58 35 0.01
58 36 0.01
58 37 0.01
58 38 0.0
58 39 0.0
58 40 0.0
58 41 0.0
58 42 0.0
58 43 0.0
58 44 0.0
58 45 0.0
58 46 0.0
58 47 0.01
58 48 0.0
58 49 0.0
58 50 0.0
58 51 0.0
58 52 0.01
58 53 0.0
58 54 0.0
58 55 0.0
58 56 0.0
58 57 0.0
59 0 0.01
59 1 0.0
59 2 0.0
5

70 33 0.01
70 34 0.0
70 35 0.0
70 36 0.0
70 37 0.0
70 38 0.0
70 39 0.01
70 40 0.0
70 41 0.0
70 42 0.0
70 43 0.01
70 44 0.01
70 45 0.03
70 46 0.0
70 47 0.0
70 48 0.0
70 49 0.0
70 50 0.01
70 51 0.0
70 52 0.0
70 53 0.0
70 54 0.0
70 55 0.0
70 56 0.0
70 57 0.0
70 58 0.0
70 59 0.0
70 60 0.0
70 61 0.0
70 62 0.0
70 63 0.0
70 64 0.0
70 65 0.0
70 66 0.0
70 67 0.02
70 68 0.02
70 69 0.0
71 0 0.0
71 1 0.0
71 2 0.0
71 3 0.01
71 4 0.0
71 5 0.0
71 6 0.01
71 7 0.0
71 8 0.0
71 9 0.01
71 10 0.0
71 11 0.0
71 12 0.0
71 13 0.0
71 14 0.0
71 15 0.0
71 16 0.0
71 17 0.0
71 18 0.01
71 19 0.0
71 20 0.0
71 21 0.0
71 22 0.0
71 23 0.0
71 24 0.0
71 25 0.0
71 26 0.0
71 27 0.0
71 28 0.0
71 29 0.0
71 30 0.0
71 31 0.0
71 32 0.0
71 33 0.0
71 34 0.0
71 35 0.0
71 36 0.0
71 37 0.01
71 38 0.0
71 39 0.0
71 40 0.01
71 41 0.0
71 42 0.0
71 43 0.0
71 44 0.0
71 45 0.0
71 46 0.0
71 47 0.0
71 48 0.0
71 49 0.0
71 50 0.0
71 51 0.0
71 52 0.0
71 53 0.0
71 54 0.0
71 55 0.01
71 56 0.0
71 57 0.0
71 58 0.01
71 59 0.0
71 60 0.0
71 61 0.0
71 6

89 13 0.0
89 14 0.0
89 15 0.0
89 16 0.0
89 17 0.0
89 18 0.02
89 19 0.01
89 20 0.0
89 21 0.01
89 22 0.0
89 23 0.01
89 24 0.01
89 25 0.01
89 26 0.0
89 27 0.01
89 28 0.0
89 29 0.0
89 30 0.01
89 31 0.0
89 32 0.01
89 33 0.0
89 34 0.0
89 35 0.0
89 36 0.0
89 37 0.0
89 38 0.0
89 39 0.0
89 40 0.0
89 41 0.0
89 42 0.0
89 43 0.0
89 44 0.01
89 45 0.0
89 46 0.01
89 47 0.0
89 48 0.0
89 49 0.01
89 50 0.0
89 51 0.0
89 52 0.0
89 53 0.0
89 54 0.01
89 55 0.01
89 56 0.0
89 57 0.0
89 58 0.0
89 59 0.0
89 60 0.0
89 61 0.0
89 62 0.0
89 63 0.0
89 64 0.0
89 65 0.0
89 66 0.0
89 67 0.0
89 68 0.0
89 69 0.0
89 70 0.0
89 71 0.0
89 72 0.0
89 73 0.0
89 74 0.0
89 75 0.01
89 76 0.0
89 77 0.0
89 78 0.0
89 79 0.0
89 80 0.0
89 81 0.0
89 82 0.01
89 83 0.0
89 84 0.0
89 85 0.0
89 86 0.0
89 87 0.01
89 88 0.01
90 0 0.0
90 1 0.0
90 2 0.0
90 3 0.0
90 4 0.0
90 5 0.0
90 6 0.0
90 7 0.0
90 8 0.0
90 9 0.0
90 10 0.0
90 11 0.0
90 12 0.0
90 13 0.0
90 14 0.0
90 15 0.0
90 16 0.01
90 17 0.0
90 18 0.0
90 19 0.0
90 20 0.0
90 21 0.0
90 22 0.0
9

# LSH

In [None]:
t = 0.5
