In [1]:
import pandas as pd
from typing import List
import numpy as np
from scipy import sparse
import sys

df = pd.read_csv('../ML/data/spam_ham_dataset.csv')

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer(ngram_range=(2,2))

In [3]:
matrix = countvec.fit_transform(df['text'])

In [109]:
def getSignature(matrix: sparse.csc_matrix, n_sign: int, random_seed: int = 34):
    c = 100000007
    np.random.seed(random_seed)
    rand_num = np.random.randint(0,sys.maxsize, (n_sign,2))
    
    n_vocab = matrix.shape[1]
    n_docs = matrix.shape[0]
    signs = [[np.inf for i in range(n_docs)] for j in range(n_sign)]
    
    rows, cols = matrix.nonzero()
    data = matrix.data
    
    for i in range(n_sign):
        a, b = rand_num[i][0] % c, rand_num[i][1] % c
        for i_doc, i_vocab, d in zip(rows, cols, data):
            if signs[i][i_doc] is None:
                signs[i][i_doc] = (a * i_vocab + b) % c
            else:
                signs[i][i_doc] = min(signs[i][i_doc], (a * i_vocab + b) % c)
    return np.array(signs)

def getJaccardSimilarity(signs: List[List[int]], doc1: int, doc2: int):
    return (signs[:,doc1]==signs[:,doc2]).mean()

In [110]:
signs = getSignature(matrix, 10)

In [111]:
np.where(np.isinf(signs.astype(float)))[1].reshape((10,-1))

array([[ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369, 2184, 2538, 2665, 2680,
        2903, 3006, 3610, 4081, 4192, 4748],
       [ 154,  182,  296,  363, 1130, 1279, 1369

In [112]:
signs = np.nan_to_num(signs)

In [120]:
similarity = []
for i in range(len(signs[0])):
    similarity.append((signs - signs[:,i].reshape((-1,1))==0).mean(axis=0))

In [132]:
df.loc[1, 'text']

'Subject: hpl nom for january 9 , 2001\r\n( see attached file : hplnol 09 . xls )\r\n- hplnol 09 . xls'

In [134]:
df.loc[3011, 'text']

'Subject: hpl nom for january 4 , 2001\r\n( see attached file : hplnol 04 . xls )\r\n- hplnol 04 . xls'

In [131]:
sparse_sim = sparse.csr_matrix(similarity)
rows, cols = sparse_sim.nonzero()
for r, c, d in zip(rows, cols, sparse_sim.data):
    if d > 0.8:
        print(r,c)

0 0
1 1
1 362
1 3011
2 2
3 3
4 4
5 5
5 63
6 6
6 2838
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15
16 16
16 2555
16 2926
16 3321
17 17
18 18
19 19
20 20
21 21
22 22
23 23
23 810
24 24
25 25
26 26
27 27
27 3086
28 28
29 29
30 30
30 3347
31 31
31 43
31 497
32 32
33 33
34 34
35 35
36 36
37 37
38 38
39 39
40 40
41 41
42 42
42 5168
43 31
43 43
44 44
45 45
45 4797
46 46
47 47
48 48
49 49
50 50
51 51
51 2261
52 52
53 53
54 54
55 55
56 56
57 57
58 58
58 4568
59 59
60 60
61 61
62 62
63 5
63 63
64 64
65 65
66 66
66 3019
67 67
68 68
69 69
70 70
71 71
71 1211
72 72
73 73
74 74
75 75
76 76
77 77
77 3298
78 78
78 2730
79 79
80 80
81 81
82 82
83 83
83 1274
84 84
85 85
86 86
87 87
88 88
89 89
90 90
91 91
92 92
93 93
94 94
95 95
96 96
97 97
97 348
97 793
97 3209
97 3246
97 3516
97 3716
97 3853
98 98
99 99
99 974
100 100
101 101
102 102
103 103
104 104
105 105
106 106
107 107
108 108
109 109
110 110
111 111
112 112
113 113
114 114
114 4865
115 115
116 116
116 2202
117 117
118 118
119 119
120 120
120 1

In [43]:
(signs[0] == None)

array([False, False, False, ..., False, False, False])

In [33]:
signs[0].reshape(1,-1).shape

(1, 5171)

In [30]:
signs.transpose() - signs.transpose()[:,0].reshape(-1,1)

TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [17]:
res = np.zeros((matrix.shape[0], matrix.shape[1]))
for i in range(matrix.shape[0]):
    for j in range(matrix.shape[0]):
        res[i, j] = getJaccardSimilarity(signs, i, j)

KeyboardInterrupt: 

In [130]:

def getSignature(matrix: List[List[int]], n_sign: int, random_seed: int = 34):
    rng = np.random.default_rng(random_seed)
    n_vocab = len(matrix)
    n_docs = len(matrix[0])
    
    perms = rng.permuted(np.tile(np.arange(n_vocab), n_sign).reshape(n_sign,n_vocab), axis=1)
    signs = [[None for i in range(n_docs)] for j in range(n_sign)]
    d = {j: {i: True for i in range(n_docs)} for j in range(n_sign)}
    # count = {i: 0 for i in range(n_sign)}
    idx = 0
    while len(d) > 0 or idx < n_vocab:
        signs_list = list(d.keys())
        for i_sign in signs_list:
            i_vocab = perms[i_sign][idx]
            docs_list = list(d[i_sign].keys())
            for i_doc in docs_list:
                if matrix[i_vocab][i_doc] > 0:
                    signs[i_sign][i_doc] = idx
                    del d[i_sign][i_doc]
                    if len(d[i_sign])==0:
                        del d[i_sign]
        idx+=1
    return np.array(perms), np.array(signs)
        

In [None]:
def getSignature(matrix: List[List[int]], n_sign: int, random_seed: int = 34):
    c = 100000007
    np.random.seed(random_seed)
    np.random.randint(0,sys.maxsize, (n_sign,2))
    
    n_vocab = len(matrix)
    n_docs = len(matrix[0])
    signs = [[None for i in range(n_docs)] for j in range(n_sign)]
    d = {j: {i: True for i in range(n_docs)} for j in range(n_sign)}
    # count = {i: 0 for i in range(n_sign)}
    
    return np.array(perms), np.array(signs)

In [64]:
matrix = [[1,0,0,0,1,1,0],[0,0,1,1,0,1,1],[0,0,0,1,0,0,1],[1,0,0,1,0,1,0],[0,0,0,1,1,0,0],[1,0,0,0,1,0,0],[1,0,0,0,0,1,0],[0,0,0,1,0,0,1],[0,0,0,0,0,0,1],[1,1,0,0,0,0,0]]

In [132]:
sp_m = sparse.csr_matrix(matrix)

In [142]:
for x in sp_m:
    print(x)

  (0, 0)	1
  (0, 4)	1
  (0, 5)	1
  (0, 2)	1
  (0, 3)	1
  (0, 5)	1
  (0, 6)	1
  (0, 3)	1
  (0, 6)	1
  (0, 0)	1
  (0, 3)	1
  (0, 5)	1
  (0, 3)	1
  (0, 4)	1
  (0, 0)	1
  (0, 4)	1
  (0, 0)	1
  (0, 5)	1
  (0, 3)	1
  (0, 6)	1
  (0, 6)	1
  (0, 0)	1
  (0, 1)	1


In [115]:
p, signs = getSignature(matrix, n_sign=10000)

In [117]:
getJaccardSimilarity(signs,3,6)

0.5001

In [118]:
((np.array(matrix)[:,3]>0) & (np.array(matrix)[:,6]>0)).sum()/ ((np.array(matrix)[:,3]>0) | (np.array(matrix)[:,6]>0)).sum()

0.5

In [107]:
((np.array(matrix)[:,3]>0) & (np.array(matrix)[:,6]>0)).sum()

3

In [109]:
np.array(matrix)[:,3]

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0])

In [110]:
np.array(matrix)[:,6]

array([0, 1, 1, 0, 0, 0, 0, 1, 1, 0])

In [5]:
import numpy as np
np.random.permutation([np.arange(10), np.arange(10)])

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])

In [24]:
rng.permuted(np.tile(np.arange(10),2).reshape(2,10), axis=0)

array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])