# MinHash + LSH

In [23]:
import pandas as pd
import numpy as np
import re
import time
import pickle

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer 
from datasketch import MinHash, MinHashLSH
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

In [24]:
def pickle_store(obj, filename):
    """Pickle *obj* into the file at *filename*.

    The file is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)

def pickle_load(filename):
    """Return the object unpickled from the file at *filename*.

    The file is opened in binary read mode.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

## Document preprocessor

In [25]:
# Module-level WordNet lemmatizer, created once and used both when
# lemmatising the stopword list and inside documents_preprocess.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's prefix: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag has no WordNet
    counterpart here and yields None.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    return None

# Stopword vocabulary: scikit-learn's English list, NLTK's English list and a
# few extra hand-picked words, merged into one set.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['new', 'said', 'say','need', 'come', 'good', 'set', 'want', 'people', 'use', 'day', 'week', 'know'])

# Lemmatised stopwords, so they can be compared against the lemmatised tokens
# produced in documents_preprocess.
# The words are tagged as a *sorted list* rather than as the raw set: set
# iteration order for strings can change between interpreter runs, and the
# tagger takes neighbouring tokens into account, so tagging the set directly
# could yield a different lemma set on every run.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    # Words whose tag has no WordNet counterpart are kept unchanged.
    my_stopwords_lemma.add(word if tag is None else lmtzr.lemmatize(word, tag))
        

def documents_preprocess(documents):
    """Clean every document and return the cleaned texts as a list of strings.

    Each document is tokenised with gensim's simple_preprocess (accents
    removed via deacc=True), POS-tagged, and lemmatised when the tag maps to
    a WordNet part of speech. Terms found in my_stopwords_lemma are dropped,
    and the survivors are joined with single spaces. The elapsed wall-clock
    time is printed before returning.
    """
    started_at = time.time()
    cleaned = []
    for text in documents:
        kept = []
        tagged = nltk.pos_tag(simple_preprocess(text, deacc=True))
        for token, pos in tagged:
            wn_pos = nltk2wn_tag(pos)
            # Tokens without a WordNet POS are used verbatim; the rest are
            # reduced to their lemma before the stopword check.
            term = token if wn_pos is None else lmtzr.lemmatize(token, wn_pos)
            if term not in my_stopwords_lemma:
                kept.append(term)
        cleaned.append(' '.join(kept))

    print("Text Preprocessing took: " + str(time.time() - started_at))
    return cleaned

In [26]:
def get_jaccard_sim(str1, str2):
    """Return the Jaccard similarity of the token sets of two strings.

    Both strings are split on whitespace and duplicate tokens are ignored.
    The result is |A & B| / |A | B| as a float in [0.0, 1.0].

    When neither string contains a token the union is empty; the two empty
    sets are treated as identical and 1.0 is returned (the usual convention
    for J of two empty sets) instead of raising ZeroDivisionError.
    """
    a = set(str1.split())
    b = set(str2.split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return len(a & b) / union_size

def simple_jacard(a, b):
    """Return the Jaccard similarity of two sets as a float in [0.0, 1.0].

    Computed as |a & b| / (|a| + |b| - |a & b|).

    When both sets are empty the denominator is zero; they are treated as
    identical and 1.0 is returned (the usual convention for J of two empty
    sets) instead of raising ZeroDivisionError.
    """
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 1.0
    return len(c) / union_size

## Building the MinHash LSH index

In [33]:
# Number of MinHash permutations. Read by build_minHash_lsh and by the
# duplicate-detection cell, so index and query signatures use the same size.
perm = 32

In [29]:
def build_minHash_lsh(dataset, *, threshold=0.8, num_perm=None):
    """Build a MinHash LSH index over whitespace-tokenised documents.

    Each document is split on whitespace; its distinct tokens are hashed
    into a MinHash signature, which is inserted into the index as soon as
    it is built (no intermediate list of signatures is retained).

    Args:
        dataset: iterable of strings, one per document.
        threshold: Jaccard threshold handed to MinHashLSH. Defaults to 0.8.
        num_perm: number of MinHash permutations. When None, the
            module-level ``perm`` is used. Query signatures must be built
            with the same number of permutations.

    Returns:
        The populated MinHashLSH index. Keys are the documents' positions
        in ``dataset`` converted to strings ("0", "1", ...).

    Prints the elapsed build time before returning.
    """
    if num_perm is None:
        num_perm = perm

    started_at = time.time()
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for i, quest in enumerate(dataset):
        m = MinHash(num_perm=num_perm)
        # Hash each distinct token once; MinHash.update expects bytes.
        for token in set(quest.split()):
            m.update(token.encode('utf8'))
        lsh.insert(str(i), m)
    print("Build Duration: ", time.time()- started_at)
    return lsh

---

In [4]:
# Load the training and test corpora from CSV into DataFrames.
train = pd.read_csv("data/corpusTrain.csv")
test = pd.read_csv("data/corpusTest.csv")

# Uncomment to work on only the first 10,000 rows of each frame.
#train=train[:10000]
#test=test[:10000]

# Report the number of rows in each frame.
print("Train: ", len(train), "Test: ", len(test))

# Last expression of the cell: displays the first rows of the training frame.
train.head()

Train:  531990 Test:  5374


Unnamed: 0,Id,Content
0,0,How many people are going towards using phones...
1,1,What audio format should I use for getting aud...
2,2,What is the corporate culture like at Edwards ...
3,3,What is the best barbecue in Kansas City?\n
4,4,"""Can I combine the output of two bolts to one ..."


## Clean the Dataset

In [5]:
# Run the preprocessing pipeline over the 'Content' column of both frames.
clean_train = documents_preprocess(train['Content'])
clean_test = documents_preprocess(test['Content'])
# Last expression of the cell: shows the first cleaned test document.
clean_test[0]

Text Preprocessing took: 262.40555596351624
Text Preprocessing took: 2.6604554653167725


'mark college'

In [30]:
# Reload pickled cleaned corpora from disk instead of recomputing them.
# NOTE(review): presumably these files hold documents_preprocess output for
# the full corpora -- the cell that wrote them is not shown; confirm.
clean_train = pickle_load("vars/full_clean_train")
clean_test = pickle_load("vars/full_clean_test")

## Build LSH

In [34]:
# Build the LSH index over the cleaned training documents.
lsh = build_minHash_lsh(clean_train)

# Alternative: reuse an index that was pickled to disk earlier.
# lsh = pickle_load("minHash.lsh")

Build Duration:  85.43168044090271


## Detect Duplicates

In [35]:
# Query the index with every cleaned test document and total the matches.
duplicates = 0
queury_duration = time.time()

for testq in clean_test:
    # Build the query signature exactly like the indexed ones: distinct
    # whitespace tokens, same number of permutations.
    mtest = MinHash(num_perm=perm)
    for t in set(testq.split()):
        mtest.update(t.encode('utf8'))
    # Every key returned by the index adds one to the total; an empty result
    # adds nothing.
    # NOTE(review): this totals matched training entries, not the number of
    # test documents that have a match, and the matches are not re-checked
    # with an exact Jaccard computation -- confirm this is the intended metric.
    result = lsh.query(mtest)
    duplicates += len(result)

print("\nQueury Duration: ", time.time()- queury_duration)
print("Duplicates: ", duplicates)


Queury Duration:  0.8945426940917969
Duplicates:  12529


# Results

## Perms: 16
- Build Duration:  61.35 seconds
- Query Duration:  0.651 seconds
- Duplicates: 13984

## Perms: 32
- Build Duration:  86.38 seconds
- Query Duration:  0.887 seconds
- Duplicates: 12529

## Perms: 64
- Build Duration:   131.37 seconds
- Query Duration:   1.32 seconds
- Duplicates: 12771
---


## Clean Document

- Train: text preprocessing took 262.40555596351624 seconds
- Test: text preprocessing took 2.6604554653167725 seconds

---

In [7]:
# Persist the LSH index to disk with pickle so it can be reloaded later
# instead of being rebuilt.
pickle_store(lsh,"minHash.lsh")

In [None]:
# Reload the pickled LSH index from disk.
# NOTE: unpickling can execute arbitrary code; only load a file that was
# written by this notebook or another trusted source.
lsh = pickle_load("minHash.lsh")