# Random projection + LSH

In [2]:
import pandas as pd
import numpy as np
import re
import time
import math

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import vstack

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
import pickle
def pickle_store(obj, filename):
    """Pickle *obj* into the file at *filename*.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(obj)

def pickle_load(filename):
    """Return the object unpickled from the file at *filename*.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(filename, mode='rb') as source:
        return pickle.Unpickler(source).load()

## Document preprocessor

In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

# Single WordNet lemmatizer instance shared by the stop-word setup and
# by documents_preprocess().
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    The decision is made on the leading letter of *nltk_tag*:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag yields ``None``.
    """
    prefix_to_pos = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, pos_name in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            # Resolve the constant only on a match, like the original branches.
            return getattr(wordnet, pos_name)
    return None

# Stop-word list: scikit-learn's English list, NLTK's English list and a few
# extra high-frequency words.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['new', 'said', 'say','need', 'come', 'good', 'set', 'want', 'people', 'use', 'day', 'week', 'know'])

# Lemmatised version of the stop-word list, so it can be compared against the
# lemmas produced by documents_preprocess().
#
# The words are sorted before tagging: nltk.pos_tag() tags its argument as one
# token sequence, and the iteration order of a frozenset of strings can change
# between interpreter runs (string hash randomisation). Tagging the raw set
# could therefore give a different lemma set on every run.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    # Words whose tag has no WordNet equivalent are kept unchanged.
    my_stopwords_lemma.add(word if tag is None else lmtzr.lemmatize(word, tag))
        

def documents_preprocess(documents):
    """Clean every document and return the cleaned texts as a list of strings.

    Each document is tokenised with gensim's ``simple_preprocess``
    (``deacc=True``), POS-tagged with NLTK, and every token whose tag maps to
    a WordNet POS is lemmatised. Tokens (or lemmas) found in
    ``my_stopwords_lemma`` are dropped; the survivors are joined with single
    spaces. The elapsed wall-clock time is printed before returning.
    """
    started_at = time.time()

    def _clean(text):
        # Tokenise, tag, lemmatise where possible, then filter stop words.
        kept = []
        for token, pos in nltk.pos_tag(simple_preprocess(text, deacc=True)):
            wn_pos = nltk2wn_tag(pos)
            candidate = token if wn_pos is None else lmtzr.lemmatize(token, wn_pos)
            if candidate in my_stopwords_lemma:
                continue
            kept.append(candidate)
        return ' '.join(kept)

    new_documents = [_clean(text) for text in documents]

    print("Text Preprocessing took: " + str(time.time() - started_at))
    return new_documents

## Load the dataset

In [4]:
# Load the train and test corpora from CSV (paths are relative to the
# current working directory).
train = pd.read_csv("data/corpusTrain.csv")
test = pd.read_csv("data/corpusTest.csv")

# Uncomment to experiment on a small subset of each split.
# train=train[:10000]
# test=test[:1000]

# Report the number of rows in each split.
print("Train: ", len(train), "Test: ", len(test))

Train:  531990 Test:  5374


## Clean the dataset

In [5]:
start = time.time()

# Clean the 'Content' column of both splits.
clean_train = documents_preprocess(train['Content'])
clean_test = documents_preprocess(test['Content'])

# Stop the clock before the preview so only preprocessing is measured.
clean_data = time.time() - start

# Preview the first cleaned test document. It must be the last expression of
# the cell, otherwise the notebook evaluates it without displaying anything.
clean_test[0]

Text Preprocessing took: 237.16527128219604
Text Preprocessing took: 2.411177158355713


In [8]:
# Uncomment to cache the cleaned corpora on disk after preprocessing.
# pickle_store(clean_train, "vars/full_clean_train")
# pickle_store(clean_test, "vars/full_clean_test")

# NOTE(review): "done" is printed before the pickles below are loaded.
print("done")

# Replace the in-memory cleaned corpora with the cached copies.
clean_train = pickle_load("vars/full_clean_train")
clean_test = pickle_load("vars/full_clean_test")

done


## Vectorize

In [9]:
# Bag-of-words vectorizer with default settings.
vectorizer = CountVectorizer()

start = time.time()

# The vocabulary is learned from the training corpus only; the test corpus is
# projected onto that same vocabulary.
vtrain = vectorizer.fit_transform(clean_train)
vtest = vectorizer.transform(clean_test)

print("Vectorization Time: ", time.time()-start)

Vectorization Time:  3.1966867446899414


In [4]:
# Uncomment to cache the vectorized corpora on disk.
# pickle_store(vtrain, "vars/vtrain_countFull")
# pickle_store(vtest, "vars/vtest_countFull")

# Load the cached vectorized corpora (overwrites any vtrain/vtest already in
# memory).
vtrain = pickle_load("vars/vtrain_countFull")
vtest = pickle_load("vars/vtest_countFull")

# Random Projection


In [5]:
def get_signature(given_vector, rand_proj):
    """Return the LSH signature of *given_vector* as a Python int.

    Each row of *rand_proj* contributes one bit: 1 when its dot product with
    *given_vector* is >= 0, otherwise 0. The first row ends up in the most
    significant bit. With no rows the signature is 0.
    """
    signature = 0
    for plane in rand_proj:
        bit = 1 if np.dot(plane, given_vector) >= 0 else 0
        signature = (signature << 1) | bit
    return signature

# Hash Table: {"HashCode":"sparse matrix with the vectors"}
def build_hash_table(vtrain, dim, k, randv):
    """Group the rows of *vtrain* into LSH buckets.

    Parameters:
        vtrain: iterable of sparse row vectors (each must offer ``toarray()``).
        dim:    unused; kept so existing callers keep working.
        k:      unused; kept so existing callers keep working.
        randv:  random projection rows passed on to ``get_signature``.

    Returns:
        dict mapping each signature to the sparse rows that hashed to it, in
        their original order. A bucket with a single row holds that row
        object itself; larger buckets hold the stacked rows.
    """
    rows_by_signature = {}
    for v in vtrain:
        r = get_signature(v.toarray()[0], randv)
        rows_by_signature.setdefault(r, []).append(v)

    # Stack each bucket once at the end. Calling vstack on every insert
    # re-copies the whole bucket each time, which is quadratic in bucket size.
    return {
        signature: rows[0] if len(rows) == 1 else vstack(rows)
        for signature, rows in rows_by_signature.items()
    }

# K: 1      v: 10, 9, 8, 7, 6, 5, 4, 3, 2, 1

In [41]:
# Number of random projection vectors, i.e. bits per LSH signature
# (at most 2**k distinct buckets).
k = 3

## Build the hash table

In [42]:
# Restart the timer here; otherwise build_time would be measured from the
# `start` of whichever earlier cell ran last.
start = time.time()

# Dimensionality of the vector space, read from the training matrix itself.
# This also works when vtrain was loaded from a pickle and does not rely on
# CountVectorizer.get_feature_names(), which newer scikit-learn removed.
d = vtrain.shape[1]

# k random projection vectors drawn from a standard normal distribution.
randv = np.random.randn(k, d)

# Build the LSH hash table.
hash_table = build_hash_table(vtrain, d, k, randv)

build_time = time.time() - start
print("Built: ", build_time, " Hash: ", len(hash_table))


## Query

In [None]:
start = time.time()
duplicates = 0

# For every test vector, compare it only against the training vectors that
# share its LSH signature, and count the pairs with cosine similarity > 0.8.
for vt in vtest:
    r = get_signature(vt.toarray()[0], randv)
    candidates = hash_table.get(r)
    if candidates is None:
        # No training vector landed in this bucket.
        continue
    results = cosine_similarity(vt, candidates)
    duplicates += (results > 0.8).sum()

q_time = time.time() - start
print("queue: ", q_time)

In [None]:
# Display the number of (test, train) pairs whose cosine similarity
# exceeded 0.8.
duplicates

---

### The end