# Exact Jaccard

In [0]:
import pandas as pd
import numpy as np
import re
import time

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 

In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# This step is interactive: it prompts for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Download the NLTK resources used further down: the English stopword list
# (nltk.corpus.stopwords), the POS tagger model (nltk.pos_tag) and the
# WordNet corpus (WordNetLemmatizer).
import nltk 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Document preprocessor

In [0]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess

# Single WordNet lemmatizer instance shared by the stopword lemmatization
# and by documents_preprocess below.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate a POS tag (as returned by ``nltk.pos_tag``) to a WordNet POS.

    Only the leading letter of the tag is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Returns the matching ``wordnet`` constant, or ``None`` when the tag
    belongs to none of those four families.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # Any other tag has no WordNet counterpart here.
    return None

# Stopword vocabulary: scikit-learn's English list, NLTK's English list and a
# handful of extra words chosen for this corpus.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['new', 'said', 'say','need', 'come', 'good', 'set', 'want', 'people', 'use', 'day', 'week', 'know'])

# Lemmatize the stopwords with the same tag -> lemma procedure that
# documents_preprocess applies to document tokens, so that lemmatized tokens
# can be compared against this set directly.
#
# The words are tagged in sorted order on purpose: nltk.pos_tag tags a token
# using its neighbours, and iterating a set of strings yields a different
# order on every interpreter run (string hash randomization). Without a fixed
# order the resulting lemma set - and therefore every downstream result -
# would not be reproducible.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    if tag is not None:
        my_stopwords_lemma.add(lmtzr.lemmatize(word, tag))
    else:
        # No WordNet POS for this tag: keep the surface form.
        my_stopwords_lemma.add(word)
        

def documents_preprocess(documents):
    """Clean every document and return the cleaned texts as a list of strings.

    Each document is tokenized with gensim's ``simple_preprocess``
    (``deacc=True``), POS-tagged with ``nltk.pos_tag`` and lemmatized with the
    shared WordNet lemmatizer whenever the tag maps to a WordNet POS; tokens
    without such a mapping are kept unchanged. Tokens whose resulting form is
    in ``my_stopwords_lemma`` are dropped, and the survivors are joined with
    single spaces.

    Parameters
    ----------
    documents : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input document, in order.

    The elapsed wall-clock time is printed before returning.
    """
    started_at = time.time()
    cleaned = []
    for raw_text in documents:
        tagged_tokens = nltk.pos_tag(simple_preprocess(raw_text, deacc=True))
        kept = []
        for token, treebank_tag in tagged_tokens:
            wn_pos = nltk2wn_tag(treebank_tag)
            # Lemmatize only when the tag has a WordNet equivalent.
            candidate = token if wn_pos is None else lmtzr.lemmatize(token, wn_pos)
            if candidate in my_stopwords_lemma:
                continue
            kept.append(candidate)
        cleaned.append(' '.join(kept))

    print("Text Preprocessing took: " + str(time.time() - started_at))
    return cleaned

def get_jaccard_sim(str1, str2):
    """Jaccard similarity between the whitespace-separated tokens of two strings.

    Parameters
    ----------
    str1, str2 : str
        Texts to compare; each is split on whitespace and de-duplicated.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|`` in ``[0, 1]``. When neither string contains a
        token the union is empty; 0.0 is returned in that case instead of
        raising ``ZeroDivisionError``.
    """
    a = set(str1.split())
    b = set(str2.split())
    shared = len(a.intersection(b))
    union_size = len(a) + len(b) - shared
    if union_size == 0:
        # Both token sets are empty: nothing to compare.
        return 0.0
    return float(shared) / union_size

def simple_jaccard(a, b):
    """Jaccard similarity ``|a ∩ b| / |a ∪ b|`` of two already-built sets.

    Parameters
    ----------
    a : set
        First token set.
    b : set
        Second token set.

    Returns
    -------
    float
        Similarity in ``[0, 1]``. Two empty sets yield 0.0 rather than a
        ``ZeroDivisionError``.
    """
    shared = len(a.intersection(b))
    union_size = len(a) + len(b) - shared
    if union_size == 0:
        # Empty union: define the similarity as 0.0.
        return 0.0
    return float(shared) / union_size

import pickle
def pickle_store(obj, filename):
    """Serialize ``obj`` with pickle into ``filename``.

    The file is opened in binary write mode, so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(obj, sink)

def pickle_load(filename):
    """Read ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) produced.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

## Load the dataset

In [6]:
# Read the train and test corpora from the local data/ directory.
train = pd.read_csv("data/corpusTrain.csv")
test = pd.read_csv("data/corpusTest.csv")

# Alternative paths for a Colab run with Google Drive mounted:
# train = pd.read_csv("/content/drive/My Drive/corpusTrain.csv")
# test = pd.read_csv("/content/drive/My Drive/corpusTest.csv")

# Uncomment to restrict both frames to their first 10000 rows for quick runs:
#train=train[:10000]
#test=test[:10000]
print("Train: ", len(train), "Test: ", len(test))

# Preview the first rows; the text used below lives in the 'Content' column.
train.head()

Train:  531990 Test:  5374


Unnamed: 0,Id,Content
0,0,How many people are going towards using phones...
1,1,What audio format should I use for getting aud...
2,2,What is the corporate culture like at Edwards ...
3,3,What is the best barbecue in Kansas City?\n
4,4,"""Can I combine the output of two bolts to one ..."


## Clean the dataset

In [7]:
# Run the preprocessing pipeline over the 'Content' column of both corpora.
# Each call prints its own elapsed time.
clean_train = documents_preprocess(train['Content'])
clean_test = documents_preprocess(test['Content'])
# Show the first cleaned test document as a sanity check.
clean_test[0]

Text Preprocessing took: 368.04343342781067
Text Preprocessing took: 3.706369400024414


'mark college'

In [0]:
import os

# Cache of the cleaned corpora, so the slow preprocessing step does not have
# to be repeated in every session.
# If both cache files exist they are loaded (same behaviour as before);
# otherwise the freshly computed clean_train / clean_test are written out.
# Previously this cell only loaded, and failed with FileNotFoundError on a
# fresh checkout because nothing in the notebook ever created the files.
if os.path.exists("clean_train") and os.path.exists("clean_test"):
    clean_train = pickle_load("clean_train")
    clean_test = pickle_load("clean_test")
else:
    pickle_store(clean_train, "clean_train")
    pickle_store(clean_test, "clean_test")

## Compare with Jaccard

In [2]:
# A (test, train) pair counts as a duplicate when its Jaccard similarity is
# strictly greater than this value.
SIMILARITY_THRESHOLD = 0.8

duplicates = 0
t1 = time.time()

# Build the token set of every training document once, up front. The original
# loop re-split each training document for every single test document.
# Documents with no tokens are dropped here, as they were skipped before.
train_token_sets = [
    tokens
    for tokens in (set(train_doc.split()) for train_doc in clean_train)
    if tokens
]

# Exhaustive comparison: every non-empty test document against every
# non-empty training document. `duplicates` therefore counts matching
# (test, train) pairs, not distinct test documents.
for test_doc in clean_test:
    test_tokens = set(test_doc.split())
    if not test_tokens:
        continue

    for train_tokens in train_token_sets:
        if simple_jaccard(test_tokens, train_tokens) > SIMILARITY_THRESHOLD:
            duplicates += 1

# Total wall-clock time of the comparison, in seconds.
duration = time.time() - t1




## Results

In [10]:
# Report how many (test, train) pairs exceeded the 0.8 Jaccard threshold.
# The timing line is left disabled here.
# print("Duration (s): ", duration)
print("Duplicates: ", duplicates)

Duplicates:  5644


Duration (s):  5462.53768324852  Duplicates:  5644

---