# Exact Cosine

In [1]:
import pandas as pd
import numpy as np
import re
import time
import math 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from gensim.utils import simple_preprocess
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The mount prompts for an authorization code, so this cell is interactive.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# NOTE(review): this cell repeats the `import nltk` and the three
# nltk.download(...) calls already made in the imports cell at the top of the
# notebook; it looks redundant.
import nltk 
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
import pickle
def pickle_store(obj, filename):
    """Pickle `obj` into the file at `filename`, overwriting any existing file."""
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, sink)

def pickle_load(filename):
    """Return the object unpickled from the file at `filename`.

    Unpickling can execute arbitrary code, so only load files that you
    created yourself.
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded
        
# Angular similarity as defined on Wikipedia (not used):
# http://en.wikipedia.org/wiki/Cosine_similarity
def angular_similarity(a, b):
    """Return the angular similarity ``1 - theta / pi`` of two 1-D vectors.

    ``theta`` is the angle between `a` and `b`, obtained from their cosine
    similarity.  The result is 1.0 for vectors pointing the same way, 0.5 for
    orthogonal vectors and 0.0 for opposite vectors.

    Parameters
    ----------
    a, b : array-like of numbers, same length.

    Returns
    -------
    float
        Angular similarity; ``nan`` if either vector has zero norm.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    cosine = np.dot(a, b) / np.linalg.norm(a) / np.linalg.norm(b)
    # Floating-point rounding can push the cosine of (anti)parallel vectors
    # marginally outside [-1, 1], which would make math.acos raise ValueError.
    # np.clip keeps nan as nan, so the zero-norm case is not masked.
    cosine = np.clip(cosine, -1.0, 1.0)
    theta = math.acos(cosine)
    return 1.0 - (theta / math.pi)

## Document preprocessor

In [0]:
# Shared WordNet lemmatizer, used for both the stop-word list and the documents.
lmtzr = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The tag's leading letter decides the result: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.  Any other tag gives ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    return None

# Stop words: scikit-learn's English list, NLTK's English list, and a handful
# of extra words.
my_stopwords = ENGLISH_STOP_WORDS.union(stopwords.words('english'))\
    .union(['new', 'said', 'say','need', 'come', 'good', 'set', 'want', 'people', 'use', 'day', 'week', 'know'])

# Lemmatized form of every stop word; documents_preprocess compares lemmas
# against this set.
# The words are sorted before tagging because nltk.pos_tag takes a token
# sequence and tags each word using its neighbours. Iterating the set directly
# would make the tags (and hence the lemmas) depend on the per-process hash
# ordering of the set.
my_stopwords_lemma = set()
for word, nltk_tag in nltk.pos_tag(sorted(my_stopwords)):
    tag = nltk2wn_tag(nltk_tag)
    # Words whose tag has no WordNet equivalent are kept unlemmatized.
    my_stopwords_lemma.add(word if tag is None else lmtzr.lemmatize(word, tag))
        

def documents_preprocess(documents):
    """Tokenize, lemmatize and stop-word-filter every document.

    Each document is tokenized with gensim's ``simple_preprocess``
    (``deacc=True``) and POS-tagged with NLTK.  A word whose tag maps to a
    WordNet POS is lemmatized and kept unless the lemma is in
    ``my_stopwords_lemma``; any other word is kept unchanged unless it is in
    ``my_stopwords``.  The elapsed wall-clock time is printed.

    Returns a list holding, for each input document, the kept tokens joined
    by single spaces.
    """
    def _kept_tokens(text):
        # Yield the surviving tokens of one document, in their original order.
        for token, pos in nltk.pos_tag(simple_preprocess(text, deacc=True)):
            wn_pos = nltk2wn_tag(pos)
            if wn_pos is None:
                if token not in my_stopwords:
                    yield token
                continue
            lemma = lmtzr.lemmatize(token, wn_pos)
            if lemma not in my_stopwords_lemma:
                yield lemma

    t0 = time.time()
    cleaned = [' '.join(_kept_tokens(text)) for text in documents]

    print("Text Preprocessing took: " + str(time.time() - t0))
    return cleaned


## Load the dataset

In [8]:
# Alternative (disabled): read the corpus from a local data/ directory.
# train = pd.read_csv("data/corpusTrain.csv")
# test = pd.read_csv("data/corpusTest.csv")

# Read the train and test corpora from the mounted Google Drive.
train = pd.read_csv("/content/drive/My Drive/corpusTrain.csv")
test = pd.read_csv("/content/drive/My Drive/corpusTest.csv")

# Uncomment to work on a smaller subset of rows.
# train=train[:10000]
# test=test[:1000]

# Report the number of rows in each frame.
print("Train: ", len(train), "Test: ", len(test))

Train:  531990 Test:  5374


## Clean the dataset

In [9]:
# Preprocess the 'Content' column of both frames and time the whole step.
# Each documents_preprocess call also prints its own duration.
start = time.time()

clean_train = documents_preprocess(train['Content'])
clean_test = documents_preprocess(test['Content'])

print("Clean: ", time.time()-start)

Text Preprocessing took: 338.6784873008728
Text Preprocessing took: 3.4820220470428467
Clean:  342.16618371009827


In [0]:
# Uncomment to persist the cleaned corpora.
# pickle_store(clean_train, "vars/full_clean_train")
# pickle_store(clean_test, "vars/full_clean_test")

print("done")

# Replace clean_train / clean_test with previously pickled copies.
# NOTE(review): with the pickle_store calls above commented out, these loads
# rely on files written in an earlier session. Confirm the files exist before
# running this cell, or skip it to keep the freshly computed values.
clean_train = pickle_load("vars/full_clean_train")
clean_test = pickle_load("vars/full_clean_test")

## Vectorize 
- HashingVectorizer
- CountVectorizer (currently in use)
- TfidfVectorizer

In [10]:

# Alternative vectorizers (disabled):
# vectorizer = HashingVectorizer(stop_words=my_stopwords, n_features=8000)
# vectorizer = TfidfVectorizer(stop_words=my_stopwords, max_features=50000)
# Term counts with scikit-learn's default CountVectorizer settings.
vectorizer = CountVectorizer()

start = time.time()

# The vectorizer is fitted on the training corpus only; the test corpus is
# transformed with that same fitted vectorizer.
vtrain = vectorizer.fit_transform(clean_train)
vtest = vectorizer.transform(clean_test)

print("Vectorization Time: ", time.time()-start)

Vectorization Time:  3.8326215744018555


In [0]:
# Persist the vectorized corpora under vars/.
pickle_store(vtrain, "vars/vtrain_countFull")
pickle_store(vtest, "vars/vtest_countFull")

# Uncomment to load previously stored vectors instead.
# vtrain = pickle_load("vars/vtrain_tf2000")
# vtest = pickle_load("vars/vtest_tf2000")

In [11]:
# Show the shape of each matrix; the column counts should match because both
# come from the same fitted vectorizer.
print(vtest.shape)
print(vtrain.shape)

(5374, 70052)
(531990, 70052)


## Run Cosine

In [12]:
# Count the (test, train) pairs whose cosine similarity exceeds 0.8.
#
# cosine_similarity(vt, vtrain) L2-normalizes both arguments and transposes
# the second one on every call, so calling it once per test row repeats that
# work on the whole training matrix each time.  Normalizing (and transposing)
# once up front and taking plain dot products produces the same similarities.
start = time.time()
duplicates = 0

train_unit_T = preprocessing.normalize(vtrain).T.tocsr()
test_unit = preprocessing.normalize(vtest)

for vt in test_unit:
    # Dense 1 x n_train row of cosine similarities for this test document.
    # The name `results` is kept because a later cell reads it.
    results = vt.dot(train_unit_T).toarray()
    duplicates += (results > 0.8).sum()

q_time = time.time()- start

print("queue: ", q_time)


queue:  430.126925945282


In [13]:
# Total number of (test document, train document) pairs with cosine similarity above 0.8.
duplicates

27087


---

---

## SVD (not used in final)

In [0]:
# Project the count vectors onto 100 components with truncated SVD and time it.
start = time.time()

# TruncatedSVD's default solver is randomized; fixing random_state makes the
# projection (and the pickles written below) reproducible across runs.
svd = TruncatedSVD(n_components=100, random_state=0)
# Fit on the training matrix only, then apply the same projection to the test matrix.
sv_train = svd.fit_transform(vtrain)
sv_test= svd.transform(vtest)

print("SVD: ", time.time()-start)

pickle_store(sv_train, "vars/sv_train_100")
pickle_store(sv_test, "vars/sv_test100")

# Uncomment to load previously stored projections instead.
# sv_train = pickle_load("vars/sv_train_100")
# sv_test = pickle_load("vars/sv_test100")

## Normalizer (not used)

In [0]:

# Rescale similarities linearly from [minx, maxx] = [-1, 1] onto [0, 1].
minx = -1
maxx = 1


def normalize(x):
    """Map `x` (scalar or NumPy array) linearly from [minx, maxx] onto [0, 1]."""
    return (x - minx) / (maxx - minx)


# The arithmetic in normalize already broadcasts over NumPy arrays, so no
# np.vectorize wrapper (a per-element Python loop) is needed.  `vfunc` is kept
# as an alias so existing references keep working.
vfunc = normalize
# NOTE(review): `results` is whatever the last iteration of the "Run Cosine"
# loop left behind, i.e. the similarities of the final test document only.
results2 = vfunc(results)

(results2 >= 0.8).sum()