<a href="https://colab.research.google.com/github/thedatadj/natural-language-processing/blob/main/document_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Datasets

In [3]:
import nltk
from nltk.corpus import twitter_samples
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

Load the positive tweets and the negative tweets, each stored in a list.

In [5]:
# Positive
posset = twitter_samples.strings('positive_tweets.json')

# Negative
negset = twitter_samples.strings('negative_tweets.json')

# All tweets (positive tweets first, followed by the negative tweets)
fullset = posset + negset

# Embeddings
Load an embedding dictionary where the key is the word, and the value is the embedding vector of that word.
* Each embedding is a 300-dimensional vector.

In [8]:
# Download file
!gdown 16TjH95jhGzqA8f0zSf0uOrAYM_E0aYtT

Downloading...
From: https://drive.google.com/uc?id=16TjH95jhGzqA8f0zSf0uOrAYM_E0aYtT
To: /content/en_embeddings.p
  0% 0.00/8.12M [00:00<?, ?B/s] 58% 4.72M/8.12M [00:00<00:00, 31.1MB/s]100% 8.12M/8.12M [00:00<00:00, 49.0MB/s]


In [9]:
# Load file
import pickle

# NOTE(review): unpickling can execute arbitrary code; this file is downloaded,
# so only load it if its source is trusted.
# The `with` block closes the file handle as soon as the dictionary is loaded.
with open("/content/en_embeddings.p", "rb") as embeddings_file:
    en_embeddings_subset = pickle.load(embeddings_file)

First 10 components of the embedding vector of the word "the".

In [10]:
en_embeddings_subset['the'][:10]

array([ 0.08007812,  0.10498047,  0.04980469,  0.0534668 , -0.06738281,
       -0.12060547,  0.03515625, -0.11865234,  0.04394531,  0.03015137],
      dtype=float32)

# Tweet to embedding
Function that transforms a tweet into a vector representation of that tweet.
* Using the vector representation of each word in the tweet.

In [22]:
# Function to preprocess the tweet
!gdown 1vi5cAPha1-n1OKx9ke_xhkoVQQYE2hDu
from utils import process_tweet
from nltk.corpus import stopwords
nltk.download('stopwords')

Downloading...
From: https://drive.google.com/uc?id=1vi5cAPha1-n1OKx9ke_xhkoVQQYE2hDu
To: /content/utils.py
  0% 0.00/2.67k [00:00<?, ?B/s]100% 2.67k/2.67k [00:00<00:00, 9.23MB/s]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
import numpy as np

In [26]:
def tweetembedding(tweet, embeddings):
    """Represent a tweet as the sum of the embeddings of its tokens.

    Input:
      - tweet: the tweet to embed; it is passed to `process_tweet` and the
        result is iterated as tokens
      - embeddings: dictionary mapping a word to its embedding vector
    Output:
      - a numpy array with 300 components; tokens that are missing from
        `embeddings` contribute 0 to the sum
    """
    # Start from the zero vector so a tweet without known tokens maps to zeros
    vector_sum = np.zeros(300)

    for word in process_tweet(tweet):
        # Unknown words fall back to 0 and leave the running sum unchanged
        vector_sum += embeddings.get(word, 0)

    return vector_sum

In [28]:
# Demonstration
tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
embedding0 = tweetembedding(tweet, en_embeddings_subset)
# Display only the first 5 components of the tweet embedding
embedding0[:5]

array([-0.09228516,  0.35986328, -0.0206604 ,  0.63085938, -0.06640625])

# Embedding dataset
Get a dictionary and a numpy array containing the embeddings of all the tweets in the `fullset` dataset.

In [38]:
# embeddingdic maps a tweet's position in `fullset` to its embedding;
# embeddingset collects the same embeddings in `fullset` order.
embeddingdic = {}
embeddingset = []
for i, tweet in enumerate(fullset):
    embedding = tweetembedding(tweet, en_embeddings_subset)
    embeddingdic[i] = embedding
    embeddingset.append(embedding)

# Convert the list of embeddings into a single numpy array
embeddingset = np.array(embeddingset)

# Similar tweet
Given a tweet, search the dataset for the most similar tweet.

In [73]:
tweet1 = 'i am sad'

In [45]:
from utils import cosine_similarity

In [74]:
# Vector representation of the tweet
embedding1 = tweetembedding(tweet1, en_embeddings_subset)

# Get similarity with each tweet in the dataset
# NOTE(review): the whole embedding array is passed as the first argument;
# this assumes utils.cosine_similarity accepts a 2-D array there — confirm.
similarities = cosine_similarity(embeddingset, embedding1)

In [56]:
np.argmax(similarities)

5202

In [76]:
# Look the best match up from the similarity scores instead of hard-coding an
# index copied from a previous run, so the cell stays correct when `tweet1`
# or the dataset changes.
fullset[np.argmax(similarities)]

'@hanbined sad pray for me :((('

# Similar tweet with LSH
Same as before but using locality sensitive hashing.

In [77]:
# Number of planes
n = 10

# Iterations
iterations = 25

## Random planes
Create a list containing:
* 25 sets of 10 random planes each.

In [97]:
# Fix the seed so the same random planes are drawn on every run
np.random.seed(0)
# Number of rows of each plane array
m = 300

def tenplanes():
    """Draw one set of planes as an (m, n) array of normally distributed samples."""
    return np.random.normal(size=(m, n))

# One independent set of planes per iteration
planessets = [tenplanes() for _ in range(iterations)]

## Hash function
Function that:
* Outputs the hash value of a vector.

In [98]:
def hashvalue(vector, planes):
    """Compute the hash value of a single vector for one set of planes.

    Input:
      - vector: a single vector with m components (shape (m,) or (1, m))
      - planes: array of shape (m, number of planes), one plane per column
    Output:
      - a Python int whose i-th bit is 1 when the dot product of `vector`
        with the i-th plane is >= 0, and 0 otherwise
    """
    # One bit per plane: the side of the plane on which the vector falls.
    # A dot product of exactly 0 counts as the non-negative side.
    bits = np.ravel(np.dot(vector, planes) >= 0)

    # The number of bits comes from the planes that were passed in, not from a
    # notebook global, and Python ints keep the result from overflowing.
    return sum(1 << i for i, bit in enumerate(bits) if bit)

## Hash table

In [100]:
def hashtablem(vectors, planes):
    """Group vectors into buckets by their hash value for one set of planes.

    Input:
      - vectors: iterable of vectors to index
      - planes: array of shape (m, number of planes), passed to `hashvalue`
    Output:
      - hashtable: dictionary mapping every possible hash value to the list
        of vectors that fall in that bucket
      - idtable: dictionary mapping every possible hash value to the
        positions (within `vectors`) of those same vectors
    """
    # Number of planes, taken from the plane set itself
    n_planes = np.shape(planes)[1]

    # Number of hash values
    nh = 2**n_planes

    # Create every bucket up front so that looking up any hash value works,
    # even when no vector landed in that bucket
    hashtable = {h: [] for h in range(nh)}
    idtable = {h: [] for h in range(nh)}

    # Index the vectors given as argument rather than a notebook global
    for i, vector in enumerate(vectors):
        ihashvalue = hashvalue(vector, planes)
        hashtable[ihashvalue].append(vector)
        idtable[ihashvalue].append(i)

    return hashtable, idtable

# Hash table per iteration
Create one hash table for each iteration.

In [110]:
def hashtable25():
    """Build a hash table and an id table for each of the `iterations` plane sets.

    Output:
      - hashtables: list of hash tables, in the order of `planessets`
      - idtables: list of the matching id tables, in the same order
    """
    # Each call returns a (hashtable, idtable) pair for one set of planes
    built = [hashtablem(embeddingset, planessets[idx]) for idx in range(iterations)]

    hashtables = [table for table, _ in built]
    idtables = [ids for _, ids in built]
    return hashtables, idtables

hashtables, idtables = hashtable25()

# K nearest neighbors

In [111]:
def nearest_neighbor(v, candidates, k=1, cosine_similarity=cosine_similarity):
    """
    Input:
      - v: the vector to find the nearest neighbors for
      - candidates: a set of vectors in which to look for the neighbors
      - k: number of nearest neighbors to return
      - cosine_similarity: function used to score `v` against each candidate
    Output:
      - k_idx: the indices (into `candidates`) of the k most similar vectors,
        most similar first
    """
    # Score every candidate against v
    scores = [cosine_similarity(v, candidate) for candidate in candidates]

    # argsort orders from least to most similar; flip it so the most similar
    # candidates come first, then keep the first k indices
    ranking = np.flip(np.argsort(scores))
    return ranking[:k]

In [119]:
def knn(index, vector, planeset, hashtables, idtables, k=1, iterations=25):
    """Find approximate nearest neighbors of a vector using the hash tables.

    Input:
      - index: id of the query vector itself; it is left out of the candidates
      - vector: the query vector
      - planeset: list of plane sets, one per hash table
      - hashtables: list of hash tables (hash value -> list of vectors)
      - idtables: list of id tables (hash value -> list of ids), aligned
        with `hashtables`
      - k: number of neighbors to return
      - iterations: how many plane sets / tables to consult
    Output:
      - nnids: ids of the candidates selected by `nearest_neighbor`
    """
    candidate_vectors = []
    candidate_ids = []
    seen_ids = set()

    for table_no in range(iterations):
        # Bucket in which the query falls for this set of planes
        bucket = hashvalue(vector, planeset[table_no])
        bucket_vectors = hashtables[table_no][bucket]
        bucket_ids = idtables[table_no][bucket]

        for pos, new_id in enumerate(bucket_ids):
            # Skip the query itself and anything already collected from a
            # previously visited table
            if index == new_id or new_id in seen_ids:
                continue

            candidate_vectors.append(bucket_vectors[pos])
            candidate_ids.append(new_id)
            seen_ids.add(new_id)

    # Rank only the collected candidates, then translate the positions back
    # into document ids
    ranked = nearest_neighbor(vector, np.array(candidate_vectors), k=k)
    return [candidate_ids[idx] for idx in ranked]

In [120]:
# Use the first tweet of the dataset as the query: its text and its embedding
doc_id = 0
doc_to_search = fullset[doc_id]
vec_to_search = embeddingset[doc_id]

In [121]:
# With iterations=5, knn consults only the first 5 plane sets / hash tables
nearest_neighbor_ids = knn(
    doc_id, vec_to_search, planessets, hashtables, idtables, k=3, iterations=5)

In [122]:
nearest_neighbor_ids

[51, 2478, 105]

In [123]:
# Show the query document first
print(f"Nearest neighbors for document {doc_id}")
print(f"Document contents: {doc_to_search}")
print("")

# Then show the id and the text of every neighbor that was found
for neighbor_id in nearest_neighbor_ids:
    print(f"Nearest neighbor at document id {neighbor_id}")
    print(f"document contents: {fullset[neighbor_id]}")

Nearest neighbors for document 0
Document contents: #FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Nearest neighbor at document id 51
document contents: #FollowFriday @France_Espana @reglisse_menthe @CCI_inter for being top engaged members in my community this week :)
Nearest neighbor at document id 2478
document contents: #ShareTheLove @oymgroup @musicartisthere for being top HighValue members this week :) @nataliavas http://t.co/IWSDMtcayt
Nearest neighbor at document id 105
document contents: #FollowFriday @straz_das @DCarsonCPA @GH813600 for being top engaged members in my community this week :)
