In [1]:
import pickle
import string
import time
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from functions import process_tweet,cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vidit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vidit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
tweets = all_positive_tweets + all_negative_tweets

In [4]:
en_embeddings = pickle.load(open("en_embeddings.p", "rb"))
# fr_embeddings_subset = pickle.load(open("fr_embeddings.p", "rb"))

In [5]:
def get_tweet_embedding(tweet,en_embeddings):
    t_embedding = np.zeros(300)
    cleaned_tweet = process_tweet(tweet)
    for word in cleaned_tweet:
        t_embedding += en_embeddings.get(word,0)
    return t_embedding

In [6]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

tweet_embedding = get_tweet_embedding(custom_tweet, en_embeddings)
tweet_embedding[-5:]

array([-0.00268555, -0.15378189, -0.55761719, -0.07216644, -0.32263184])

In [7]:
def get_document_vecs(tweets,en_embedding):
    l = []
    d = {}
    for i,tweet in enumerate(tweets):
        d[i] = get_tweet_embedding(tweet,en_embedding)
        l.append(get_tweet_embedding(tweet,en_embedding))
    document_matrix = np.vstack(l)
    return document_matrix,d

In [8]:
document_vecs, idx_tweet = get_document_vecs(tweets, en_embeddings)

In [9]:
print(f"Length of dictionary {len(idx_tweet)}")
print(f"Shape of document_vecs {document_vecs.shape}")

Length of dictionary 10000
Shape of document_vecs (10000, 300)


In [10]:
my_tweet = 'i am happy'
process_tweet(my_tweet)
tweet_embedding = get_tweet_embedding(my_tweet, en_embeddings)

idx = np.argmax(cosine_similarity(document_vecs, tweet_embedding.reshape(300,1)))
print(tweets[idx])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


  cos = dot / (norma * normb)


In [11]:
# Searching Closest Tweet using Locality Sensitive Hashsing
# Much faster than searching through all 10000 tweets

In [12]:
print(f"Number of vectors is {len(tweets)} and each has {len(idx_tweet[1])} dimensions.")

Number of vectors is 10000 and each has 300 dimensions.


In [13]:
# Each plane divides our space into 2 parts
# n-planes divides into 2^n parts/buckets
# We have 10000 vectors we want to divide them such that each bucket has 16 vectors
# Therefore 10000/16 = 625 buckets
# Number of planes is log2(625) which is close to 10

In [14]:
n_planes = 10
n_dim = 300
# Number of times to repeat the hashing to improve the search.
n_universes = 25

In [15]:
planes_l = [np.random.normal(size=(n_dim, n_planes)) for _ in range(n_universes)]

In [16]:
planes_l[0].shape

(300, 10)

In [17]:
def hash_value_of_vector(v, planes):
    sign_of_dot_product = np.sign(np.dot(v,planes))
    
    # set h to be false (eqivalent to 0 when used in operations) if the sign is negative,
    # and true (equivalent to 1) if the sign is positive (1,10) shaped vector
    h = sign_of_dot_product>=0 
    h = np.squeeze(h)
    
    hash_value = 0
    n_planes = planes.shape[1] # 10
    
    for i in range(n_planes):
        hash_value += np.power(2,i)*h[i]

    hash_value = int(hash_value)
    return hash_value

In [18]:
def make_hash_table(vecs, planes): # this is for a single set of 10 planes
    num_of_planes = planes.shape[1] # 10
    num_buckets = 2**num_of_planes

    hash_table = {i:[] for i in range(num_buckets)}
    id_table = {i:[] for i in range(num_buckets)}

    for i, v in enumerate(vecs):
        h = hash_value_of_vector(v,planes)
        hash_table[h].append(v)
        id_table[h].append(i)

    return hash_table, id_table

In [19]:
hash_tables = []
id_tables = []
for i in range(n_universes):  # there are 25 sets of 10 planes
    print(f"Working on {i+1} set")
    planes = planes_l[i]
    hash_table, id_table = make_hash_table(document_vecs, planes)
    hash_tables.append(hash_table)
    id_tables.append(id_table)

Working on 1 set
Working on 2 set
Working on 3 set
Working on 4 set
Working on 5 set
Working on 6 set
Working on 7 set
Working on 8 set
Working on 9 set
Working on 10 set
Working on 11 set
Working on 12 set
Working on 13 set
Working on 14 set
Working on 15 set
Working on 16 set
Working on 17 set
Working on 18 set
Working on 19 set
Working on 20 set
Working on 21 set
Working on 22 set
Working on 23 set
Working on 24 set
Working on 25 set


In [20]:
def nearest_neighbor(v, candidates, k=1):
    similarity_l = []
    for row in candidates:
        cos_similarity = cosine_similarity(v,row)
        similarity_l.append(cos_similarity)
        
    sorted_ids = np.argsort(similarity_l)
    k_idx = sorted_ids[-k:]
    return k_idx

In [21]:
def closest(tweet_id,tweets,en_embedding,planes,hash_tables,id_tables,k):
    tweet = tweets[tweet_id]
    doc_em = get_tweet_embedding(tweet,en_embedding)
    id_to_consider = []
    vecs_to_consider = []
    
    for i in range(len(planes)):
        h_v = hash_value_of_vector(doc_em,planes[i])
        hash_table = hash_tables[i] # hash table for corresponding set of planes
        doc_vecs = hash_table[h_v]
        id_table = id_tables[i]
        id_vecs = id_table[h_v]
        
        if tweet_id in id_vecs:
            id_vecs.remove(tweet_id)
        
        for i,new_id in enumerate(id_vecs):
            if new_id not in id_to_consider:
                id_to_consider.append(new_id)
                vecs_to_consider.append(tweets[new_id])
                
    vecs_to_consider_arr = np.array(vecs_to_consider)
    li,a = get_document_vecs(vecs_to_consider_arr,en_embedding)
    tweet_vec = get_tweet_embedding(tweet,en_embedding)
    idx = nearest_neighbor(tweet_vec,li.tolist(),k)
    final_ids = [id_to_consider[i] for i in idx]
    return final_ids

In [22]:
ids = closest(888,tweets,en_embeddings,planes_l,hash_tables,id_tables,3)
ids

[3128, 25, 9770]

In [23]:
print("Original Tweet: ",tweets[888])
print()
print("Similar Tweets:")
print("1) ",tweets[ids[0]])
print("2) ",tweets[ids[1]])
print("3) ",tweets[ids[2]])

Original Tweet:  Thanks for updating your profile page @AlexaPoppe :-)  http://t.co/JK3NSXIRMe

Similar Tweets:
1)  @triangledarren thank you :)
2)  @Bosslogic @amellywood @CW_Arrow @ARROWwriters Thank you! :-)
3)  @The5BallOver @Radio702 :-( It's not a challenge though. Please check our FB page for entries and rather do a substitution. Thanks!
