In [1]:
import os
import re
import nltk
import gensim
import scipy
import sklearn as sk
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from gensim.models import KeyedVectors

In [2]:
def get_dict(file_name):
    """
    This function returns the english to french dictionary given a file where the each column corresponds to a word.
    Check out the files this function takes in your workspace.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        # indexing into the rows.
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof

Download eng embedding from and [look for GoogleNews-vectors-negative300.bin.gz](https://code.google.com/archive/p/word2vec/)

For french embedding:
in the terminal, type (in one line) curl -o ./wiki.multi.fr.vec https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec


In [3]:
# Use this code to download and process the full dataset on your local computer

en_embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)
fr_embeddings = KeyedVectors.load_word2vec_format('./wiki.multi.fr.vec')

In [4]:


# get the positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
all_tweets = all_positive_tweets + all_negative_tweets

In [5]:
import string
# preprocess tweet
stemmer = PorterStemmer()
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

stopwords_eng = stopwords.words("english")
punctuations_eng = string.punctuation

def preprocess_tweet(tweet):
    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)
    
    # remove hyperlinks
    tweet = re.sub(r"https?:\/\/[.a-zA-Z\/-]*[\r\n]*", "", tweet)
    
    # remove the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    tweet_tokens = tokenizer.tokenize(tweet)
    
    tweet_tokens_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_eng and word not in punctuations_eng):
            stemmed_word = stemmer.stem(word)
            
            tweet_tokens_clean.append(stemmed_word)
    
    return tweet_tokens_clean

def get_doc_embedding(tweet, en_embeddings):
    doc_embedding = np.zeros(300)
    
    preprocessed_tweet = preprocess_tweet(tweet)
    
    for word in preprocessed_tweet:
        if word in en_embeddings:
            doc_embedding += en_embeddings[word]
        
    return doc_embedding
    

# testing your function
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

tweet_embedding = get_doc_embedding(custom_tweet, en_embeddings)
tweet_embedding[-5:]

array([-0.4440918 , -0.53891373, -0.87011719, -0.12850189,  0.14611816])

In [6]:
def get_idx_to_embedding_dict_and_embedding_matrix(all_tweets, en_embeddings):
    embedding_matrix = []
    idx_to_embedding = {}
    for idx, tweet in enumerate(all_tweets):
        doc_embedding = get_doc_embedding(tweet, en_embeddings)
        embedding_matrix.append(doc_embedding)
        idx_to_embedding[idx] = doc_embedding
        
    embedding_matrix = np.vstack(embedding_matrix)
    
    return embedding_matrix, idx_to_embedding

def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [7]:
embedding_matrix, idx_to_embedding = get_idx_to_embedding_dict_and_embedding_matrix(all_tweets, en_embeddings)

In [9]:
print(f"length of dictionary {len(idx_to_embedding)}")
print(f"shape of document_vecs {embedding_matrix.shape}")

length of dictionary 10000
shape of document_vecs (10000, 300)


In [15]:
my_tweet = 'i am sad'
tweet_embedding = get_doc_embedding(my_tweet, en_embeddings)

idx = np.argmax(cosine_similarity(embedding_matrix, tweet_embedding))
print(idx)
print(all_tweets[idx])

9838
being sad for no reason sucks because u dunno how to stop being sad so u just gotta chill in ur room and listen to music &amp; b alone :(


In [16]:
NUM_VECS, EMBEDDING_DIM = embedding_matrix.shape
NUM_VECS, EMBEDDING_DIM

(10000, 300)

In [22]:
def get_hash_value(vec, planes):
    """Create a hash for a vector; hash_id says which random hash to use.
    Input:
        - v:  vector of tweet. It's dimension is (1, N_DIMS)
        - planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that divide up the region
    Output:
        - res: a number which is used as a hash for your vector
        
    """
    
    dot_product = np.dot(vec, planes)
    
    # get the sign of the dot product (1,10) shaped vector
    sign_of_dot_product = np.sign(dot_product)
    
    
    # set h to be false (eqivalent to 0 when used in operations) if the sign is negative,
    # and true (equivalent to 1) if the sign is positive (1,10) shaped vector
    h = sign_of_dot_product>=0
    
    # remove extra un-used dimensions (convert this from a 2D to a 1D array)
    h = np.squeeze(h)
    
    hash_value = 0
    
    n_planes = planes.shape[1]
    for i in range(n_planes):
        hash_value += np.power(2,i)*h[i]
    
    hash_value = int(hash_value)

    return hash_value


# The number of planes. We use log2(625) to have ~16 vectors/bucket.
N_PLANES = 10
# Number of times to repeat the hashing to improve the search.
N_UNIVERSES = 25

np.random.seed(0)
planes_l = [np.random.normal(size=(EMBEDDING_DIM, N_PLANES))
            for _ in range(N_UNIVERSES)]
vec = np.random.rand(1, 300)
hash_value = get_hash_value(vec, planes_l[0])    

In [65]:
def make_hash_table(vectors, planes):
    """
    Input:
        - vecs: list of vectors to be hashed.
        - planes: the matrix of planes in a single "universe", with shape (embedding dimensions, number of planes).
    Output:
        - hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
        - id_table: dictionary - keys are hashes, values are list of vectors id's
                            (it's used to know which tweet corresponds to the hashed vector)
    """
    
    # number of planes is the number of columns in the planes matrix
    num_of_planes = planes.shape[1]

    # number of buckets is 2^(number of planes)
    num_buckets = 2**num_of_planes
    
    hash_table = {idx: [] for idx in range(num_buckets)}
    id_table = {idx: [] for idx in range(num_buckets)}

    # for each vector in 'vecs'
    for idx, vec in enumerate(vectors):
        # calculate the hash value for the vector
        h = get_hash_value(vec, planes)

        hash_table[h].append(vec)
        id_table[h].append(idx)

    ### END CODE HERE ###

    return hash_table, id_table

In [66]:
# Grnerate table
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES):
    planes = planes_l[universe_id]
    hash_table, id_table = make_hash_table(embedding_matrix, planes)
    
    hash_tables.append(hash_table)
    id_tables.append(id_table)

In [69]:
def k_nearest_neighbours(v, candidates, k=1):
    
    neighbours = []
    for candidate in candidates:
        sim = cosine_similarity(v, candidate)
        neighbours.append(sim)
        
    sorted_idxs = np.argsort(neighbours)
    
    return sorted_idxs[-k:]

def approx_knn(vec, planes_l, num_universes, k=1):
    candidate_vectors = []
    candidate_ids = []
    candidate_id_set = set()
    
    for u_id in range(num_universes):
        planes = planes_l[u_id]
        
        hash_table = hash_tables[universe_id]
        id_table = id_tables[universe_id]
        
        hash_value = get_hash_value(vec, planes)
        
        # doc vectors for this universe id
        doc_vectors = hash_table[hash_value]
        doc_ids = id_table[hash_value]
        
        for idx, doc_id in enumerate(doc_ids):
            
            if doc_id not in candidate_id_set:
#                 print(idx, doc_id)
                
                candidate_vectors.append(doc_vectors[idx])
                candidate_ids.append(doc_id)
                
                candidate_id_set.add(doc_id)
                
    candidate_vectors = np.array(candidate_vectors)
    nearest_neighbor_ids = k_nearest_neighbours(vec, candidate_vectors, k=k)
    print(nearest_neighbor_ids)

    nearest_neighbor_ids = [candidate_ids[idx]
                        for idx in nearest_neighbor_ids]

    return nearest_neighbor_ids

        

vec = get_doc_embedding("I am sad", en_embeddings)
approx_knn(vec, planes_l, 25, k=3)

[177 192 194]


[6595, 8398, 8588]

In [70]:
all_tweets[8588]

"Nobodies up with me now, I'm sad :("