# FINAL PROJECT PART 3

#### Vladimir Trukhaev & Ingrid Sancho

In [1]:
#imports 
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import json
import collections
from collections import Counter
from collections import defaultdict
from array import array
import math
import numpy as np
from numpy import linalg as la
import time

In [2]:
#updating/downloading stop words
nltk.download('stopwords')
#reading data
doc = "dataset_tweets_WHO.txt"
with open(doc, 'r') as file:
    data = json.load(file)
    
#initializing dictionary "my_dict" where value is the tweet text and key its id
keylist = []
for key in data:
    keylist.append(key)

my_dict = {}
docs_info = {}

for i in keylist:
    my_dict[i] = None
    docs_info[i] = None
    
for key in data:
    #initializing my_dict
    tweet = []
    for i in data[key]["full_text"]:
        tweet.append(i)
    tweet1 = "".join(tweet)
    my_dict[key] = tweet1

    #creting docs_info
    tweet = data[key]["full_text"]
    username = data[key]["user"]["name"]
    date = data[key]["created_at"]
    hashtags = data[key]["entities"]["hashtags"]
    likes = data[key]["favorite_count"]
    retweets = data[key]["retweet_count"]
    try:
        url = data[key]["entities"]["media"][0]["expanded_url"]
    except: #sometimes we weren't able to find the url in the data, then:
        url = "https://twitter.com/WHO/status/%s" % (data[key]["id_str"])
    
    info = {"tweet": tweet, "username": username, 
            "date": date, "hashtags": hashtags, 
            "likes": likes,"retweets": retweets, "url": url}
    docs_info[key] = info
    
 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vladi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Parts 1 and 2: Text Processing and Indexing

In [3]:
def lowering(d):
    """
    Transforming tweet text (values in dictionary) in lowercase
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets transformed to lowercase as values
    """
    for key in d:
        d[key] = d[key].lower()
    return d

def cleaning(d):
    """
    Removing anything that is not alphanumeric
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets without any non alphanumeric character
    """
    for key in d:
        d[key] = ["".join(re.sub(r'[^A-Za-z0-9 #]', ' ', i) for i in d[key])]
    return d

def tokenize(d):
    """
    Tokenizing the tweets, in other words, splitting text by "words"
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values
    """
    for key in d:
        for sentence in d[key]:
            d[key] = sentence.split()
    return d

def stpwords(d):
    """
    Removing stopwords, which are very common words that do not contain meaning
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now with no stopwords
    """
    languages = ["english", "spanish", "french"]
    for language in languages:
        stop_words = set(stopwords.words(language))
        for key in my_dict:
            my_dict[key] = [word for word in my_dict[key] if word not in stop_words]
    return d

def stemming(d):
    """
    Stemming tweets, which means to keep only the "root" of each word
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now stemmed words
    """
    stemmer = PorterStemmer()
    for key in my_dict.keys():
        my_dict[key] = [stemmer.stem(word) for word in my_dict[key]]
    return d

def build_terms(query): #used to process query text
    """
    Preprocess the input query removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    query -- string (text) to be preprocessed
    
    Returns:
    query - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    query=  query.lower() ## Transform in lowercase
    query=  query.split() ## Tokenize the text to get a list of terms
    query=[word for word in query if not word in stop_words]  ##eliminate the stopwords
    query=[stemmer.stem(word) for word in query] ## perform stemming
    
    return query

In [4]:
#running every function for our dictionary of tweets "my_dict"
my_dict = lowering(my_dict)
my_dict = cleaning(my_dict)
my_dict = tokenize(my_dict)
my_dict = stpwords(my_dict)
my_dict = stemming(my_dict)

In [5]:
def create_index_tfidf(my_dict, num_documents):
    """
    Implement the inverted index and compute tf, df and idf
    
    Argument:
    my_dict -- collection of Wikipedia articles
    num_documents -- total number of documents
    
    Returns:
    index - the inverted index (implemented through a Pyhon dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(list)
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    idf = defaultdict(float)
    for doc in my_dict:
        current_tweet_index = {}
        for position, term in enumerate(my_dict[doc]): # terms contains page_title + page_text. Loop over all terms
            try:
                  # if the term is already in the index for the current page (current_tweet_index)
                  # append the position to the corresponding list
                   current_tweet_index[term][1].append(position)  
            except:
                  # Add the new term as dict key and initialize the array of positions and add the position
                    current_tweet_index[term]=[doc, array('I',[position])] #'I' indicates unsigned int (int in Python)
              
        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_tweet_index.items():
            # posting will contain the list of positions for current term in current document. 
            # posting ==> [current_doc, [list of positions]] 
            # you can use it to infer the frequency of current term.
            norm += len(posting) ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_tweet_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting)/norm,4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] = tf[term] # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_tweet_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = np.round(np.log(float(num_documents/len(df[term]))), 4)

    return index, tf, df, idf



def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    
    Returns:
    Print the list of ranked documents
    """

    # I'm interested only on the element of the docVector corresponding to the query terms 
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue
  
        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] =  query_terms_count[term]/len(terms)*idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  # TODO: check if multiply for idf

    # Calculate the score of each doc 
    # compute the cosine similarity between queyVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    #print document titles instead if document id's
    #result_docs=[ title_index[x] for x in result_docs ]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    return result_docs

def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rank_documents(query, docs, index, idf, tf)
    return ranked_docs

def rank_query(query):
    ranked_docs = search_tf_idf(query, index)
    top = 10

    print("\n======================\nTop {} results out of {} for".format(top, len(ranked_docs)), query,":\n")
    for d_id in ranked_docs[:top]:
        info_list = ["tweet","username","date","hashtags","likes","retweets","url"]
        print("DOC", d_id, "has been retrieved")
        for i in info:
            print(i,":",docs_info[d_id][i])
        print("\n")

# Part 3 TF-IDF with Cosine Similarity

In [6]:
start_time = time.time()
num_documents = len(my_dict)
index, tf, df, idf = create_index_tfidf(my_dict, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

rank_query("Covid España ola") 
#rank_query("how many covid cases")
#rank_query("mortalidad covid")
#rank_query("Covid prevention")
#rank_query("Pandemia mundial")

Total time to create the index: 111.38 seconds

Top 10 results out of 38 for Covid España ola :

DOC 2257 has been retrieved
tweet : Q&amp;A #AskWHO on COVID-19 vaccines effectiveness https://t.co/FEdfOREhjn
username : World Health Organization (WHO)
date : Wed Jun 30 16:12:43 +0000 2021
hashtags : [{'text': 'AskWHO', 'indices': [8, 15]}]
likes : 219
retweets : 85
url : https://twitter.com/WHO/status/1410270080873598979


DOC 61 has been retrieved
tweet : RT @WHOPhilippines: Vaccines can’t stop #COVID19 alone, but by doing it all we can help protect ourselves and our loved ones against COVID-…
username : World Health Organization (WHO)
date : Mon Oct 11 04:39:10 +0000 2021
hashtags : [{'text': 'COVID19', 'indices': [40, 48]}]
likes : 0
retweets : 71
url : https://twitter.com/WHO/status/1447421491428143106


DOC 1959 has been retrieved
tweet : 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in t

# Part 3 Our Score with Cosine Similarity

In [7]:
#Calculation total likes and total retweets to normalize values later
total_likes = 0
total_retweets = 0

for i in my_dict:
    total_likes += docs_info["0"]["likes"]
    total_retweets += docs_info["0"]["retweets"]
    
print(total_likes)
print(total_retweets)

124748
38384


In [8]:
def create_our_score(my_dict, num_documents):
    """
    Implement the inverted index and compute tweet_score
    
    Argument:
    my_dict -- collection of Wikipedia articles
    num_documents -- total number of documents
    
    Returns:
    index - the inverted index (implemented through a Pyhon dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tweet_scores - score of each tweet depending on popularity
    """

    index = defaultdict(list)
    tweet_score =  {}
    
    for doc in my_dict:
        current_tweet_index = {}
        for position, term in enumerate(my_dict[doc]): # terms contains page_title + page_text. Loop over all terms
            try:
                  # if the term is already in the index for the current page (current_tweet_index)
                  # append the position to the corresponding list
                   current_tweet_index[term][1].append(position)  
            except:
                  # Add the new term as dict key and initialize the array of positions and add the position
                    current_tweet_index[term]=[doc, array('I',[position])] #'I' indicates unsigned int (int in Python)
 
       
        score = {}
        for term, posting in current_tweet_index.items():
            # Compute tweet score, depending on the importance/popularity of the tweet
            likes = docs_info[doc]["likes"]
            retweets = docs_info[doc]["retweets"]
            factor = likes/total_likes + retweets / total_retweets
            score[term] = 100 * np.round(factor, 4)
        
        tweet_score[doc] = score

        #merge the current page index with the main index
        for term_page, posting_page in current_tweet_index.items():
            index[term_page].append(posting_page)
            
    return index, tweet_score



def our_rank_documents(terms, docs, index, tweet_score):
    """
    Perform the ranking of the results of a search based on the tweet scores
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    tweet_score -- inverted document frequencies
    
    Returns:
    The result documents ranked from higher to lower doc_scores
    """

    doc_vectors = defaultdict(lambda: [0] * len(terms)) 
    query_vector = [0] * len(terms)
    query_terms_count = collections.Counter(terms)  
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue       

        for doc_index, (doc, postings) in enumerate(index[term]):  
            query_vector[termIndex] =  query_terms_count[term]*tweet_score[doc][term]/len(terms)
            doc_vectors[doc][termIndex] = tweet_score[doc][term]

    
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = our_search(query, index)
    return result_docs

def our_search(query, index):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:                       
            term_docs=[posting[0] for posting in index[term]]
            docs = docs.union(term_docs)
        except:
            pass
    docs = list(docs)
    ranked_docs = our_rank_documents(query, docs, index, tweet_score)
    return ranked_docs

def our_rank_query(query):
    ranked_docs = our_search(query, index)
    top = 10

    print("\n======================\nTop {} results out of {} for".format(top, len(ranked_docs)), query,":\n")
    for d_id in ranked_docs[:top]:
        info_list = ["tweet","username","date","hashtags","likes","retweets","url"]
        print("DOC", d_id, "has been retrieved")
        for i in info_list:
            print(i,":",docs_info[d_id][i])
        print("\n")

In [9]:
start_time = time.time()
num_documents = len(my_dict)
index, tweet_score = create_our_score(my_dict, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.67 seconds


In [10]:
our_rank_query("Covid España ola")
#our_rank_query("how many covid cases")
#our_rank_query("mortalidad covid")
#our_rank_query("Covid prevention")
#our_rank_query("Pandemia mundial")


Top 10 results out of 38 for Covid España ola :

DOC 1959 has been retrieved
tweet : 💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉
💉💉💉💉                 💉💉💉💉
💉💉💉💉                 💉💉💉💉

COVID-19 vaccines     COVID-19 vaccines
in 10 countries             in the rest of the 🌍

#VaccinEquity is 🗝️ to ending the pandemic, together!

#WorldEmojiDay
username : World Health Organization (WHO)
date : Sat Jul 17 16:24:23 +0000 2021
hashtags : [{'text': 'VaccinEquity', 'indices': [163, 176]}, {'text': 'WorldEmojiDay', 'indices': [218, 232]}]
likes : 3486
retweets : 1517
url : https://twitter.com/WHO/status/1416433609091653633


DOC 1325 has been retrieved
tweet : 3 factors that can help you make safe choices when you're in an area of widespread #COVID19 transmission:

1️⃣ Location
2️⃣ Proximity with others
3️⃣ Time you spend there

COVID-19 advice for the public 👉https://t.co/auHlD1QoOX
https://t.co/8lAA9M4tOa
username : World Health Organization (WHO)
date : Sat Aug 14 04:19:49 +0000 2021
hashtags : [{'text': 'COVID19', 

# Part 3 Word2Vec Cosine Similarity

In [11]:
from gensim.models import Word2Vec

In [12]:
#creating the word2vec model for all words
model = Word2Vec.load("word2vec.model")
data = list(my_dict.values())
model = Word2Vec(data, min_count=1,workers=3, window =3, sg = 1)
print('Vocabulary size:', len(model.wv))

Vocabulary size: 6135


In [13]:
print(model.wv["covid"])

[-0.17750047  0.2753453   0.07210252  0.09172665 -0.0166772  -0.24623951
 -0.01476946  0.42040315 -0.07766458 -0.26529142 -0.02193881 -0.36936194
  0.02012352 -0.07556673  0.00131207 -0.1160517   0.1481008  -0.19787352
 -0.10737696 -0.47864127 -0.03075087  0.02236611  0.18150647 -0.14117697
 -0.15420394  0.190576   -0.22254181 -0.09717412  0.05581233 -0.096796
  0.22647312  0.11482931  0.08320782 -0.06375208 -0.08854322  0.07898424
  0.02796204 -0.07855964 -0.12936035 -0.32306537  0.14464046 -0.25103387
 -0.13584764 -0.02176412  0.06752764 -0.02452244 -0.08505324  0.10263211
  0.04678888  0.24983966  0.14914338 -0.22693233 -0.15400195  0.08527599
 -0.3661061   0.07176106  0.07126559 -0.22174682 -0.2969128   0.17361921
 -0.02904036  0.11960578  0.03548748 -0.03464152 -0.3053323   0.10955701
  0.12347759  0.2815211  -0.16700931  0.29589415 -0.19407175  0.14546491
  0.28786665 -0.16540863  0.36104494  0.0437502  -0.12143011 -0.03212233
 -0.14780216 -0.02381637 -0.29838827 -0.0263265  -0.2

In [14]:
# Function returning a 
tweet2vec = {}
def doc_2_vec(my_dict):
    """
    Creating a dictionary with each tweet's vector reperesentation from the word2vec model
    
    Argument:
    my_dict -- dictioanary of list of tearm, previously created and used
    
    Returns:
    tweet2vec -- dictionary with doc id as key and vector representation as value
    """
    for doc in my_dict:
        embeddings = []
        tweet_vec = []
        for token in doc:
            if token in model.wv:
                embeddings.append(model.wv.word_vec(token))
            else:
                embeddings.append(np.random.rand(100))
            # averaging all the vectors of individual words to get the vector of the tweet
        tweet_vec = np.mean(embeddings, axis=0)
        tweet2vec[doc] = tweet_vec
    return tweet2vec

#Function returning vector resprsentation of a query
def query_2_vec(query):
    """
    Creating the query's vector reperesentation from the word2vec model
    
    Argument:
    query -- input terms for which we what to search 
    
    Returns:
    query2vec -- vector representation of the query
    """
    embeddings = []
    for token in query:
        if token in model.wv:
            embeddings.append(model.wv.word_vec(token))
        else:
            embeddings.append(np.random.rand(100))
        # averaging all the vectors of individual words to get the vector of the tweet
    query2vec = np.mean(embeddings, axis=0)
    return query2vec

#in order to compute cosine similarity on non normalized vectors
from gensim import matutils
def similarity_cosine(vec1, vec2):
    cosine_similarity = np.dot(matutils.unitvec(vec1), matutils.unitvec(vec2))
    return cosine_similarity

In [15]:
def top_20(q):
    """
    This function first processes teh query and calls to the query vector represenattion function.
    Then it computes the cosine similarity on the tweets vectors and the query vector.
    Finaly, it prints the 20 most similar tweets.
    
    Argument:
    q -- the query, for which we what to search 
    
    """
    similarity = []
    top = 20
    query = build_terms(q) #text processing the query
    query2vec = query_2_vec(query) #converting query to vector
    for doc in tweet2vec:
        similarity.append([similarity_cosine(tweet2vec[doc] , query2vec), doc])
        
    similarity.sort(reverse=True)
    result_docs = [x[1] for x in similarity]
    
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = our_search(query, index)
   
    else:
        print("\n======================\nTop {} results out of {} for".format(top, len(result_docs)), q,":\n")
        for d_id in result_docs[:top]:
            info_list = ["tweet","username","date","hashtags","likes","retweets","url"]
            print("DOC", d_id, "has been retrieved")
            for i in info_list:
                print(i,":",docs_info[d_id][i])
            print("\n")

In [16]:
#creating the dictionary of tweet vectors
tweet2vec = doc_2_vec(my_dict)

  embeddings.append(model.wv.word_vec(token))


In [17]:
top_20("Covid España ola")
#top_20("how many covid cases")
#top_20("mortalidad covid")
#top_20("Covid prevention")
#top_20("Pandemia mundial")


Top 20 results out of 2399 for Covid España ola :

DOC 99 has been retrieved
tweet : If someone you know has #depression, here is how you can help:
💙 Encourage regular eating, sleeping and exercise
💙 Encourage them to focus on the positive, rather than the negative
💙 Be patient - recovery can take time

#WorldMentalHealthDay #LetsTalk https://t.co/u92OcTVT94
username : World Health Organization (WHO)
date : Sun Oct 10 07:16:08 +0000 2021
hashtags : [{'text': 'depression', 'indices': [24, 35]}, {'text': 'WorldMentalHealthDay', 'indices': [220, 241]}, {'text': 'LetsTalk', 'indices': [242, 251]}]
likes : 539
retweets : 246
url : https://twitter.com/WHO/status/1447098609447669761/photo/1


DOC 9 has been retrieved
tweet : RT @DrTedros: Broad administration of booster doses is unfair, unjust &amp; immoral at a time when #healthworkers &amp; at most risk people in many…
username : World Health Organization (WHO)
date : Tue Oct 12 21:01:40 +0000 2021
hashtags : [{'text': 'healthworkers', 'in

  embeddings.append(model.wv.word_vec(token))
