# PRACTICE

#### Vladimir Trukhaev & Ingrid Sancho

In [1]:
#imports 
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import json
from collections import Counter

In [2]:
#updating/downloading stop words
nltk.download('stopwords')
#reading data
doc = "dataset_tweets_WHO.txt"
with open(doc, 'r') as file:
    data = json.load(file)
    
#initializing dictionary "my_dict" where value is the tweet text and key its id
keylist = []
for key in data:
    keylist.append(key)

my_dict = {}
docs_info = {}

for i in keylist:
    my_dict[i] = None
    docs_info[i] = None
    
for key in data:
    #initializing my_dict
    tweet = []
    for i in data[key]["full_text"]:
        tweet.append(i)
    tweet1 = "".join(tweet)
    my_dict[key] = tweet1
    
    #creting docs_info
    tweet = data[key]["full_text"]
    username = data[key]["user"]["name"]
    date = data[key]["created_at"]
    hashtags = data[key]["entities"]["hashtags"]
    likes = data[key]["favorite_count"]
    retweets = data[key]["retweet_count"]
    try:
        url = data[key]["entities"]["media"][0]["expanded_url"]
    except: #sometimes we weren't able to find the url in the data, then:
        url = "https://twitter.com/WHO/status/%s" % (data[key]["id_str"])
    
    info = {"tweet": tweet, "username": username, 
            "date": date, "hashtags": hashtags, 
            "likes": likes,"retweets": retweets, "url": url}
    docs_info[key] = info
    
 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ingrid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Part 1: Text Processing

In [3]:
def lowering(d):
    """
    Transforming tweet text (values in dictionary) in lowercase
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets transformed to lowercase as values
    """
    for key in d:
        d[key] = d[key].lower()
    return d

def cleaning(d):
    """
    Removing anything that is not alphanumeric
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with tweets without any non alphanumeric character
    """
    for key in d:
        d[key] = ["".join(re.sub(r'[^A-Za-z0-9 #]', ' ', i) for i in d[key])]
    return d

def tokenize(d):
    """
    Tokenizing the tweets, in other words, splitting text by "words"
    
    Argument:
    d -- dictionary where tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values
    """
    for key in d:
        for sentence in d[key]:
            d[key] = sentence.split()
    return d

def stpwords(d):
    """
    Removing stopwords, which are very common words that do not contain meaning
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now with no stopwords
    """
    languages = ["english", "spanish", "french"]
    for language in languages:
        stop_words = set(stopwords.words(language))
        for key in my_dict:
            my_dict[key] = [word for word in my_dict[key] if word not in stop_words]
    return d

def stemming(d):
    """
    Stemming tweets, which means to keep only the "root" of each word
    
    Argument:
    d -- dictionary where list of words of tweets are stored as values
    
    Returns:
    d -- dictionary with lists of words as values, now stemmed words
    """
    stemmer = PorterStemmer()
    for key in my_dict.keys():
        my_dict[key] = [stemmer.stem(word) for word in my_dict[key]]
    return d

In [4]:
#running every function for our dictionary of tweets "my_dict"
my_dict = lowering(my_dict)
my_dict = cleaning(my_dict)
my_dict = tokenize(my_dict)
my_dict = stpwords(my_dict)
my_dict = stemming(my_dict)

# Part 2:Indexing and Evaluation

## 11/11/21

#### Vladimir Trukhaev & Ingrid Sancho

### 2. Indexing

In [5]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time

#### 2.1 Index Implementation

In [6]:
def create_index(my_dict):
    """
    Implement the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a Python dictionary) containing terms as keys and the corresponding
    list of documents where these keys appears in (and the positions) as values.
    """
    index = defaultdict(list)
    title_index = {}  # dictionary to map page titles to page ids
     ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is 
        ##"web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================
    for doc in my_dict:
        current_page_index = {}
        for position, term in enumerate(my_dict[doc]): # terms contains page_title + page_text. Loop over all terms
            try:
                  # if the term is already in the index for the current page (current_page_index)
                  # append the position to the corresponding list
          ## START CODE
                   current_page_index[term][1].append(position)  
            except:
                  # Add the new term as dict key and initialize the array of positions and add the position
                    current_page_index[term]=[doc, array('I',[position])] #'I' indicates unsigned int (int in Python)
              
          #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)
          
          ## END CODE                    
                    
    return index

In [7]:
import time

start_time = time.time()
index = create_index(my_dict)
print("Total time to create the index: {} seconds".format(np.round(time.time() - start_time, 2 )))

Total time to create the index: 0.09 seconds


In [8]:
print("Index results for the term 'researcher': {}\n".format(index['researcher']))
print("First 10 Index results for the term 'research': \n{}".format(index['research']))

Index results for the term 'researcher': []

First 10 Index results for the term 'research': 
[['22', array('I', [17])], ['153', array('I', [21])], ['171', array('I', [17])], ['203', array('I', [5])], ['210', array('I', [10])], ['211', array('I', [7])], ['221', array('I', [6])], ['422', array('I', [12])], ['428', array('I', [4])], ['459', array('I', [23])], ['814', array('I', [0])], ['960', array('I', [4])], ['1246', array('I', [14])], ['1341', array('I', [9])], ['1383', array('I', [3, 13])], ['1384', array('I', [14])], ['1389', array('I', [6])], ['1391', array('I', [10])], ['1392', array('I', [22])], ['1393', array('I', [5])], ['1394', array('I', [15])], ['1419', array('I', [26])], ['1611', array('I', [13])], ['1630', array('I', [2])], ['1784', array('I', [7])], ['1941', array('I', [4])], ['1972', array('I', [11])], ['2063', array('I', [15, 23])], ['2101', array('I', [10])], ['2103', array('I', [10])], ['2324', array('I', [12])]]


#### 2.2 Querying the Index

In [9]:
def build_terms(line):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """

    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    ## START CODE
    line=  line.lower() ## Transform in lowercase
    line=  line.split() ## Tokenize the text to get a list of terms
    line=[word for word in line if not word in stop_words]  ##eliminate the stopwords (HINT: use List Comprehension)
    line=[stemmer.stem(word) for word in line] ## perform stemming (HINT: use List Comprehension)
    ## END CODE
    return line

In [10]:
def search(query, index):
    """
    The output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
    ## START DODE
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    return docs

In [11]:
#defining docs_info from original data
doc = "dataset_tweets_WHO.txt"
with open(doc, 'r') as file:
    data = json.load(file)
    
#initializing dictionary "my_dict" where value is the tweet text and key its id
keylist = []
for key in data:
    keylist.append(key)

docs_info = {}
    
for i in keylist:
    docs_info[i] = None
    
for key in data:
    tweet = data[key]["full_text"]
    username = data[key]["user"]["name"]
    date = data[key]["created_at"]
    hashtags = data[key]["entities"]["hashtags"]
    likes = data[key]["favorite_count"]
    retweets = data[key]["retweet_count"]
    try:
        url = data[key]["entities"]["media"][0]["expanded_url"]
    except: #sometimes we weren't able to find the url in the data, then:
        url = "https://twitter.com/WHO/status/%s" % (data[key]["id_str"])
    
    info = {"tweet": tweet, "username": username, 
            "date": date, "hashtags": hashtags, 
            "likes": likes,"retweets": retweets, "url": url}
    docs_info[key] = info
    

In [12]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
docs = search(query, index)
top = 10

print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(docs)))
for d_id in docs[:top]:
    info_list = ["tweet","username","date","hashtags","likes","retweets","url"]
    print("DOC", d_id, "has been retrieved")
    for i in info:
        print(i,":",docs_info[d_id][i])
    print("\n")

Insert your query (i.e.: Computer Science):

Covid help

Sample of 10 results out of 181 for the searched query:

DOC 1264 has been retrieved
tweet : @DrTedros "Our colleagues at @pahowho have helped send emergency medical teams &amp; supplies. They have also been on the ground supporting the authorities with an assessment of the damage to health facilities. Four have been destroyed &amp; a further 20 damaged"-@DrTedros https://t.co/BTTucE6d3d
username : World Health Organization (WHO)
date : Wed Aug 18 13:31:48 +0000 2021
hashtags : []
likes : 57
retweets : 20
url : https://twitter.com/WHO/status/1427986589842608130


DOC 1453 has been retrieved
tweet : While #COVID19 cases are slowing down in some part of the world, WHO steps up the fight to help countries still facing surges in infections and new challenges.

Read more 👉https://t.co/LXb0xnuP7i #WHOImpact https://t.co/aluig4piUX
username : World Health Organization (WHO)
date : Sat Aug 07 12:56:52 +0000 2021
hashtags : [{'text': 'COV

#### 2.3. Add Ranking with TF-IDF

In [13]:
def create_index_tfidf(my_dict, num_documents):
    """
    Implement the inverted index and compute tf, df and idf
    
    Argument:
    lines -- collection of Wikipedia articles
    num_documents -- total number of documents
    
    Returns:
    index - the inverted index (implemented through a Pyhon dictionary) containing terms as keys and the corresponding
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """

    index = defaultdict(list)
    tf = defaultdict(list)  #term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  #document frequencies of terms in the corpus
    idf = defaultdict(float)

     ## ===============================================================        
        ## create the index for the current page and store it in current_page_index (current_page_index)
        ## current_page_index ==> { ‘term1’: [current_doc, [list of positions]], ...,‘term_n’: [current_doc, [list of positions]]}

        ## Example: if the curr_doc has id 1 and his text is 
        ##"web retrieval information retrieval":

        ## current_page_index ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}

        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================
    for doc in my_dict:
        current_page_index = {}
        for position, term in enumerate(my_dict[doc]): # terms contains page_title + page_text. Loop over all terms
            try:
                  # if the term is already in the index for the current page (current_page_index)
                  # append the position to the corresponding list
          ## START CODE
                   current_page_index[term][1].append(position)  
            except:
                  # Add the new term as dict key and initialize the array of positions and add the position
                    current_page_index[term]=[doc, array('I',[position])] #'I' indicates unsigned int (int in Python)
              
        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm = 0
        for term, posting in current_page_index.items():
            # posting will contain the list of positions for current term in current document. 
            # posting ==> [current_doc, [list of positions]] 
            # you can use it to infer the frequency of current term.
            norm += len(posting) ** 2
        norm = math.sqrt(norm)

        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in current_page_index.items():
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting)/norm,4)) ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term] = tf[term] # increment DF for current term

        #merge the current page index with the main index
        for term_page, posting_page in current_page_index.items():
            index[term_page].append(posting_page)

        # Compute IDF following the formula (3) above. HINT: use np.log
        for term in df:
            idf[term] = np.round(np.log(float(num_documents/len(df[term]))), 4)

    return index, tf, df, idf

In [14]:
start_time = time.time()
num_documents = len(my_dict)
index, tf, df, idf = create_index_tfidf(my_dict, num_documents)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 84.14 seconds


In [19]:
index

defaultdict(list,
            {'intern': [['0', array('I', [0])],
              ['2', array('I', [0])],
              ['35', array('I', [1])],
              ['262', array('I', [0])],
              ['273', array('I', [0])],
              ['277', array('I', [0])],
              ['278', array('I', [1])],
              ['280', array('I', [1])],
              ['299', array('I', [0])],
              ['338', array('I', [0])],
              ['359', array('I', [18])],
              ['363', array('I', [10])],
              ['389', array('I', [16])],
              ['450', array('I', [7, 14])],
              ['458', array('I', [6])],
              ['498', array('I', [7])],
              ['553', array('I', [10])],
              ['584', array('I', [0])],
              ['677', array('I', [2])],
              ['761', array('I', [8])],
              ['776', array('I', [7])],
              ['779', array('I', [3])],
              ['856', array('I', [8])],
              ['881', array('I', [0])],
         

In [15]:
def rank_documents(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    
    Returns:
    Print the list of ranked documents
    """

    # I'm interested only on the element of the docVector corresponding to the query terms 
    # The remaining elements would became 0 when multiplied to the query_vector
    doc_vectors = defaultdict(lambda: [0] * len(terms)) # I call doc_vectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary
    query_vector = [0] * len(terms)

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms)  # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    #HINT: use when computing tf for query_vector

    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
        if term not in index:
            continue
  
        ## Compute tf*idf(normalize TF as done with documents)
        query_vector[termIndex] =  query_terms_count[term]/len(terms)*idf[term]

        # Generate doc_vectors for matching docs
        for doc_index, (doc, postings) in enumerate(index[term]):
            # Example of [doc_index, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....

            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] * idf[term]  # TODO: check if multiply for idf

    # Calculate the score of each doc 
    # compute the cosine similarity between queyVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine similarity
    # see np.dot
    
    doc_scores=[[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items() ]
    doc_scores.sort(reverse=True)
    result_docs = [x[1] for x in doc_scores]
    #print document titles instead if document id's
    #result_docs=[ title_index[x] for x in result_docs ]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    #print ('\n'.join(result_docs), '\n')
    return result_docs

In [16]:
def search_tf_idf(query, index):
    """
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    """
    query = build_terms(query)
    docs = set()
    for term in query:
        try:
            # store in term_docs the ids of the docs that contain "term"                        
            term_docs=[posting[0] for posting in index[term]]
            
            # docs = docs Union term_docs
            docs = docs.union(term_docs)
        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rank_documents(query, docs, index, idf, tf)
    return ranked_docs

In [17]:
print("Insert your query (i.e.: Computer Science):\n")
query = input()
ranked_docs = search_tf_idf(query, index)
top = 10

print("\n======================\nTop {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for d_id in ranked_docs[:top]:
    info_list = ["tweet","username","date","hashtags","likes","retweets","url"]
    print("DOC", d_id, "has been retrieved")
    for i in info:
        print(i,":",docs_info[d_id][i])
    print("\n")

Insert your query (i.e.: Computer Science):

civid HELP

Top 10 results out of 151 for the searched query:

DOC 1868 has been retrieved
tweet : Getting vaccinated 💉 against #COVID19 helps protect you from getting sick. https://t.co/9Ad3uW0NsN
username : World Health Organization (WHO)
date : Wed Jul 21 15:14:10 +0000 2021
hashtags : [{'text': 'COVID19', 'indices': [29, 37]}]
likes : 366
retweets : 155
url : https://twitter.com/WHO/status/1417865492333621253/photo/1


DOC 1469 has been retrieved
tweet : Getting vaccinated 💉 can help protect you and those around you from #COVID19 ⬇️ https://t.co/3GrojOAWT6
username : World Health Organization (WHO)
date : Fri Aug 06 10:29:12 +0000 2021
hashtags : [{'text': 'COVID19', 'indices': [68, 76]}]
likes : 253
retweets : 117
url : https://twitter.com/WHO/status/1423591981456838656/photo/1


DOC 61 has been retrieved
tweet : RT @WHOPhilippines: Vaccines can’t stop #COVID19 alone, but by doing it all we can help protect ourselves and our loved ones 

In [18]:
terms = ["hi", "i", "i", "am" ,"potato", "am", "am"]
query_terms_count = collections.Counter(terms)

query_norm = la.norm(list(query_terms_count.values()))

for termIndex, term in enumerate(terms):  #termIndex is the index of the term in the query
    if term not in index:
        continue
  
 ## Compute tf*idf(normalize TF as done with documents)
    #query_vector[termIndex] =  query_terms_count[term]
    
print(query_terms_count["am"])

3
