# Information Retrieval and Web Analytics: Indexing + Modeling (TF-IDF) 

In [1]:
# if you do not have nltk the following command should work "python -m pip install nltk" 
!pip install nltk
import nltk
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [28]:
import collections
import csv
import json
import math
import os
import re
import sys
import time
from array import array
from collections import defaultdict

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from numpy import linalg as la

# Directory that holds the dataset (and, later, the exported query results).
# It must end with a path separator because file names are appended to it directly.
data_path = './data/'

# When running on Google Colab, mount Google Drive and read the data from there instead.
# Outside Colab the `google.colab` package is not importable, so the local directory
# above is kept and the cell still runs.
try:
    from google.colab import drive
    from google.colab.patches import cv2_imshow
except ImportError:
    pass
else:
    drive.mount('/content/drive')
    data_path = '/content/drive/My Drive/Information retrieval/IR_WA_FinalProject-master/data/'

# Full path of the tweet corpus.
file_path = data_path + 'one100K_v2.json'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Preprocessing Data**

#### Load data into memory

The dataset ```dataset.json``` contains a list of N tweets and their associated information. This dataset was built by scraping tweets that contain any of the words "Trump", "#Trump", "Biden", "#Biden", "#UsElections2020".

In [29]:
# Read the corpus into memory; each non-empty line is handed to json.loads() later on.
docs_path = file_path
with open(docs_path, encoding='utf-8') as fp:
    lines = fp.readlines()

# Trim surrounding whitespace and collapse runs of spaces into a single space.
# A regular-expression substitution is required here: str.replace(' +', ' ') would only
# match the literal two-character text " +" and silently delete real plus signs.
# Blank lines are skipped because they cannot be parsed as a tweet.
tweets = [re.sub(r' +', ' ', l.strip()) for l in lines if l.strip()]

In [5]:
# Report how many raw lines (tweets) were loaded from the corpus file.
corpusSize = len(tweets)
print("Total numer of tweets in the corpus: {}".format(corpusSize))

Total numer of tweets in the corpus: 100001


#### Clean Tweets
Preprocess the text of a single tweet: transform it to lowercase, remove punctuation and other symbol characters, remove stop words and apply stemming. Returns the tokens of the text.
    
    Argument:  tweetText -- string (text) to be preprocessed    
    Returns:   cleanText - a list of tokens corresponding to the tweetText after the preprocessing

In [7]:
# Punctuation / symbol characters stripped from a tweet before it is tokenized.
# Written as a raw string so that `\[`, `\]` and `\-` reach the regex engine unchanged
# instead of being (invalid) Python string escapes.
_PUNCTUATION_PATTERN = re.compile(r'[:\[\]&%$\"\'!./,;:?=¿^\-#_*+)<>(¡@]')

# Lazily-built NLTK helpers shared by every cleanTweet() call, so the stop-word list
# and the stemmer are created once instead of once per tweet.
_CLEAN_TWEET_RESOURCES = {}


def cleanTweet(tweetText):
    """Turn raw tweet text into a list of normalized tokens.

    Steps, in order: lowercase the text, delete the punctuation/symbol characters
    listed in ``_PUNCTUATION_PATTERN`` (digits and letters are kept), split on
    whitespace, drop English stop words, and stem every remaining token with the
    Porter stemmer.

    Argument:  tweetText -- string (text) to be preprocessed
    Returns:   a list of tokens corresponding to tweetText after the preprocessing
    """
    if not _CLEAN_TWEET_RESOURCES:
        _CLEAN_TWEET_RESOURCES['stemmer'] = PorterStemmer()
        _CLEAN_TWEET_RESOURCES['stops'] = frozenset(stopwords.words("english"))
    stemming = _CLEAN_TWEET_RESOURCES['stemmer']
    stops = _CLEAN_TWEET_RESOURCES['stops']

    cleanText = _PUNCTUATION_PATTERN.sub('', tweetText.lower())
    # Stop words are filtered before stemming, so the comparison uses the unstemmed token.
    return [stemming.stem(word) for word in cleanText.split() if word not in stops]
    

#### Create a dictionary for each tweet
For each tweet, create a dictionary containing its most relevant information (username, original text, clean tokens, number of likes, number of retweets, list of URLs...)
    
    Argument:  tweet -- a JSON tweet content    
    Returns:   dictRelevantInfo -- a dictionary with the processed tweet

In [8]:
def _extractTweetFields(source):
    """Collect the per-tweet fields of interest from one parsed tweet object.

    Builds everything locally and returns it in one go, so a missing or malformed
    field (KeyError / TypeError) never leaves half-filled results behind.
    """
    entities = source['entities']
    return {
        'tweetID': source['id_str'],
        'text': source['text'],
        'username': source['user']['screen_name'],
        'hashtags': [h['text'] for h in entities['hashtags']],
        'likes': source['favorite_count'],
        'rt_count': source['retweet_count'],
        'urlsList': [url['url'] for url in entities['urls']],
    }


def getRelevantInfo(tweet):
    """Parse one JSON-encoded tweet and keep only the information used downstream.

    If the tweet carries a usable ``retweeted_status`` object, the ID, text, author,
    counters, hashtags and URLs are taken from that embedded tweet and
    ``isRetweeted`` is True. Otherwise (no such object, or it lacks an expected
    field) they are taken from the tweet itself and ``isRetweeted`` is False.

    Argument:  tweet -- a JSON tweet content (string)
    Returns:   dictRelevantInfo -- a dictionary with the processed tweet, with keys
               tweetID, text, tokens, username, date, hashtags, likes, rt_count,
               urlsList and isRetweeted
    Raises:    KeyError / TypeError if the tweet itself lacks an expected field;
               whatever json.loads raises for invalid JSON.
    """
    data = json.loads(tweet)
    # The date always comes from the outer object, so for a retweet it is the
    # creation time of the retweet, not of the embedded tweet.
    # NOTE(review): the original author flagged this choice as undecided.
    date = data['created_at']

    fields = None
    retweeted = data.get('retweeted_status')
    if retweeted is not None:
        try:
            fields = _extractTweetFields(retweeted)
        except (KeyError, TypeError):
            # Incomplete embedded tweet: fall back to the outer tweet's own fields.
            fields = None
    isRt = fields is not None
    if fields is None:
        fields = _extractTweetFields(data)

    dictRelevantInfo = {}
    dictRelevantInfo['tweetID'] = fields['tweetID']
    dictRelevantInfo['text'] = fields['text']
    dictRelevantInfo['tokens'] = cleanTweet(fields['text'])
    dictRelevantInfo['username'] = fields['username']
    dictRelevantInfo['date'] = date
    dictRelevantInfo['hashtags'] = fields['hashtags']
    dictRelevantInfo['likes'] = fields['likes']
    dictRelevantInfo['rt_count'] = fields['rt_count']
    dictRelevantInfo['urlsList'] = fields['urlsList']
    dictRelevantInfo['isRetweeted'] = isRt
    return dictRelevantInfo

#### cleanTweets dict & Drop Duplicates
`Here we create a dictionary (key: tweetID) of tweets. To do so, we iterate over the list of tweets from the dataset, preprocess each tweet, and add it to the cleanTweets dictionary if it has not been added before (check for duplicates).`

In [9]:
# Build one entry per tweet ID. An original tweet always wins: it is stored even if
# a retweet with the same ID was seen earlier. A retweet is stored only when nothing
# is registered under that ID yet.
cleanTweets = {}
for t in tweets:
    currentTweet = getRelevantInfo(t)
    tweetID = currentTweet['tweetID']
    if currentTweet['isRetweeted'] and tweetID in cleanTweets:
        # Duplicate of something we already hold; keep the existing entry.
        continue
    cleanTweets[tweetID] = currentTweet

In [10]:
# Number of distinct tweets left after de-duplication.
numCleanTweets = len(cleanTweets)
print("Length of cleaned tweets: ", numCleanTweets)

Length of cleaned tweets:  38592


## **Building the Search Engine**
    Argument:    terms -- list of query terms
                 docs -- list of documents, to rank, matching the query
                 index -- inverted index data structure
                 idf -- inverted document frequencies
                 tf -- term frequencies
                 titleIndex -- mapping between tweet id and tweet content
    
    Returns:     resultDocs -- list of tweetIDs in decreasing order and its score

### Creating tf-idf dictionary

Implement the inverted index and compute tf, df and idf



    Argument:   cleanTweets -- collection of tweets
                numTweets -- total number of tweets
    
    Returns:    index - the inverted index (implemented through a python dictionary) containing terms as keys and the corresponding 
                list of documents each key appears in (and the positions) as values.
                tf - normalized term frequency for each term in each document
                df - number of documents each term appear in
                idf - inverse document frequency of each term

In [16]:
def create_index_tfidf(cleanTweets, numTweets):
    """Build the inverted index and the tf / df / idf tables for a tweet collection.

    Argument:   cleanTweets -- dict of processed tweets; each value must provide the
                               keys 'tweetID', 'tokens' (list of terms) and 'text'
                numTweets -- total number of tweets, used as N in idf = log(N / df)

    Returns:    index -- term -> list of postings, one per tweet containing the term;
                         a posting is [tweetID, array('I', positions of the term)]
                tf -- term -> list of normalized term frequencies, aligned
                      element-by-element with index[term]
                df -- term -> number of tweets the term appears in
                idf -- term -> inverse document frequency, rounded to 4 decimals
                tweetIndex -- tweetID -> original tweet text
    """
    index = defaultdict(list)
    tf = defaultdict(list)        # term frequencies of terms in tweets
    df = defaultdict(int)         # document frequencies of terms in the corpus
    idf = defaultdict(float)
    tweetIndex = defaultdict(str)

    for line in cleanTweets.values():  # each value is one processed tweet
        tweetID = line['tweetID']
        tweetIndex[tweetID] = line['text']

        # Per-tweet postings: term -> [tweetID, positions of the term in this tweet].
        termdictPage = {}
        for position, term in enumerate(line['tokens']):
            if term in termdictPage:
                # Element 1 of the posting is the positions array; extend it so that
                # repeated terms keep every occurrence.
                termdictPage[term][1].append(position)
            else:
                termdictPage[term] = [tweetID, array('I', [position])]  # 'I' = unsigned int

        # Euclidean norm of the raw term counts; it is the same for all terms of a tweet.
        # A tweet without tokens yields norm 0, but then the loop below has nothing to divide.
        norm = math.sqrt(sum(len(posting[1]) ** 2 for posting in termdictPage.values()))

        for term, posting in termdictPage.items():
            # tf = occurrences of the term in this tweet / norm
            tf[term].append(np.round(len(posting[1]) / norm, 4))
            # One more tweet contains this term.
            df[term] += 1
            # Merge this tweet's posting into the main index (same order as tf[term]).
            index[term].append(posting)

    # Compute idf
    for term in df:
        idf[term] = np.round(np.log(float(numTweets / df[term])), 4)

    return index, tf, df, idf, tweetIndex


In [17]:
# Build the index over the de-duplicated corpus and report the elapsed wall-clock time.
numTweets = len(cleanTweets)
start_time = time.time()
index, tf, df, idf, tweetIndex = create_index_tfidf(cleanTweets, numTweets)
elapsed_seconds = np.round(time.time() - start_time, 2)
print("Total time to create the index: {} seconds".format(elapsed_seconds))

Total time to create the index: 5.95 seconds


### Ranking tweets based on TF-IDFs + Cosine Similarity


In [18]:
def rankDocuments_TFIDF(terms, docs, index, idf, tf, titleIndex):
    """Rank the matching tweets by cosine similarity between tf-idf vectors.

    Only the vector components that correspond to the query terms are built; all
    other components would be multiplied by zero in the dot product anyway.

    Argument:    terms -- list of (preprocessed) query terms
                 docs -- collection of tweet IDs to rank
                 index -- inverted index: term -> list of (tweetID, positions)
                 idf -- term -> inverse document frequency
                 tf -- term -> list of term frequencies aligned with index[term]
                 titleIndex -- not used by this function; kept for interface
                               compatibility

    Returns:     list of (score, tweetID) tuples in decreasing score order. When
                 nothing matches, the user is prompted for a new query and the
                 result of searching for that query is returned instead.
    """
    candidateDocs = set(docs)  # constant-time membership test inside the loops
    # Looking up a missing tweet ID creates an all-zero vector for it on the fly.
    docVectors = defaultdict(lambda: [0] * len(terms))
    queryVector = [0] * len(terms)

    # Frequency of each term in the query,
    # e.g. Counter(["hello", "hello", "world"]) --> Counter({'hello': 2, 'world': 1})
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex is the position of the term in the query
        if term not in index:
            continue

        # tf * idf of the query term (tf normalized as done for the documents)
        queryVector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Fill the same component of every candidate tweet that contains the term.
        for docIndex, (doc, postings) in enumerate(index[term]):
            if doc in candidateDocs:
                docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

    queryVectorNorm = la.norm(queryVector)
    docScores = []
    for doc, curDocVec in docVectors.items():
        denominator = la.norm(curDocVec) * queryVectorNorm
        # A zero vector (e.g. every matched term has idf 0) has no direction; score it
        # 0 instead of dividing 0 by 0, which would yield NaN and break the sort.
        score = np.dot(curDocVec, queryVector) / denominator if denominator else 0.0
        docScores.append((score, doc))
    docScores.sort(reverse=True)

    if len(docScores) == 0:
        print("No results found, try again")
        query = input()
        # Return the retry's ranking rather than discarding it.
        return search_tf_idf(query, index)
    return docScores

### Ranking tweets based on TF-IDFs + Likes + Retweets + Cosine Similarity

In [21]:
def createLikeRTIndex(cleanTweets):
    """Add popularity scores to every tweet record, modifying the records in place.

    Each value of ``cleanTweets`` must have the numeric keys 'likes' and 'rt_count'.
    Two keys are added to it:
        likes_score = 1 - exp(-likes / 50000)
        rt_score    = 1 - exp(-rt_count / 25000)
    Both are 0 for a count of 0 and grow towards 1 as the count increases.

    Argument:  cleanTweets -- dict mapping tweet ID to the tweet's record (a dict)
    Returns:   None
    """
    likesScale = 50000
    retweetsScale = 25000
    for tweetInfo in cleanTweets.values():
        tweetInfo['likes_score'] = 1 - np.exp(-(tweetInfo['likes'] / likesScale))
        tweetInfo['rt_score'] = 1 - np.exp(-(tweetInfo['rt_count'] / retweetsScale))

In [22]:
def rankDocuments_Likes_Retweets(terms, docs, index, idf, tf, titleIndex, tweetsInfo=None):
    """Rank the matching tweets by a blend of tf-idf cosine similarity and popularity.

    score = 0.6 * cosine similarity + 0.2 * likes_score + 0.2 * rt_score

    Argument:    terms -- list of (preprocessed) query terms
                 docs -- collection of tweet IDs to rank
                 index -- inverted index: term -> list of (tweetID, positions)
                 idf -- term -> inverse document frequency
                 tf -- term -> list of term frequencies aligned with index[term]
                 titleIndex -- not used by this function; kept for interface
                               compatibility
                 tweetsInfo -- optional dict tweetID -> record holding the keys
                               'likes_score' and 'rt_score'; defaults to the
                               notebook-level ``cleanTweets`` dictionary

    Returns:     list of (score, tweetID) tuples in decreasing score order. When
                 nothing matches, the user is prompted for a new query and the
                 result of searching for that query (same ranking) is returned.
    Raises:      KeyError if a ranked tweet's record has no 'likes_score' or
                 'rt_score' entry.
    """
    if tweetsInfo is None:
        tweetsInfo = cleanTweets

    # Relative weight of each component in the final score.
    similarityWeight, likesWeight, retweetsWeight = 0.6, 0.2, 0.2

    candidateDocs = set(docs)  # constant-time membership test inside the loops
    # Looking up a missing tweet ID creates an all-zero vector for it on the fly.
    docVectors = defaultdict(lambda: [0] * len(terms))
    queryVector = [0] * len(terms)

    # Frequency of each term in the query and the norm used to normalize it.
    query_terms_count = collections.Counter(terms)
    query_norm = la.norm(list(query_terms_count.values()))

    for termIndex, term in enumerate(terms):  # termIndex is the position of the term in the query
        if term not in index:
            continue

        # tf * idf of the query term (tf normalized as done for the documents)
        queryVector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        # Fill the same component of every candidate tweet that contains the term.
        for docIndex, (doc, postings) in enumerate(index[term]):
            if doc in candidateDocs:
                docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

    queryVectorNorm = la.norm(queryVector)
    docScores = []
    for doc, curDocVec in docVectors.items():
        denominator = la.norm(curDocVec) * queryVectorNorm
        # A zero vector has no direction; use similarity 0 instead of dividing 0 by 0,
        # which would yield NaN and break the sort.
        similarity = np.dot(curDocVec, queryVector) / denominator if denominator else 0.0
        score = (similarity * similarityWeight
                 + tweetsInfo[doc]['likes_score'] * likesWeight
                 + tweetsInfo[doc]['rt_score'] * retweetsWeight)
        docScores.append((score, doc))
    docScores.sort(reverse=True)

    if len(docScores) == 0:
        print("No results found, try again")
        query = input()
        # Return the retry's ranking rather than discarding it, keeping this ranking type.
        return search_tf_idf(query, index, ranking_type='1')
    return docScores

## **Search**

In [23]:
def search_tf_idf(query, index, ranking_type = '0'):
    '''
    Find the tweets that contain any of the query terms and rank them.

    The candidate set is the union, over all query terms, of the tweets listed in
    the inverted index for that term. The ranking functions read the notebook-level
    ``idf``, ``tf`` and ``tweetIndex`` tables.

    Argument:    query -- raw query text; it is preprocessed with cleanTweet()
                 index -- inverted index: term -> list of postings whose first
                          element is the tweet ID
                 ranking_type -- '0' for TF-IDF, '1' for TF-IDF + Likes + Retweets

    Returns:     the list of (score, tweetID) produced by the selected ranking
    Raises:      ValueError if ranking_type is neither '0' nor '1'
    '''
    if ranking_type not in ('0', '1'):
        raise ValueError("ranking_type must be '0' or '1', got {!r}".format(ranking_type))

    query = cleanTweet(query)
    docs = set()
    for term in query:
        # Test membership first: subscripting a defaultdict index with an unknown term
        # would silently insert an empty postings list for it.
        if term in index:
            docs.update(posting[0] for posting in index[term])
    docs = list(docs)

    if ranking_type == '0':  # TF-IDF
        ranked_docs = rankDocuments_TFIDF(query, docs, index, idf, tf, tweetIndex)
    else:  # TF-IDF + Likes + Retweets
        ranked_docs = rankDocuments_Likes_Retweets(query, docs, index, idf, tf, tweetIndex)
    return ranked_docs

In [None]:
# Ask for a query, rank the corpus with plain TF-IDF and show the best matches.
top = 10
print("Insert your query:\n")
query = input()
ranked_docs = search_tf_idf(query, index, ranking_type = '0')

summaryTemplate = "\n======================\nTop {} results out of {} for the seached query:\n"
resultTemplate = "Tweet ID= {}\nTweet: {}\nScore: {}\n"

print(summaryTemplate.format(top, len(ranked_docs)))
for d_score, d_id in ranked_docs[:top]:
    print(resultTemplate.format(d_id, tweetIndex[d_id], round(d_score, 4)))

## Store TOP-20 tweets from the query

In [41]:
# Queries whose best-ranked tweets are exported, one TSV file per query.
list_of_queries = ['GOP wins elections?',
                   'The symbol of the GOP is an elephant.',
                   'Trump says elections have been corrupted',
                   'Biden takes the action.',
                   'fraud republican election',
                   'Kamala Harris will be the first woman Vice President of the US.',
                   'Biden will fight climate change',
                   'Can I see elections on netflix?',
                   'thanks realdonaldtrump',
                   "Trump's attempt to steal the election unravels as coronavirus cases surge"]
top = 20

# Make sure the destination folder exists before writing into it.
output_dir = data_path + 'output/'
os.makedirs(output_dir, exist_ok=True)

# Iterate over the list itself (not a hard-coded count) and keep the query number and
# the result rank in separate variables.
for query_number, query in enumerate(list_of_queries):
    ranked_docs = search_tf_idf(query, index, ranking_type = '0')

    # newline='' lets the csv module control line endings itself; UTF-8 keeps
    # non-ASCII tweet text writable whatever the platform default encoding is.
    out_path = output_dir + 'Query_' + str(query_number) + '.tsv'
    with open(out_path, 'wt', newline='', encoding='utf-8') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        tsv_writer.writerow(['Ranking:', 'Tweet:', 'Score:'])
        # Ranking positions are written starting from 0.
        for rank, (d_score, d_id) in enumerate(ranked_docs[:top]):
            tsv_writer.writerow([rank, tweetIndex[d_id], round(d_score, 4)])