# Information Retrieval and Web Analytics: Indexing + Modeling (TF-IDF) 

In [1]:
# if you do not have nltk the following command should work "python -m pip install nltk" 
import nltk
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to C:\Users\David
[nltk_data]     Gayete\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import time
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import re
import csv
import sys

from gensim.summarization.bm25 import BM25
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import preprocess_string
import nltk
nltk.download('punkt') # used in sent_tokenize

data_path = './data/one100K_v2.json'

[nltk_data] Downloading package punkt to C:\Users\David
[nltk_data]     Gayete\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load data into memory

As mentioned above the dataset is stored in a tsv file ```parsed_input_500.tsv```and it contains 500 Wikipedia article (one article per line). For each article we have id, title and body separated by "|".

In [3]:
docs_path = data_path
with open(docs_path) as fp:
    lines = fp.readlines()
tweets = [l.strip().replace(' +', ' ') for l in lines]

In [4]:
print("Total numer of tweets in the corpus: {}" .format(len(tweets)))

Total numer of tweets in the corpus: 100001


In [5]:
def cleanTweet(tweetText):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    
    Argument:
    line -- string (text) to be preprocessed
    
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
        
    stemming = PorterStemmer()
    stops = set(stopwords.words("english"))
    ## START CODE
    line= tweetText.lower() ## Transform in lowercase
    line=re.sub('[:\[\]&%$\"\'!./,;:?=¿^\-#_*+)<>(¡@]','',line)
    line= line.split() ## Tokenize the text to get a list of terms
    line= [word for word in line if word not in stops]  ##eliminate the stopwords (HINT: use List Comprehension)
    line= [stemming.stem(word) for word in line] ## perform stemming (HINT: use List Comprehension)
    ## END CODE
    return line
    

In [6]:
def getRelevantInfo(tweet):
    dictRelevantInfo ={}
    data = json.loads(tweet)
    hashtags = []
    urlsList = []
    text = ''
    date = data['created_at'] ## ??? RT o no RT
    try:
        isRt=True
        isRetweet=data["retweeted_status"]
        idTweet=isRetweet["id_str"]
        text = isRetweet['text']
        username = isRetweet['user']['screen_name']
        urls = isRetweet['entities']['urls']
        rt_count = isRetweet['retweet_count']
        likes = isRetweet['favorite_count']
        
        for h in isRetweet['entities']['hashtags']:
            hashtags.append(h['text'])
        for url in urls:
            urlsList.append(url['url'])
            
    except:
        isRt=False
        idTweet=data["id_str"]
        text = data['text']
        username = data['user']['screen_name']
        urls = data['entities']['urls']
        rt_count=data['retweet_count']
        likes = data['favorite_count']
        
        for h in data['entities']['hashtags']:
            hashtags.append(h['text'])
            
        for url in urls:
            urlsList.append(url['url'])        
            
    dictRelevantInfo['tweetID'] = idTweet
    dictRelevantInfo['text'] = text
    dictRelevantInfo['tokens'] = cleanTweet(text)
    dictRelevantInfo['username'] = username
    dictRelevantInfo['date'] = date
    dictRelevantInfo['hashtags'] = hashtags
    dictRelevantInfo['likes'] = likes
    dictRelevantInfo['rt_count'] = rt_count
    dictRelevantInfo['urlsList'] = urlsList
    dictRelevantInfo['isRetweeted'] = isRt
    return dictRelevantInfo

In [7]:
cleanTweets = {}
for t in tweets:
    currentTweet=getRelevantInfo(t)
    tweetID=currentTweet['tweetID']
    isRtCurrent=currentTweet['isRetweeted']
    # Orignial tweet found, overwrite if retweet exist.
    if isRtCurrent == False:
        cleanTweets[tweetID] = currentTweet
    else:
        if tweetID in cleanTweets:
            continue
        else:
            cleanTweets[tweetID] = currentTweet

In [8]:
print("Length of cleaned tweets: ",len(cleanTweets))

Length of cleaned tweets:  38592


In [9]:
def create_index(lines):
    """
    Impleent the inverted index
    
    Argument:
    lines -- collection of Wikipedia articles
    
    Returns:
    index - the inverted index (implemented through a python dictionary) containing terms as keys and the corresponding 
    list of document these keys appears in (and the positions) as values.
    """
    index=defaultdict(list) 
    tweetIndex = {} # dictionary to map tweets to page ids
    for line in lines.values(): # Remember, lines contain all tweets, each line is a tweet
        tweetID = line['tweetID']
        terms = line['tokens'] #page_title + page_text
        tweetIndex[tweetID]=line['text']  ## we do not need to apply get terms to title because it used only to print titles and not in the index
        
        ## ===============================================================        
        ## create the index for the current doc and store it in termdictPage
        ## termdictPage ==> { ‘term1’: [currentdoc, [list of positions]], ...,‘termn’: [currentdoc, [list of positions]]}
        
        ## Example: if the curr_doc has id 1 and his text is 
        ## "web retrieval information retrieval":
        
        ## termdictPage ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}
        
        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================
        
        termdictPage={}

        for position, term in enumerate(terms): # terms contains page_title + page_text. Loop over all terms
            try:
                # if the term is already in the index for the current page (termdictPage)
                # append the position to the corrisponding list
                
        ## START CODE
                termdictPage[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                termdictPage[term]=[tweetID, array('I',[position])] #'I' indicates unsigned int (int in python)
            
        #merge the current page index with the main index
        for termpage, postingpage in termdictPage.items():
            index[termpage].append(postingpage)
        
        ## END CODE                    
                    
    return index, tweetIndex

In [10]:
start_time = time.time()

# index is the dict of , index by twee ?¿?¿
# tweetIndex is the dict of tweets, indexed by tweetID
index, tweetIndex = create_index(cleanTweets)

print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time,2)))

Total time to create the index: 0.64 seconds


Notice that if you look in the index for ```researcher```you will not find any result, while if you look for ```research``` you will get some results. That happens because we are storing in the index stemmed terms.

In [11]:
#print("Index results for the term 'biden': {}\n".format(index['biden']))
print("Length of Index results for the term 'biden': {}\n".format(len(index['obama'])))
print("First 10 Index results for the term 'biden': \n{}".format(index['biden'][:10]))

Length of Index results for the term 'biden': 299

First 10 Index results for the term 'biden': 
[['1334882742694273027', array('I', [0])], ['1334885914921762816', array('I', [3])], ['1334575978933260297', array('I', [0])], ['1334621442751799297', array('I', [1, 10])], ['1334878351970938883', array('I', [9])], ['1334851688528801793', array('I', [5])], ['1334651632110489601', array('I', [8])], ['1334700028129112065', array('I', [1])], ['1334885916800921604', array('I', [5])], ['1334882285955538947', array('I', [6])]]


In [12]:
def getResultsFromIndex(word, index):
    stemming = PorterStemmer()
    newWord=word.lower() ## Transform in lowercase
    newWord=re.sub('[:\[\]&%$\"\'!./,;:?=¿^\-#_*+)<>(¡@]','',word)
    newWord= stemming.stem(word) ## perform stemming
    print("Length of Index results for the term '{}': {}".format(newWord, len(index[newWord])))
    print("First 10 Index results for the term '{}': \n{}\n\n".format(newWord, index[newWord][:10]))

In [13]:
getResultsFromIndex("trump", index)
getResultsFromIndex("biden", index)
getResultsFromIndex("joe", index)

Length of Index results for the term 'trump': 13747
First 10 Index results for the term 'trump': 
[['1334885914112364552', array('I', [4])], ['1334870866522296322', array('I', [7])], ['1334885914724737027', array('I', [8])], ['1334885915005771776', array('I', [4, 7])], ['1334621442751799297', array('I', [13])], ['1334861024076423168', array('I', [2])], ['1334885915441963010', array('I', [1])], ['1334641634408017920', array('I', [1])], ['1334623531477381122', array('I', [1])], ['1334885916033261569', array('I', [7])]]


Length of Index results for the term 'biden': 6343
First 10 Index results for the term 'biden': 
[['1334882742694273027', array('I', [0])], ['1334885914921762816', array('I', [3])], ['1334575978933260297', array('I', [0])], ['1334621442751799297', array('I', [1, 10])], ['1334878351970938883', array('I', [9])], ['1334851688528801793', array('I', [5])], ['1334651632110489601', array('I', [8])], ['1334700028129112065', array('I', [1])], ['1334885916800921604', array('I', [5

### Querying the Index

Even if before we mentioned that in case of phrase queries we need to take into account the position of the terms in the document and we have implemented an index that would allow us to also work with this type of queries, here you are going to implement a search function that will query the index without take into account the trems' positions.


We will use english Free Text Queries, that means that the query we will query the index using  a sequence of english words as query, and the output will be the list of documents that contain any of the query terms. 

For instance if we write the query **"computer science"** the output will be the union of all documents containing the term "computer" with all documents containing the term "science".

In [14]:
def search(query, index):
    '''
    The output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    '''
    query=cleanTweet(query)
    tweets=set()
    for term in query:
    ## START DODE
        try:
            # store in termTweets the ids of the tweets that contain "term"                        
            termTweets=[posting[0] for posting in index[term]]
            # tweets = tweets Union termTweets
            tweets = tweets.union(termTweets)
        except:
            #term is not in index
            pass
    tweets=list(tweets)
    return tweets

In [15]:
print("Insert your query:\n")
query = input()
docs = search(query, index)    
top = 10

print("\n======================\nSample of {} results out of {} for the seached query:\n".format(top, len(docs)))
for d_id in docs[:top] :
    print("tweet_id= {}\ntweet_text: {}\n\n\n".format(d_id, tweetIndex[d_id]))

Insert your query:

biden trump

Sample of 10 results out of 84619 for the seached query:

tweet_id= 1332013866918813697
tweet_text: Biden is doing it❗️don’t worry it’s all going to be over soon 😜



tweet_id= 1332033113862545408
tweet_text: Rumor has it that Donald Trump has a new reality TV show starting production at the end of January. It will be call… https://t.co/FvPfVbSdF3



tweet_id= 1330444762726486027
tweet_text: @gaijingirl2004 @truthout He's obviously confused Trump w/ his ol' warmonger pal GeeDubya.



tweet_id= 1332101134568333316
tweet_text: Tulsi Gabbard Urges Donald Trump to Pardon Edward Snowden and Julian Assange https://t.co/Yi5wpVlc4x https://t.co/WRE8Nm05J7



tweet_id= 1332024801318211586
tweet_text: @mitchellscomet @coldlawgic I voted for Biden despite his milquetoast-ass policies, because I calculated neoliberal… https://t.co/BXwtSe0vOE



tweet_id= 1332007772418940936
tweet_text: @DanScavino And Joe Biden almost returns the peasantry https://t.co/4czeV8NjSp



### Ranking tf-idf

When searching in a search engine, we are interested in obtain the results sorted by relevance or by some other criteria. Notice that **the above results are not ranked**.

Here you are going to implement **tf-idf (Term Frequency — Inverse Document Frequency)** and use it to obtain a list of ordered results.

Tf-idf is a weighting scheme that assigns each term in a document a weight based on its term frequency (tf) and inverse document frequency (idf).  The higher the scores, more important the term is. 

##### TF
**tf** refers to the frequency of a term $t$ in a specific document $d$. The basic idea is that as a term appears more in the document it becomes more important. On the other side, if we only use pure term counts, longer documents will be favored more. Consider two documents with exactly the same content but one being twice longer by concatenating with itself.  The tf weights of each word in the longer document will be twice the shorter one, although they essentially have the same content. To deal with this issue we need to **normalize the term frequencies**.

$$tf_{t,d} = \dfrac{N_{t,d}}{||D||}\tag{1}$$



where ||D|| is the Euclidean norm. 


Let $D=[t_1, t_2, \dots, t_n]$ be the document vector where $t_i$ represent the frequency of the term $i$, the  Euclidean Norm is calculated as

$$\sqrt{\sum_{t=1}^{n}t_i{^2}}\tag{2}$$

Note that $||D||$ is the same for all terms of a document.


##### IDF
A drawback of tf is that it considers all terms equally important. However, less common terms are more discriminative than others. To deal with this issue we introduce **idf (inverse document frequency)** that takes into account the number of documents containing the term.

$$idf_t = log\dfrac{N}{df_t}\tag{3}$$

where:

- $N$ is the total number of documents;
- $df_t$ is the number of ocuments containing the term $t$.

The log operation is applied to avoid that terms that appears in a high number of documents are considered to be too much less important, in this way we are smoothing (dampening) this difference.


In [15]:
def create_index_tfidf(lines, numDocuments):
    """
    Implement the inverted index and compute tf, df and idf
    
    Argument:
    lines -- collection of Wikipedia articles
    numDocuments -- total number of documents
    
    Returns:
    index - the inverted index (implemented through a python dictionary) containing terms as keys and the corresponding 
    list of document these keys appears in (and the positions) as values.
    tf - normalized term frequency for each term in each document
    df - number of documents each term appear in
    idf - inverse document frequency of each term
    """
        
    index=defaultdict(list)
    tf=defaultdict(list) #term frequencies of terms in documents (documents in the same order as in the main index)
    df=defaultdict(int)         #document frequencies of terms in the corpus
    titleIndex=defaultdict(str)
    idf=defaultdict(float)
    
    for line in lines.values(): # Remember, lines contain all tweets, each line is a tweet
        #tweetID = line['tweetID']
        tweetID = line['tweetID']        
        terms = line['tokens']
        tweetIndex[tweetID]=line['text']           
        
        ## ===============================================================        
        ## create the index for the **current page** and store it in termdictPage
        ## termdictPage ==> { ‘term1’: [currentdoc, [list of positions]], ...,‘termn’: [currentdoc, [list of positions]]}
        
        ## Example: if the curr_doc has id 1 and his text is 
        ## "web retrieval information retrieval":
        
        ## termdictPage ==> { ‘web’: [1, [0]], ‘retrieval’: [1, [1,4]], ‘information’: [1, [2]]}
        
        ## the term ‘web’ appears in document 1 in positions 0, 
        ## the term ‘retrieval’ appears in document 1 in positions 1 and 4
        ## ===============================================================

        termdictPage={}

        for position, term in enumerate(terms): ## terms contains page_title + page_text
            try:
                # if the term is already in the dict append the position to the corrisponding list
                termdictPage[term][tweetID].append(position) 
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                termdictPage[term]=[tweetID, array('I',[position])] #'I' indicates unsigned int (int in python)
        
        #normalize term frequencies
        # Compute the denominator to normalize term frequencies (formula 2 above)
        # norm is the same for all terms of a document.
        norm=0
        for term, posting in termdictPage.items(): 
            # posting is a list containing doc_id and the list of positions for current term in current document: 
            # posting ==> [currentdoc, [list of positions]] 
            # you can use it to inferr the frequency of current term.
            norm+=len(posting[1])**2
        norm=math.sqrt(norm)


        #calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in termdictPage.items():     
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm,4))  ## SEE formula (1) above
            #increment the document frequency of current term (number of documents containing the current term)
            df[term]= len(termdictPage[term])  # increment df for current term
        
        #merge the current page index with the main index
        for termpage, postingpage in termdictPage.items():
            index[termpage].append(postingpage)
            
        # Compute idf following the formula (3) above. HINT: use np.log
    for term in df:
        idf[term] = np.round(np.log(float(numDocuments/df[term])),4)
            
    return index, tf, df, idf, tweetIndex


In [16]:
def createLikeRTIndex(lines, numDocuments):
    maxLikes = 0
    maxRT = 1
    for tweet in lines:
        if lines[tweet]['likes'] > maxLikes:
            maxLikes = lines[tweet]['likes']
        if lines[tweet]['rt_count'] > maxRT:
            maxRT = lines[tweet]['rt_count']
    for tweet in lines:
        lines[tweet]['likes_score'] = (-np.exp(-(lines[tweet]['likes']/50000))+1)
        lines[tweet]['rt_score'] = (-np.exp(-(lines[tweet]['rt_count']/25000))+1)

In [17]:
start_time = time.time()
numTweets = len(cleanTweets)
index, tf, df, idf, tweetIndex = create_index_tfidf(cleanTweets, numTweets)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time,2)))

Total time to create the index: 3.98 seconds


In [18]:
createLikeRTIndex(cleanTweets, numTweets)

#print(cleanTweets['1323630540977852416']['likes'])
#print(cleanTweets['1323630540977852416']['likes_score'])

In [19]:
def saveDictToCSV(d, filename):
    w = csv.writer(open(filename, "w"))
    for key, val in d.items():
        w.writerow([key, val])
        
def loadDictFromCSV(filename):
    tmpDict={}
    with open(filename,'r') as data: 
        for line in csv.reader(data): 
            tmpDict[line[0]]=line[1]
    return tmpDict

In [20]:
def rankDocuments(terms, docs, index, idf, tf, titleIndex):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    titleIndex -- mapping between page id and page title
    
    Returns:
    Print the list of ranked documents
    """
        
    # I'm interested only on the element of the docVector corresponding to the query terms 
    # The remaing elements would became 0 when multiplied to the queryVector
    docVectors=defaultdict(lambda: [0]*len(terms)) # I call docVectors[k] for a nonexistent key k, the key-value pair (k,[0]*len(terms)) will be automatically added to the dictionary 
    queryVector=[0]*len(terms)    

    # compute the norm for the query tf
    query_terms_count = collections.Counter(terms) # get the frequency of each term in the query. 
    # Example: collections.Counter(["hello","hello","world"]) --> Counter({'hello': 2, 'world': 1})
    # HINT: use when computing tf for queryVector
    
    query_norm = la.norm(list(query_terms_count.values()))
    
    for termIndex, term in enumerate(terms): #termIndex is the index of the term in the query
        if term not in index:
            continue
                    
        ## Compute tf*idf(normalize tf as done with documents)
        queryVector[termIndex]=query_terms_count[term]/query_norm * idf[term]

        # Generate docVectors for matching docs
        for docIndex, (doc, postings) in enumerate(index[term]):
            # Example of [docIndex, (doc, postings)]
            # 0 (26, array('I', [1, 4, 12, 15, 22, 28, 32, 43, 51, 68, 333, 337]))
            # 1 (33, array('I', [26, 33, 57, 71, 87, 104, 109]))
            # term is in doc 26 in positions 1,4, .....
            # term is in doc 33 in positions 26,33, .....
            
            #tf[term][0] will contain the tf of the term "term" in the doc 26            
            if doc in docs:
                docVectors[doc][termIndex]=tf[term][docIndex] * idf[term]  # TODO: check if multiply for idf

    # calculate the score of each tweet
    # compute the cosine similarity between queyVector and each docVector:
    # HINT: you can use the dot product because in case of normalized vectors it corresponds to the cosine siilarity
    # see np.dot
    
    docScores=[ [(np.dot(curDocVec, queryVector)/(np.linalg.norm(curDocVec)*np.linalg.norm(queryVector)))*0.6+cleanTweets[doc]['likes_score']*0.2+cleanTweets[doc]['rt_score']*0.2, doc] for doc, curDocVec in docVectors.items() ]
    docScores.sort(reverse=True)
    resultDocs=[(x[0], x[1]) for x in docScores]
    #print document titles instead if document id's
    #resultDocs=[ titleIndex[x] for x in resultDocs ]
    if len(resultDocs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)    
    #print ('\n'.join(resultDocs), '\n')
    return resultDocs

In [21]:
def search_tf_idf(query, index):
    '''
    output is the list of documents that contain any of the query terms. 
    So, we will get the list of documents for each query term, and take the union of them.
    '''
    query=cleanTweet(query)
    docs=set()
    for term in query:
        try:
            # store in termDocs the ids of the docs that contain "term"                        
            termDocs=[posting[0] for posting in index[term]]
            
            # docs = docs Union termDocs
            docs = docs.union(termDocs)
        except:
            #term is not in index
            pass
    docs=list(docs)
    ranked_docs = rankDocuments(query, docs, index, idf, tf, tweetIndex)   
    return ranked_docs

In [26]:
print("Insert your query:\n")
query = input()
ranked_docs = search_tf_idf(query, index)    
top = 20

print("\n======================\nTop {} results out of {} for the seached query:\n".format(top, len(ranked_docs)))
for d_score, d_id in ranked_docs[:top] :
    print("Tweet_id= {}\nTweet: {}\nLikes: {}\nRT: {}\nScore: {}\n\n".format(d_id, tweetIndex[d_id], cleanTweets[d_id]['likes'], cleanTweets[d_id]['rt_count'], d_score))

Insert your query:

biden

Top 20 results out of 28070 for the seached query:

Tweet_id= 932685522820042754
Tweet: ME:  Joe, about halfway through the speech, I’m gonna wish you a happy birth--
BIDEN:  IT’S MY BIRTHDAY!
ME:  Joe.… https://t.co/5qLUsDoaMi
Likes: 1851601
RT: 441623
Score: 0.9999999957415385


Tweet_id= 1307491919384260609
Tweet: I’m Joe Biden and I approve this message. https://t.co/TuRZXPE5xK
Likes: 972457
RT: 269939
Score: 0.9999959094170903


Tweet_id= 1325121452719235073
Tweet: Congratulations to President-elect Joe Biden &amp; Vice President-elect Kamala Harris!
Likes: 1132290
RT: 120952
Score: 0.9984155499349555


Tweet_id= 1324846580147642369
Tweet: Joe Biden should not wrongfully claim the office of the President. I could make that claim also. Legal proceedings are just now beginning!
Likes: 673411
RT: 115592
Score: 0.9980363960733912


Tweet_id= 1329233502139715586
Tweet: Look at this in Wisconsin! A day AFTER the election, Biden receives a dump of 143,379 votes

In [22]:
#print(tf)
#print(dict(sorted(tf.items(), reverse=True, key=lambda item: item[1])))
print(dict(sorted(idf.items(), reverse=True, key=lambda item: item[1])))
# LOCURA (-e^-(x/50000)+1)



In [23]:
def get_tokens(docs, preprocess=preprocess_string, verbose=10000):
    
    for i, doc in enumerate(docs):
        yield preprocess(doc) # preprocess
        
        # print progress if needed
        if verbose > 0 and (i + 1) % verbose == 0:
            print(f"Progress: {i + 1}")
            
def get_sentences(docs, verbose=10000):
    #loop over all docs (tweets in our case)
    for i, doc in enumerate(docs):
        
        # use nltk.sent_tokenize to split paragraphs into sentences
        for sentence in nltk.sent_tokenize(doc):
            # preprocess each sentence using gensim (return string not list)
            yield " ".join(preprocess_string(sentence))
            
        # print progress if needed
        if verbose > 0 and (i + 1) % verbose == 0:
            print(f"Progress: {i + 1}")


In [24]:
# Get all tweet TEXTS
tweets, tweetsID = [], []
for t in cleanTweets:
    tweets.append(cleanTweets[t]['text']) #add tweet to list of all tweets
    tweetsID.append(cleanTweets[t]['tweetID']) #add tweet id to list of all tweet ids
    
tokens = []
for t in cleanTweets:
    tokens.append(cleanTweets[t]['tokens'])
    
sentences = list(get_sentences(tweets))
print(sentences)

Progress: 10000
Progress: 20000
Progress: 30000


In [25]:
#split each sentence into a list od words
words = [s.split() for s in sentences ]
for x in words:
    if "netflix" in x:
        print(x)
    

['ibm', 'netflix', 'eli', 'lilli', 'gannett', 'amazon', 'chevron', 'steel', 'whirlpool', 'goodyear', 'salesforc', 'halliburton…', 'http', 'tsqpaftxb']
['hous', 'approv', 'ban', 'big', 'cat', 'ownership', 'netflix', 'tiger', 'king', 'http', 'vxwcpfytt', 'gop', 'gopleader…', 'http', 'ntjwnizj']
['turn', 'netflix', 'nfl', 'fox', 'new', 'tell', 'joe', 'biden', 'presid', 'elect']
['rememb', 'netflix', 'johnni', 'depp', 'plai', 'trump', 'art', 'deal', 'movi', 'vanish']
['watch', 'dirti', 'monei', 'netflix']
['hit', 'netflix', 'seri', 'haha']
['english', 'quran', 'covid', 'coronaviru', 'trump', 'trump', 'biden', 'joebiden', 'usa', 'nyc', 'nevada', 'netflix', 'election…', 'http', 'tlgndgzv']


In [26]:
#create a word2Vec model  
MODEL_DIMENSION = 200
w2v_model = Word2Vec(sentences = words, size=MODEL_DIMENSION, window=10, min_count=1, negative=15, sg = 1)

In [27]:
query = 'biden'
w2v_model.most_similar(query)

  


[('sleepi', 0.8119498491287231),
 ('vice', 0.8066702485084534),
 ('wontcavedav', 0.8023542165756226),
 ('harris’', 0.8003657460212708),
 ('certain', 0.7995415925979614),
 ('greater', 0.7971093654632568),
 ('child', 0.7961598038673401),
 ('we’v', 0.7957644462585449),
 ('product', 0.7955340147018433),
 ('for…', 0.7912787795066833)]

In [28]:
def expand_query(query, wv, topn=10):
    
    query = preprocess_string(query)
    expanded_query = [t for t in query] # initialize with original query. Note, it is a list
    
    # extend each single term of the original query and append to expanded query
    for t in query:
        expanded_query.extend(s for s, f in wv.most_similar(t, topn=topn))
        
    return expanded_query

In [29]:
def get_top_n(bm25, query, n=100):
    #apply preprocessing to the query using get_tokens and tranform it from string to list of terms
    query = query.split() # cast query from string to list
    query = list(get_tokens(query)) # apply preprocessing
    
    query = [item for sublist in query for item in sublist] # transform list of list to list
    
    # score docs using a specific function of bm25        
    scores = np.array(bm25.get_scores(query))
    #n=len(scores)
        
    # get indices of top N scores
    idx = np.argpartition(scores, -n)[-n:] # INDEX DELS TOP 20
    
    # sort top N scores and return their indices
    # if all the scores are 0 return empty list
    if np.sum(scores[idx]) == 0: 
        return[] 
    return idx[np.argsort(-scores[idx])], scores[idx[np.argsort(-scores[idx])]]

In [74]:
# Play with some queries - insert also query with typos
bm25 = BM25(tokens) # constructing a paragraph search index
expanded_query = ''
query = input()
try:
    expanded_query = ' '.join(expand_query(query, w2v_model))
except Exception as e:
    print("asdasd")
    print(e)

top_idx = None
print('Original query: {}'.format(query))
print('Expanded query: {}'.format(expanded_query))
print('---')
try:
    indexTmp, score = get_top_n(bm25, expanded_query, n=100)
    data = {'id':  [], 'text': [], 'scoreW2V': [], 'score2': []}
    df = pd.DataFrame (data, columns = ['id','text', 'scoreW2V', 'score2', 'totalScore'])
    print("estamoss1")
    for i in zip(indexTmp,score):
        
        idx=i[0]
        score=i[1]
        df = df.append({'id': tweetsID[idx], 'text': tweets[idx], 'scoreW2V': score}, ignore_index=True)
    print("estamoss2")    
    ranked_docs = search_tf_idf(query, index) 
    print("estamoss3")
    for score, idx in ranked_docs:
        df['score2'][df['id'] == idx] = score  
    print("estamoss4")    
    df = df.fillna(0)
    df['totalScore'] = df['scoreW2V'] + df['score2']
    df = df.sort_values(by=['totalScore'], ascending=False)
    dfList=df.values.tolist()
    for x in dfList[:20]:
        print('\nTweet ID: {}'.format(x[0]))
        print('\nTweet: {}'.format(x[1]))
        print('\nScore: {}\n'.format(x[4]))
        print('*****************************')
except:
    print("No matching documents found")

biden


  


Original query: biden
Expanded query: biden sit for… sleepi certain vice husband creepi loud harris’ greater
---
estamoss1
estamoss2
estamoss3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


estamoss4

Tweet ID: 1334891637466652676

Tweet: @realDonaldTrump America will be greater on Jan 20th with President Biden and Vice President Harris.

Score: 17.3595018429347

*****************************

Tweet ID: 1334684201203822594

Tweet: In an interview with @Jaketapper, Vice President-elect Kamala Harris says that her husband @DouglasEmhoff will be c… https://t.co/lGkBy7AgF3

Score: 14.006788823264747

*****************************

Tweet ID: 1334605856156110849

Tweet: Vice President-elect Kamala Harris’s top aides are almost all women of color. The appointments are part of the camp… https://t.co/jO8PUvGBws

Score: 13.672222950875057

*****************************

Tweet ID: 1334886303796588547

Tweet: sleepy like biden 😴

Score: 12.944947057939096

*****************************

Tweet ID: 1334890214934843392

Tweet: Vice president-elect Kamala Harris' husband, Doug Emhoff, will be the first man to serve as the spouse of the Vice… https://t.co/qWO1tLgPqL

Score: 12.90201814084

In [30]:
def emb_vector(tweet, model=w2v_model):
    query_vec=[]
    i=0
    ## AQUI TENIEN EL TWEET EN TOKENS
    for word in tweet:
        if word in w2v_model.wv.vocab:
            query_vec.append(w2v_model.wv.word_vec(word))
            i+=1

    if i!=0:
        vec=np.average(np.array(query_vec), axis=0)
        return vec/np.linalg.norm(vec)

    #print(tweet)
    return np.zeros((MODEL_DIMENSION,))

In [33]:
embedded_tweets=[]
[embedded_tweets.append((emb_vector(cleanTweets[tweetID]['tokens']),tweetID)) for tweetID in cleanTweets]

print(embedded_tweets)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
from sklearn.manifold import TSNE

fin=[vec[0] for vec in embedded_tweets]
print(len(fin))
print(len(cleanTweets))

low_dimension_embedded = TSNE(n_components=2, perplexity=2, random_state=33).fit_transform(fin)

38592
38592


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from matplotlib.pyplot import cm
NUM_CLUSTERS=3

colors_list=cm.rainbow(np.linspace(0,1,NUM_CLUSTERS))

fin=[vec[0] for vec in embedded_tweets]
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(fin)
print(kmeans.labels_)


plt.figure(figsize=(10,10))
plt.title('T-SNE Representation of Word2Vec')

for i in range(0,NUM_CLUSTERS):
    
    sns.scatterplot(x=low_dimension_embedded[kmeans.labels_==i,0],y=low_dimension_embedded[kmeans.labels_==i,1],color=colors_list[i][0:3],label="cluster "+str(i))


#sns.scatterplot(x=low_dimension_embedded[:,0], y=low_dimension_embedded[:,1], legend='full',  c=kmeans.labels_)

#sns.scatterplot(X_embedded[:,0], X_embedded[:,1], hue=y, legend='full', palette=palette)

plt.legend()
plt.show()

np.save('tsne_output_acabat.npy', low_dimension_embedded)
#low_dim_embedded=np.load('tsne_output_acabat.npy')











In [31]:
low_dimension_embedded=np.load('tsne_output_acabat.npy')

In [None]:

for i,tweetID in enumerate(cleanTweets):
    
    cleanTweets[tweetID]["cluster"]=kmeans.labels_[i]
    



#print(embedded_tweets)

In [37]:
print(len(kmeans.labels_))
print(len(cleanTweets))

38592
38592


In [53]:
print(type(kmeans))

<class 'sklearn.cluster._kmeans.KMeans'>


In [54]:
print(low_dimension_embedded)

[[-71.03327    58.988716 ]
 [ 18.181177   21.993793 ]
 [  5.57139    -6.8656173]
 ...
 [-28.33046   -27.669258 ]
 [ 26.001335   -2.5038085]
 [ 76.628     -57.878666 ]]


In [62]:
for i,tweetID in enumerate(cleanTweets):
    if cleanTweets[tweetID]["cluster"]==1:
        print(cleanTweets[tweetID]["text"])

FUCK OFF.
@JennaEllisEsq 
Here's my take honey:
Trump DID NOT win Tx, Fla or Ohio. Those elections were rigged by Cruz, Corny… https://t.co/TpfdZIYMYv
It’s Friday, December 4, 2020 and Donald J. Trump is still your President. Get used to it for FOUR MORE YEARS!
@realDonaldTrump We r by electing Biden. We r done with ur lying racist ass. And let’s not forget u r a rapist as w… https://t.co/6OiDaLhy5Q
@ENBrown @WalshFreedom just because she agrees with Donald Trump on somethings? I agree with Trump on a few things,… https://t.co/5bIYgH3kZ7
Romney blasts Trump's lack of leadership during pandemic: "It's a great human tragedy" https://t.co/5F1aOgE4tE https://t.co/w89ga987uB
Not a word from trump on this. Just tweets about his selfish self.
@realDonaldTrump so much for your “Dominion” argument.
“Why aren’t more Republicans standing up and speaking out about Donald Trump’s reprehensible acts?,” Carpenter writ… https://t.co/02BydWPGXb
Parler is a hub for the grifters and propagandists who fee

@realDonaldTrump Yep and Biden will have to do so
@Jordan_Morrow12 @BU2BaRUSHfan @thecjpearson @realDonaldTrump Again I know it’s hard for people like you to underst… https://t.co/NvvhJJKvvF
@realDonaldTrump Trump won't be able to rig this election in his favor... Trump lost.
@Dom_subInfo1 @delgado2485 @nobodysghost @GCharles_ @FoxNews I was born and raised here, and I agree. Trump deserve… https://t.co/0Y5GieJAmX
@DonaldJTrumpJr Donald Trump Jr &amp; his corrupt old dad stole from cancer kids. #TrumpCrimeFamily https://t.co/R4s22WDIm1
FURIOUS Trump Considers Firing William Barr https://t.co/iDTQAo7MPI via @YouTube
@mattgaetz Trump:  Impeached &amp; Defeated!
Bethany’s trump loving fox news watching republican father started talking to me about voter fraud and how trump wo… https://t.co/uABBV0jwHH
Let's see if they have the cajones https://t.co/auXp9DYSE9
@RallyPoint7 @wjxt4 Trump lost.
@GovRonDeSantis @NASPCOLA Another attack under the Trump watch? Impossible this is fake news. Trump 

@politicalwire @rickhasen It’s criminal. Let’s call it what it is and let’s remind everyone that scumbag Trump has… https://t.co/3n0j2J9w9p
@FenechEd @saneman3000 @robert_kroupa @JBills5 @BNebraskan @DanCrenshawTX @michellemalkin @GOP @C_C_Krebs The Revol… https://t.co/joBQ15lLLJ
@happinasa @needyjuliette @realDonaldTrump i love seeing TRUMP lose, it’s my daily medicine, my weekly energy, my m… https://t.co/apoqwVnp7L
@sendavidperdue if you do NOT Stand up for @realDonaldTrump then we the people of GA are not going to vote!  If you… https://t.co/kgo6bqRP2G
@JBryanEure @JoeDanMedia @GOP @realDonaldTrump If you're an incumbent GOP office holder - You better figure out how… https://t.co/YYcLZnKW35
@realcooldadxvx Dude, their address is an apartment in Trump Palace in Miami. https://t.co/eb08NNNeXY
defending fraud because you hate trump
If I see any of you buying and wearing this shit, you’re getting bullied. The Election is over. This shit here is s… https://t.co/Aeu79jSn1z
The Soon to be

@jimmyofflagstaf @realDonaldTrump Biden will never be president. GITMO has a nice 8×12 waiting for him. https://t.co/Fq9hYCb4Fw
What I Learned From My Brush With Trump https://t.co/a59GMEgzfs
@cjtruth @TrumpWarRoom I guess they underrated Trump's dossier on the fraud and the fact that majority of "The Peop… https://t.co/w7LJa5VsRW
@realDonaldTrump We can't loose the Senate. It would not end well regardless if President @realDonaldTrump is right… https://t.co/KVpM7RHKQL
@benshapiro @DanCrenshawTX .@benshapiro is another Deep State pizza lover turning on Trump.
@anmas71 @IvankaTrump .
.
For the first time in history, Americans will receive their best Christmas present and ce… https://t.co/1qpgUB6tPX
CNN is now publishing articles threatening the Trump kids. ⁦@KateBennett_DC⁩ is a sicko. https://t.co/tz3u3Qoo2z
No. Stop it.
@RightWingWatch ...$15,000 was spent to buy discreet I.P. addresses through which votes for Mr. Trump could be cast… https://t.co/rnr3CZ0Igl
Go get 'em Rudy &amp; frie

In [86]:
print("Insert your query:\n")
query = input()
ranked_docs = search_tf_idf(query, index)    
top = 20



print("\n======================\nTop {} results out of {} for the seached query:\n".format(top, len(ranked_docs)))
for d_score, d_id in ranked_docs[:top] :
    print("Tweet_id= {}\nTweet: {}\nLikes: {}\nRT: {}\nScore: {}\nCluster: {}\n\n".format(d_id, tweetIndex[d_id], cleanTweets[d_id]['likes'], cleanTweets[d_id]['rt_count'], d_score,cleanTweets[d_id]['cluster']))

Insert your query:

trump gop

Top 20 results out of 16026 for the seached query:

Tweet_id= 1333600406769983489
Tweet: LOL. The GOP thinks Trump is just going to go away? He's the most popular Republican in history.
Likes: 97678
RT: 14622
Score: 0.8602117610175973
Cluster: 1


Tweet_id= 1326926226888544256
Tweet: “REPORT: DOMINION DELETED 2.7 MILLION TRUMP VOTES NATIONWIDE. DATA ANALYSIS FINDS 221,000 PENNSYLVANIA VOTES SWITCH… https://t.co/PgrmoBcJib
Likes: 617608
RT: 185639
Score: 0.8241440392056993
Cluster: 2


Tweet_id= 1324158992877154310
Tweet: Today, the Trump Administration officially left the Paris Climate Agreement. And in exactly 77 days, a Biden Admini… https://t.co/7dT22UeN7h
Likes: 774809
RT: 94285
Score: 0.8196600688662569
Cluster: 0


Tweet_id= 1334240039639937026
Tweet: Statement by Donald J. Trump, The President of the United States

Full Video: https://t.co/EHqzsLbbJG https://t.co/Eu4IsLNsKD
Likes: 309015
RT: 91235
Score: 0.8186487617209675
Cluster: 2


Tweet_id= 13

In [None]:
Sum_of_squared_distances = []
K = range(1,15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(fin)
    Sum_of_squared_distances.append(km.inertia)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [69]:
centroids = kmeans.cluster_centers_
print(centroids.shape)

(3, 1000)
