**Project delivery 1**

Part 1: Text Processing and Exploratory Data Analysis

In [1]:
import nltk

In [2]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
from numpy import linalg as la
import time
import json
import re
import pandas as pd
from sklearn.manifold import TSNE

****
**0. LOAD AND STRUCTURE**

For each tweet we have the following features:

- docID
- Tweet 
- ID 
- Date 
- Hashtags 
- Likes 
- Retweets 
- Url

In [3]:
# LOAD

# Open the file: Load data into memory
docs_path = './IRWA_data_2023/Rus_Ukr_war_data.json'
with open(docs_path) as fp:
    # Read the file
    lines = fp.readlines()

data = []
for i in lines:
    data.append(json.loads(i))
    
# STRUCTURE   

# docID | Tweet | ID | Date | Hashtags | Likes | Retweets | Url
# Create a list of dictionaries
tweet_dicts = []

for i in range(len(data)):
    doc_id = 'doc_' + str(i+1)
    tweet_text = data[i]['full_text']
    tweet_id = data[i]['id']
    tweet_date = data[i]['created_at']
    hashtags = [hashtag['text'] for hashtag in data[i]['entities']['hashtags']]
    likes = data[i]['favorite_count']
    retweets = data[i]['retweet_count']
    url = 'https://twitter.com/' + str(data[i]['user']['screen_name'])+'/status/'+str(tweet_id)

    # Create a dictionary for each tweet
    tweet_dict = {'DOC_ID': doc_id, 'Tweet Text': tweet_text, 'Tweet ID': tweet_id, 'Tweet Date': tweet_date, 'Hashtags': hashtags,
                  'Likes': likes, 'Retweets': retweets, 'URL': url}

    # Append the dictionary to the list
    tweet_dicts.append(tweet_dict)

print(tweet_dicts[400])

{'DOC_ID': 'doc_401', 'Tweet Text': 'Morning little buddy Zhang @ChinaAmbUN #JinPing #CCP little man #Putin has gone off the rails for everyone to see.  World has spoken my friend. You are backing a loser. Do the right thing before its too late to turn back the clock. #UkraineRussiaWar  @UKRinUN @USUN @RussiaUN https://t.co/kMwyzEHSvt', 'Tweet ID': 1575863777265942528, 'Tweet Date': 'Fri Sep 30 15:02:57 +0000 2022', 'Hashtags': ['JinPing', 'CCP', 'Putin', 'UkraineRussiaWar'], 'Likes': 1, 'Retweets': 0, 'URL': 'https://twitter.com/onestandard4all/status/1575863777265942528'}


****
**1. PRE-PROCESS THE DOCUMENTS**

In [4]:

def build_terms(line):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    # Tweet Text and Tweet date to lowercase and tokenized
    line['Tweet Text']=  line['Tweet Text'].lower().split()
    line['Tweet Date']=  line['Tweet Date'].lower().split()
    
    # HASHTAGS
    # To lowercase
    line['Hashtags'] = [word.lower() for word in line['Hashtags']]
    
    line['Tweet Text'] = [word for word in line['Tweet Text'] if word[0] != '#']
    #EXTRA-- DELETE SYMBOLS 
    # Decided to delete the hashtags symbols as we have the list of hashtags in the dictionary
    line['Tweet Text'] = [re.sub(r"[^\w\s']+", '', word) for word in line['Tweet Text']]
    
    # DELETE PUNCTUATION
    # EXTRA IF word is a link NOT TO DELETE THE punctuation SYMBOLS
    line['Tweet Text'] = [re.sub(r"[,.;@#?!&$]+", '', word) for word in line['Tweet Text'] if not word.startswith('https')]
    
    # EXTRA-- DELETE EMPTY STRINGS "" 
    line['Tweet Text']= [word for word in line['Tweet Text'] if word != '']
    
    #DELETE STOPWORDS 
    line['Tweet Text']= [word for word in line['Tweet Text'] if word not in stop_words]
    
    # DEALING WITH STEMMING
    line['Tweet Text']= [stemmer.stem(word) for word in line['Tweet Text']]
    
    return line

# print a tweet
print(tweet_dicts[400])

{'DOC_ID': 'doc_401', 'Tweet Text': 'Morning little buddy Zhang @ChinaAmbUN #JinPing #CCP little man #Putin has gone off the rails for everyone to see.  World has spoken my friend. You are backing a loser. Do the right thing before its too late to turn back the clock. #UkraineRussiaWar  @UKRinUN @USUN @RussiaUN https://t.co/kMwyzEHSvt', 'Tweet ID': 1575863777265942528, 'Tweet Date': 'Fri Sep 30 15:02:57 +0000 2022', 'Hashtags': ['JinPing', 'CCP', 'Putin', 'UkraineRussiaWar'], 'Likes': 1, 'Retweets': 0, 'URL': 'https://twitter.com/onestandard4all/status/1575863777265942528'}


In [5]:
#PROCESS ALL THE TWEETS
for tweet in tweet_dicts:
    tweet = build_terms(tweet)


In [6]:
print(tweet_dicts[2])

{'DOC_ID': 'doc_3', 'Tweet Text': ['alert', 'poland', 'prep', 'antiradi', 'tablet', 'nuclear', 'threat'], 'Tweet ID': 1575917992390823936, 'Tweet Date': ['fri', 'sep', '30', '18:38:23', '+0000', '2022'], 'Hashtags': ['nato', 'putin', 'russia', 'russiainvadedukraine', 'ukraine', 'ukrainerussiawar'], 'Likes': 0, 'Retweets': 0, 'URL': 'https://twitter.com/NEWS_ALL_TIME/status/1575917992390823936'}


****
**PROJECT PART 2**

1. Build inverted index

In [7]:

def create_index(lines): 
    index = defaultdict(list)
    # Create an index for each term in the tweet text
    for i, line in enumerate(lines):
        for term in line['Tweet Text']:
            # show a list in which doc_id it appears
            if i not in index[term]:
                index[term].append(i)
                      
    return index

In [8]:

index = create_index(tweet_dicts)
#index

2. Propose test queries

In [9]:
# build terms for the query 
def build_terms_query(query):
    # To lowercase and tokenized
    query = query.lower().split()
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    #DELETE STOPWORDS
    query = [word for word in query if word not in stop_words]
    # DEALING WITH STEMMING
    query = [stemmer.stem(word) for word in query]
    return query


The difference between search_union and search_intersection is that the first one returns the documents that contain at least one of the words in the query, while the second one returns the documents that contain all the words in the query. We are going to use both for a further analysis: 

In [10]:
def search_intersection(query, index):
    "term_docs contains (for each word) the documents that contain each word in the query, "
    "then we should intersect these lists in order to obtain the documents where all "
    "the words in the query are found"
    
    query = build_terms_query(query)
    docs = None  
    for term in query:
        try:
            term_docs = set(index[term]) 
            #print(term, len(term_docs))
            if docs is None:
                docs = term_docs
            else:
                docs = docs.intersection(term_docs)
        except KeyError:
            # Term is not in index
            pass

    if docs is not None:
        docs = list(docs)
    else:
        docs = []  # No matching documents found

    return docs

To do the select the queries we are going to do in a better way we are going to see the most frequent words in the corpus and then we are going to select the queries.

In [11]:
print('Top 10 most used words:')
df = pd.DataFrame(tweet_dicts)
# terms most used by frequency 
all_words = []
for line in df['Tweet Text']:
    all_words.extend(line)
# Create a dictionary with the frequency of each word
word_freq = collections.Counter(all_words)
word_freq.most_common(10)

Top 10 most used words:


[('ukrain', 945),
 ('russian', 884),
 ('russia', 556),
 ('ukrainian', 466),
 ('war', 444),
 ('putin', 443),
 ('forc', 273),
 ('region', 250),
 ('annex', 226),
 ('amp', 222)]

Selection of queries for search_intersection:

In [12]:
# 1. "Ukraine and Russia"
query = "Ukraine and Russia"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")

# 2. "Ukraine Russia War"

query = "Ukraine Russia War"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")

# 3. "World War"
query = "World war"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")

# 4. "putin war"
query = "putin war"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")

# 5. "Annex Region"
query = "Annex Region"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")

# 6. "Putin world war"
query = "Putin world war"
docs = search_intersection(query, index)
print(query, len(docs))
#print(docs)
print("*************")


Ukraine and Russia 180
*************
Ukraine Russia War 50
*************
World war 21
*************
putin war 48
*************
Annex Region 60
*************
Putin world war 3
*************


3. Rank your results

In [13]:
def create_index_tfidf(lines):
    index = create_index(lines)
    tf = defaultdict(list)  # term frequencies of terms in documents (documents in the same order as in the main index)
    df = defaultdict(int)  # document frequencies of terms in the corpus
    idf = defaultdict(float)

    # Total number of documents in the corpus
    num_documents = len(lines)

    # Iterate through each document
    for i, line in enumerate(lines):

        # Extract the terms from the document
        terms = line['Tweet Text']

        term_count = defaultdict(int)

        for term in terms:
            term_count[term] += 1

        # Normalize term frequencies in the document
        norm = math.sqrt(sum(count ** 2 for count in term_count.values()))

        for term, count in term_count.items():
            tf_value = np.round(count / norm, 4)  # Calculate TF value for the term
            tf[term].append(tf_value)
            df[term] += 1

    # Calculate IDF for each term
    for term in df:
        idf[term] = np.round(np.log(float(num_documents / df[term])),4)

    return index, tf, df, idf



In [14]:
# Example usage:
start_time = time.time()
total_index, tf, df, idf = create_index_tfidf(tweet_dicts)
print("Total time to create the index: {} seconds" .format(np.round(time.time() - start_time, 2)))

Total time to create the index: 0.5 seconds


In [15]:
total_index

defaultdict(list,
            {'melsimmonsfcdo': [0],
             'wrong': [0,
              24,
              255,
              326,
              444,
              1321,
              1497,
              1511,
              1595,
              2092,
              2156,
              2206,
              2235,
              2236,
              3713,
              3843,
              3899,
              3946,
              3993],
             'dictat': [0, 198, 224, 239, 665, 1861, 2673, 2962, 3114],
             "putin'": [0,
              55,
              98,
              130,
              132,
              135,
              182,
              190,
              245,
              304,
              332,
              414,
              515,
              540,
              576,
              601,
              626,
              676,
              741,
              766,
              809,
              1185,
              1388,
              1477,
              1571,
       

Term Frequency measures the frequency of a term (word or phrase) in a specific document.

In [16]:
tf

defaultdict(list,
            {'melsimmonsfcdo': [0.2887],
             'wrong': [0.2887,
              0.2673,
              0.8165,
              0.2357,
              0.189,
              0.2294,
              0.2357,
              0.3015,
              0.2357,
              0.2,
              0.25,
              0.2,
              0.2357,
              0.2357,
              0.1925,
              0.3015,
              0.2357,
              0.2236,
              0.2774],
             'dictat': [0.2887,
              0.5,
              0.5774,
              0.2357,
              0.2132,
              0.2582,
              0.2425,
              0.2132,
              0.5774],
             "putin'": [0.2887,
              0.3162,
              0.3333,
              0.25,
              0.3536,
              0.25,
              0.1925,
              0.2236,
              0.7071,
              0.3536,
              0.25,
              0.2582,
              0.4472,
              0.2132,
    

Document Frequency measures how many documents in a corpus contain a specific term (word or phrase).

In [17]:
df

defaultdict(int,
            {'melsimmonsfcdo': 1,
             'wrong': 19,
             'dictat': 9,
             "putin'": 63,
             'fascist': 9,
             'russia': 502,
             'intend': 5,
             'conquer': 2,
             'much': 28,
             'ukrainian': 435,
             'land': 63,
             'possibl': 36,
             'arm': 105,
             'forc': 256,
             'liber': 44,
             'villag': 14,
             'urban': 2,
             'territori': 165,
             'commun': 18,
             'regionukrainerussiawar': 1,
             'alert': 11,
             'poland': 17,
             'prep': 1,
             'antiradi': 1,
             'tablet': 1,
             'nuclear': 99,
             'threat': 39,
             'im': 14,
             'still': 35,
             'wait': 19,
             'googl': 2,
             'map': 26,
             'updat': 39,
             'new': 149,
             'annex': 215,
             'take': 71,
            

Inverse Document Frequency measures the importance of a term across a collection of documents (corpus).

In [18]:
idf

defaultdict(float,
            {'melsimmonsfcdo': 8.294,
             'wrong': 5.3496,
             'dictat': 6.0968,
             "putin'": 4.1509,
             'fascist': 6.0968,
             'russia': 2.0754,
             'intend': 6.6846,
             'conquer': 7.6009,
             'much': 4.9618,
             'ukrainian': 2.2187,
             'land': 4.1509,
             'possibl': 4.7105,
             'arm': 3.6401,
             'forc': 2.7489,
             'liber': 4.5099,
             'villag': 5.655,
             'urban': 7.6009,
             'territori': 3.1881,
             'commun': 5.4037,
             'regionukrainerussiawar': 8.294,
             'alert': 5.8962,
             'poland': 5.4608,
             'prep': 8.294,
             'antiradi': 8.294,
             'tablet': 8.294,
             'nuclear': 3.6989,
             'threat': 4.6305,
             'im': 5.655,
             'still': 4.7387,
             'wait': 5.3496,
             'googl': 7.6009,
             '

Ranking the documents 

In [19]:
def rank_documents(terms, docs, index, idf, tf):
    doc_vectors = defaultdict(lambda: [0] * len(terms))
    query_vector = [0] * len(terms)

    query_terms_count = collections.Counter(terms)
    
    query_norm = la.norm(list(query_terms_count.values()))
    
    
    for termIndex, term in enumerate(terms):
        if term not in index:
            continue

        query_vector[termIndex] = query_terms_count[term] / query_norm * idf[term]

        for doc_index, doc in enumerate(index[term]):
            if doc in docs:
                doc_vectors[doc][termIndex] = tf[term][doc_index] *idf[term]

    doc_scores = [[np.dot(curDocVec, query_vector), doc] for doc, curDocVec in doc_vectors.items()]
    doc_scores.sort(reverse=True)
    
    result_docs = [x[1] for x in doc_scores]
    if len(result_docs) == 0:
        print("No results found, try again")
        query = input()
        docs = search_tf_idf(query, index)
    return result_docs, doc_vectors

In [20]:
def search_tf_idf(query, index):
    query = build_terms_query(query) 
    docs = None
    for term in query:
        try:
            term_docs = set(index[term]) 
            #print(term, len(term_docs))
            if docs is None:
                docs = term_docs
               
            else:
                docs = docs.intersection(term_docs)
                
        except KeyError:
            # Term is not in index
            pass

    docs = list(docs)
   
    ranked_docs, doc_vectors = rank_documents(query, docs, index, idf, tf)
    return ranked_docs, doc_vectors


In [21]:
# Then, perform a query and rank the documents
query = 'putin'

ranked_docs, doc_vectors = search_tf_idf(query, total_index)

# Display the top-ranked documents with their tweet text
top = 10
print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for doc_id in ranked_docs[:top]:
    print("tweet_id= {} - Text: {}".format(doc_id, tweet_dicts[int(doc_id)]['Tweet Text']))


Sample of 10 results out of 414 for the searched query:

tweet_id= 1070 - Text: ['lot', 'blood', 'putin', 'hand', 'putin', 'destroy', 'russia', 'putin', 'destroy', 'ukrain', 'putin', 'lost', 'war']
tweet_id= 3046 - Text: ['would', 'putin']
tweet_id= 736 - Text: ['putin', 'arriv']
tweet_id= 3563 - Text: ['putin', 'mysteri', 'limp', 'russia', 'presid', 'realli', 'die', 'putin', 'health', 'analys', 'vladimir', 'putin', 'sick']
tweet_id= 2593 - Text: ['putin', 'mysteri', 'limp', 'russia', 'presid', 'realli', 'die', 'putin', 'health', 'analys', 'vladimir', 'putin', 'sick']
tweet_id= 1735 - Text: ['putin', 'mysteri', 'limp', 'russia', 'presid', 'realli', 'die', 'putin', 'health', 'analys', 'vladimir', 'putin', 'sick']
tweet_id= 1619 - Text: ['putin', 'mysteri', 'limp', 'russia', 'presid', 'realli', 'die', 'putin', 'health', 'analys', 'vladimir', 'putin', 'sick']
tweet_id= 765 - Text: ['putin', 'mysteri', 'limp', 'russia', 'presid', 'realli', 'die', 'putin', 'health', 'analys', 'vladimir', '

**Evaluation**

In [22]:
search_results = pd.read_csv("./Evaluation_gt.csv")
search_results.head()

Unnamed: 0,doc,query_id,label
0,doc_2052,Q3,1
1,doc_164,Q3,1
2,doc_411,Q3,1
3,doc_1805,Q3,1
4,doc_3442,Q3,1


1. Baseline evaluation

a. Information need 1: What is the discussion regarding a tank in Kharkiv?


In [23]:
# Then, perform a query and rank the documents
query = 'tank in Kharkiv'

ranked_docs, doc_vectors = search_tf_idf(query, total_index)

# Display the top-ranked documents with their tweet text
top = 10
print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for doc_id in ranked_docs[:top]:
    print("tweet_id= {} - Text: {}".format(doc_id, tweet_dicts[int(doc_id)]['Tweet Text']))


Sample of 10 results out of 6 for the searched query:

tweet_id= 2233 - Text: ['destroy', 'ukrainian', 'tank', 'kharkiv', 'region']
tweet_id= 591 - Text: ['footag', 'russian', 'tank', 'knock', 'ukrainian', 'forc', "ol'hivka", 'kharkiv', 'oblast']
tweet_id= 1927 - Text: ['ukrainian', 'soldier', 'pose', 'front', 'destroy', 'russian', 'armi', 't72b3', 'main', 'battl', 'tank', 'kharkiv', 'oblast']
tweet_id= 2700 - Text: ['kharkiv', 'region', 'air', 'reconnaiss', 'state', 'border', 'servic', 'elimin', 'tank', 'tigr', 'armor', 'vehicl', 'two', 'truck']
tweet_id= 1969 - Text: ['ukrainian', 'forc', 'find', 'two', 'abandon', 'russian', 'armi', 'main', 'battl', 'tank', 'kharkiv', 'oblast', 'one', 't80bv', 't80u']
tweet_id= 3477 - Text: ['russian', 'troop', 'hit', 'manufactur', 'workshop', 'armour', 'vehicl', 'plant', 'near', 'kharkiv', 'storag', '90', 'tank', 'vehicl', 'afu', 'also', 'vehicl', 'repair', 'point', 'energomekhkomplekt', 'plant', 'hit', 'zaporizhy', 'russian', 'mod']


b. Information Need 2: What discussions are there about the Nord Stream pipeline?

In [24]:
# Then, perform a query and rank the documents
query = 'the Nord Stream pipeline'

ranked_docs, doc_vectors = search_tf_idf(query, total_index)

# Display the top-ranked documents with their tweet text
top = 10
print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for doc_id in ranked_docs[:top]:
    print(doc_id)
    print("tweet_id= {} - Text: {}".format(doc_id, tweet_dicts[int(doc_id)]['Tweet Text']))


Sample of 10 results out of 18 for the searched query:

622
tweet_id= 622 - Text: ['putin', 'anglosaxon', 'blew', 'nord', 'stream', 'pipelin']
617
tweet_id= 617 - Text: ["anglosaxon'", 'blew', 'nord', 'stream', 'pipelin', 'putin']
1451
tweet_id= 1451 - Text: ['moscow', 'insist', 'probe', 'circumst', 'unpreced', 'attack', 'russian', 'nord', 'stream', 'nord', 'stream', '2', 'ga', 'pipelin']
3420
tweet_id= 3420 - Text: ['whodunnit', 'fact', 'relat', 'sabotag', 'attack', 'nord', 'stream', 'pipelin']
2907
tweet_id= 2907 - Text: ['sabotag', 'caus', 'destruct', 'string', 'nord', 'stream', 'nord', 'stream', '2', 'ga', 'pipelin', 'could', 'lead', 'escal', 'even', 'war', 'former', 'us', 'presid', 'donald', 'trump']
3343
tweet_id= 3343 - Text: ['polish', 'member', 'european', 'parliament', 'say', 'us', 'blew', 'nord', 'stream', 'pipelin']
2676
tweet_id= 2676 - Text: ['opinion', 'russia', 'like', 'culprit', 'nord', 'stream', 'pipelin', 'sabotag', 'washington', 'post']
3912
tweet_id= 3912 - Text: 

c. Information need 3: What is being said about the annexation of territories by Russia?

In [25]:
# Then, perform a query and rank the documents
query = 'the Nord Stream pipeline'

ranked_docs, doc_vectors = search_tf_idf(query, total_index)

# Display the top-ranked documents with their tweet text
top = 10
print("\n======================\nSample of {} results out of {} for the searched query:\n".format(top, len(ranked_docs)))
for doc_id in ranked_docs[:top]:
    print("tweet_id= {} - Text: {}".format(doc_id, tweet_dicts[int(doc_id)]['Tweet Text']))


Sample of 10 results out of 18 for the searched query:

tweet_id= 622 - Text: ['putin', 'anglosaxon', 'blew', 'nord', 'stream', 'pipelin']
tweet_id= 617 - Text: ["anglosaxon'", 'blew', 'nord', 'stream', 'pipelin', 'putin']
tweet_id= 1451 - Text: ['moscow', 'insist', 'probe', 'circumst', 'unpreced', 'attack', 'russian', 'nord', 'stream', 'nord', 'stream', '2', 'ga', 'pipelin']
tweet_id= 3420 - Text: ['whodunnit', 'fact', 'relat', 'sabotag', 'attack', 'nord', 'stream', 'pipelin']
tweet_id= 2907 - Text: ['sabotag', 'caus', 'destruct', 'string', 'nord', 'stream', 'nord', 'stream', '2', 'ga', 'pipelin', 'could', 'lead', 'escal', 'even', 'war', 'former', 'us', 'presid', 'donald', 'trump']
tweet_id= 3343 - Text: ['polish', 'member', 'european', 'parliament', 'say', 'us', 'blew', 'nord', 'stream', 'pipelin']
tweet_id= 2676 - Text: ['opinion', 'russia', 'like', 'culprit', 'nord', 'stream', 'pipelin', 'sabotag', 'washington', 'post']
tweet_id= 3912 - Text: ['swedish', 'militari', 'show', 'new',

2. Binary relevance

In [26]:
print_result = search_results["label"].unique()
print("The ground truth of our dataset is composed of {} Relevance Levels: {}".format(len(print_result), sorted(print_result)))

search_results["is_relevant"] = search_results["label"].apply(lambda y: 1 if y == 1 else 0)
search_results.head()

# 1. "Ukraine and Russia"
query = "Ukraine and Russia"
# 


The ground truth of our dataset is composed of 2 Relevance Levels: [0, 1]


3. Evaluate your algorithm by using different evaluation techniques

R@K

In [30]:
def recall_at_k(label, y_score, k=10):
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(label, order[:k])
    relevant = sum(label == 1)
    return float(sum(doc_score)) / relevant

current_query = 0
current_query_res = search_results[search_results["query_id"] == current_query]

k = 5
#print("==> Recall@{}: \n".format(k, recall_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))

IndexError: cannot do a non-empty take from an empty axes.

P@K

In [31]:

def precision_at_k(label, y_score, k=10):

    order = np.argsort(y_score)[::-1]
    doc_score = np.take(label, order[:k])
    relevant = sum(label == 1)
    return float(relevant) / k

# Check for query 0

current_query = 0
current_query_res = search_results[search_results["query_id"] == current_query]

k = 5
#print("==> Precision@{}: {}\n".format(k, precision_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)))
print("\nCheck on the dataset sorted by score:\n")
#current_query_res.sort_values("score", ascending=False).head(k)
current_query_res.sort_values("predicted_relevance", ascending=False).head(k)

KeyError: 'predicted_relevance'

F1-Score@K

In [None]:
def f1_score_at_k(label, y_score, k=10):
    precision = precision_at_k(label, y_score, k)
    recall = recall_at_k(label, y_score, k)
    
    if precision + recall == 0:
        return 0.0
    
    return 2 * (precision * recall) / (precision + recall)

In [None]:
current_query = 0
current_query_res = search_results[search_results["query_id"] == current_query]

k = 5
f1_score = f1_score_at_k(current_query_res["is_relevant"], current_query_res["predicted_relevance"], k)
print("==> F1-Score@{}: {}\n".format(k, f1_score))


AP@K

In [None]:
def avg_precision_at_k(label, y_score, k=10):

    gtp = np.sum(label == 1)
    order = np.argsort(y_score)[::-1]
    doc_score = np.take(label, order[:k])
    ## if all documents are not relevant
    if gtp == 0:
        return 0
    n_relevant_at_i = 0
    prec_at_i = 0
    for i in range(len(label)):
        if doc_score[i] == 1:
            n_relevant_at_i += 1
            prec_at_i += n_relevant_at_i / (i + 1)
    return prec_at_i / gtp

In [None]:
avg_precision_at_k(np.array(current_query_res["is_relevant"]), np.array(current_query_res["predicted_relevance"]), 150)

mAP

In [None]:
def map_at_k(search_res, k=10):

    avp = []
    for q in search_res["query_id"].unique():  # loop over all query id
        curr_data = search_res[search_res["query_id"] == q]  # select data for current query
        avp.append(avg_precision_at_k(np.array(curr_data["is_relevant"]),
                   np.array(curr_data["predicted_relevance"]), k))  #append average precision for current query
    return np.sum(avp) / len(avp), avp  # return mean average precision

In [None]:
map_k, avp = map_at_k(search_results, 10)
map_k

MRR

In [None]:
def rr_at_k(doc_score, y_score, k=10):


    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[
                             :k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    if np.sum(doc_score) == 0:  # if there are not relevant doCument return 0
        return 0
    return 1 / (np.argmax(doc_score == 1) + 1)  # hint: to get the position of the first relevant document use "np.argmax"


NDCG

In [None]:
def dcg_at_k(doc_score, y_score, k=10):
    order = np.argsort(y_score)[::-1]  # get the list of indexes of the predicted score sorted in descending order.
    doc_score = np.take(doc_score, order[:k])  # sort the actual relevance label of the documents based on predicted score(hint: np.take) and take first k.
    gain = 2 ** doc_score - 1  # Compute gain (use formula 7 above)
    discounts = np.log2(np.arange(len(doc_score)) + 2)  # Compute denominator
    return np.sum(gain / discounts)  #return dcg@k


def ndcg_at_k(doc_score, y_score, k=10):
    dcg_max = dcg_at_k(doc_score, doc_score, k)
    if not dcg_max:
        return 0
    return np.round(dcg_at_k(doc_score, y_score, k) / dcg_max, 4)

AVCJKHK

In [None]:

tfidf_array = np.array(doc_vectors)

# Check the shape of tfidf_array
print("Shape of tfidf_array:", tfidf_array.shape)

# Set an appropriate perplexity value
perplexity = 30  # You can adjust this value as needed

# Apply T-SNE for dimensionality reduction
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
model.fit_transform(tfidf_array)

In [None]:
# Scatter plot
plt.figure(figsize=(10, 8))
plt.scatter(tsne_result[:, 0], tsne_result[:, 1], marker='o')

# Annotate each point with the corresponding tweet text
for i, txt in enumerate(tweets):
    plt.annotate(txt, (tsne_result[i, 0], tsne_result[i, 1]))

plt.title("T-SNE Visualization of Tweets")
plt.show()