In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/MyDrive/... paths
# read by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install YAKE from its GitHub repository, then import it; the import has to
# come after the install so the package exists in this runtime.
! pip install git+https://github.com/LIAAD/yake
import yake

Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-6ajlsfi_
  Running command git clone -q https://github.com/LIAAD/yake /tmp/pip-req-build-6ajlsfi_


In [None]:
# `pickle` is imported here as well because this cell runs before the
# notebook's main import cell; without it a fresh kernel fails with NameError.
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load pickles that were produced by a trusted pipeline.
# The `with` block closes the handle even if unpickling raises.
with open("/content/drive/MyDrive/IR_Tweets_Data/twitter_base_preprocessed.pkl", "rb") as file1:
    # Later cells index `df` by the 'hashtags', 'Date_Only' and
    # 'Preprocessed_Data' columns.
    df = pickle.load(file1)

In [None]:
from collections import defaultdict
from math import log, sqrt
import re
import numpy as np
import sys
from copy import deepcopy
import pandas as pd
import numpy as np
import csv
import json
from itertools import islice
from collections import OrderedDict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from glob import glob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from tqdm import tqdm
import pickle
import math
from sklearn.model_selection import train_test_split
import operator
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import datetime


# ## Import the data and create the inverted index



def import_dataset(path='/content/drive/MyDrive/Tweelink_Articles_Processed', dates=None):
    """
    Load the pre-processed bodies of the articles published on the given days.

    Every regular file directly inside ``path`` is parsed as JSON. A file is
    kept when its ``"Date"`` field (``YYYY-MM-DD``) falls on one of ``dates``;
    files whose date is empty or the literal placeholder ``"Date"`` are skipped.

    Parameters
    ----------
    path : str
        Directory holding one JSON file per article.
    dates : iterable of datetime.datetime, datetime.date or 'YYYY-MM-DD' str, optional
        Days to keep. When omitted, the notebook globals ``u_present_date``,
        ``u_prev_date`` and ``u_next_date`` are used, so the original
        zero-argument call keeps working.

    Returns
    -------
    list
        The ``"Body_processed"`` value of every kept article, unmodified
        (presumably a list of tokens -- verify against the preprocessing
        step). Files are visited in sorted path order so that the docIDs
        derived from list positions are reproducible between runs.

    Raises
    ------
    KeyError
        If a file lacks ``"Date"`` or a kept file lacks ``"Body_processed"``.
    ValueError
        If a non-placeholder date does not match ``YYYY-MM-DD``.
    """
    date_format = '%Y-%m-%d'

    if dates is None:
        # Fall back to the query window built by the notebook's date cell.
        dates = (u_present_date, u_prev_date, u_next_date)

    # Normalise every accepted input type to a plain datetime.date.
    wanted_days = set()
    for day in dates:
        if isinstance(day, str):
            day = datetime.datetime.strptime(day, date_format)
        if isinstance(day, datetime.datetime):
            day = day.date()
        wanted_days.add(day)

    docs_preprocessed = []
    for filename in sorted(glob(os.path.join(path, '*'))):
        if not os.path.isfile(filename):
            # Sub-directories cannot be opened as articles.
            continue
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)

        d_date = data["Date"]
        if d_date == "" or d_date == "Date":
            # Undated article or a leftover header row.
            continue

        published = datetime.datetime.strptime(d_date, date_format).date()
        if published not in wanted_days:
            continue

        docs_preprocessed.append(data['Body_processed'])
    return docs_preprocessed




# def remove_stop_words(corpus):
#    '''
#    This function removes from the corpus all the stop words present in the file TIME.STP
#    '''
#    stop_w = [line.rstrip('\n') for line in open('TIME.STP')]
#    stop_w=list(filter(None, stop_w))
#    for i in range(0,len(corpus)):
#        corpus[i] = [x for x in corpus[i] if x not in stop_w]
#    return corpus 


def make_inverted_index(corpus):
    """
    Build the inverted index of a corpus.

    ``corpus`` is an iterable of documents, each an iterable of terms; a
    document's docID is its position in ``corpus``. The result is a
    ``defaultdict(set)`` mapping every term to the set of docIDs of the
    documents that contain it.
    """
    postings = defaultdict(set)
    # Stream (term, docID) pairs in corpus order so terms enter the index in
    # first-seen order.
    occurrences = (
        (token, doc_number)
        for doc_number, tokens in enumerate(corpus)
        for token in tokens
    )
    for token, doc_number in occurrences:
        postings[token].add(doc_number)
    return postings


# ### Union of two posting lists


def posting_lists_union(pl1, pl2):
    """
    Return the union of two posting lists as a new, ascending list.

    Both arguments may be any iterable of docIDs (list, set, ...). Each docID
    appears exactly once in the result, even when an input repeats it; the
    inputs are left untouched.
    """
    # A set union removes duplicates within and across the inputs; sorting
    # restores the ascending posting-list order.
    return sorted(set(pl1) | set(pl2))


# ## Precomputing weights


def DF(term, index):
    '''
    Document Frequency of ``term``: the number of docIDs in its posting set.

    Returns 0 for a term that is not in ``index``.
    '''
    # .get() instead of index[term]: subscripting a defaultdict would insert
    # an empty posting set for every unseen term and so grow the index on a
    # mere lookup.
    return len(index.get(term, ()))


def IDF(term, index, corpus):
    '''
    Inverse Document Frequency of ``term``: the natural logarithm of the
    corpus size divided by the term's document frequency.
    '''
    corpus_size = len(corpus)
    doc_freq = DF(term, index)
    return log(corpus_size / doc_freq)


def RSV_weights(corpus,index):
    '''
    Precompute the initial Retrieval Status Value weight of every index term.

    For each term the weight is its IDF plus the log-odds of
    p = DF / (N + 0.5), where N is the number of documents in ``corpus``.
    Returns a dict mapping term -> weight.
    '''
    smoothed_total = len(corpus) + 0.5
    weights = {}
    for term in index:
        p = DF(term, index) / smoothed_total
        odds = p / (1 - p)
        weights[term] = IDF(term, index, corpus) + log(odds)
    return weights
    


# ## BIM Class


class BIM():
    '''
    Binary Independence Model retrieval over a tokenised corpus, with
    pseudo and explicit relevance feedback.

    Attributes
    ----------
    original_corpus : deep copy of the corpus, used only for display.
    articles        : the corpus the index and scores are computed on.
    index           : inverted index, term -> set of docIDs.
    weights         : term -> current RSV weight.
    ranked          : last ranking, a list of (docID, score), best first.
    query_text      : text of the last query ('' before the first one).
    N_retrieved     : how many ranked documents have been shown so far.
    '''

    FIRST_PAGE = 15            # documents shown by answer_query
    PAGE_SIZE = 10             # documents shown by each show_more call
    SNIPPET_TERMS = 30         # terms shown per document in result lists
    PSEUDO_FEEDBACK_K = 5      # top documents assumed relevant
    MAX_FEEDBACK_ROUNDS = 10   # upper bound on pseudo-feedback iterations

    def __init__(self, corpus):
        self.original_corpus = deepcopy(corpus)
        self.articles = corpus
        self.index = make_inverted_index(self.articles)
        self.weights = RSV_weights(self.articles, self.index)
        self.ranked = []
        self.query_text = ''
        self.N_retrieved = 0

    def _query_terms(self, query_text):
        '''
        Split a free-text query on whitespace and map every token to the
        index terms that equal it ignoring case. Tokens with no counterpart
        in the index are dropped (they cannot contribute to any score).
        Returns the matching index terms without repeats, in query order.
        '''
        # Matching case-insensitively works for corpora stored in upper case
        # as well as for lower-cased ones; unconditionally upper-casing the
        # query matched nothing against a lower-case index.
        by_fold = {}
        for term in self.index:
            if isinstance(term, str):
                by_fold.setdefault(term.casefold(), []).append(term)

        terms = []
        for token in query_text.split():
            for term in by_fold.get(token.casefold(), ()):
                if term not in terms:
                    terms.append(term)
        return terms

    def _print_article(self, position, snippet=True):
        '''
        Print the document at 0-based ``position`` of the current ranking,
        truncated to SNIPPET_TERMS terms when ``snippet`` is true.
        '''
        doc_id, score = self.ranked[position]
        article = self.original_corpus[doc_id]
        if snippet and len(article) > self.SNIPPET_TERMS:
            article = article[0:self.SNIPPET_TERMS]
        text = " ".join(article)
        print(f"Article {position + 1}, score: {score}")
        print(text, '\n')

    def RSV_doc_query(self, doc_id, query):
        '''
        Retrieval Status Value of document ``doc_id`` for ``query`` (an
        iterable of terms): the sum of the current weights of the distinct
        terms that occur in both.
        '''
        if not isinstance(query, (set, frozenset)):
            query = set(query)
        score = 0
        # dict.fromkeys de-duplicates while keeping document order: the model
        # is binary, so a term repeated in the document must count once.
        for term in dict.fromkeys(self.articles[doc_id]):
            if term in query:
                score += self.weights[term]
        return score

    def ranking(self, query):
        '''
        Score every document that appears in the posting list of at least one
        query term and return them as (docID, score) pairs, best first.
        The result is also stored in ``self.ranked``.
        '''
        query_terms = frozenset(query)

        docs = []
        for term in query_terms:
            # Membership test first: subscripting a defaultdict index would
            # create entries for unknown terms.
            if term in self.index:
                docs = posting_lists_union(docs, self.index[term])

        scores = [(doc, self.RSV_doc_query(doc, query_terms)) for doc in docs]

        # sorted() is stable, so equal scores stay in ascending docID order.
        self.ranked = sorted(scores, key=lambda x: x[1], reverse=True)
        return self.ranked

    def recompute_weights(self, relevant_idx, query):
        '''
        Re-estimate the weights of the query terms from a set of relevant
        documents. Used by relevance_feedback and by the pseudo relevance
        feedback in answer_query.

        ``relevant_idx`` holds 1-based ranks into the current ranking.
        Raises IndexError when a rank is outside 1..len(self.ranked).
        '''
        relevant_docs = []
        for idx in relevant_idx:
            if not 1 <= idx <= len(self.ranked):
                # Rank 0 would otherwise silently wrap to the last document.
                raise IndexError(
                    f"Rank {idx} is outside the current ranking (1..{len(self.ranked)})."
                )
            doc_id = self.ranked[idx - 1][0]
            relevant_docs.append(set(self.articles[doc_id]))

        N = len(self.articles)
        N_rel = len(relevant_docs)

        for term in dict.fromkeys(query):
            if term in self.weights:
                # vri: number of relevant documents containing the term.
                vri = sum(1 for doc in relevant_docs if term in doc)
                p = (vri + 0.5) / (N_rel + 1)
                u = (DF(term, self.index) - vri + 0.5) / (N - N_rel + 1)
                self.weights[term] = log((1 - u) / u) + log(p / (1 - p))

    def answer_query(self, query_text):
        '''
        Answer a free-text query: print the number of matching documents and
        then the first SNIPPET_TERMS terms of up to FIRST_PAGE best ones.

        Pseudo relevance feedback treats the top PSEUDO_FEEDBACK_K documents
        (fewer if fewer matched) as relevant and repeats until the ranking
        stops changing, at most MAX_FEEDBACK_ROUNDS times. The term weights
        are reset to their initial values afterwards.
        '''
        self.query_text = query_text
        query = self._query_terms(query_text)

        try:
            ranking = self.ranking(query)

            ## pseudo relevance feedback
            for _ in range(self.MAX_FEEDBACK_ROUNDS):
                if not ranking:
                    break
                top_k = list(range(1, min(self.PSEUDO_FEEDBACK_K, len(ranking)) + 1))
                self.recompute_weights(top_k, query)
                # Compare with the previous round, not the initial ranking,
                # so the loop stops as soon as the ranking has converged.
                previous, ranking = ranking, self.ranking(query)
                if ranking == previous:
                    break

            # Never index past the end of a short ranking.
            self.N_retrieved = min(self.FIRST_PAGE, len(ranking))

            ## print retrieved documents
            print(len(ranking))
            if not ranking:
                print('No documents matched the query.')
            for i in range(self.N_retrieved):
                self._print_article(i)
        finally:
            # Feedback weights are query-specific; restore the initial ones
            # even if ranking or printing failed.
            self.weights = RSV_weights(self.articles, self.index)

    def relevance_feedback(self, *args):
        '''
        Apply explicit relevance feedback to the last query and answer it
        again. The relevant documents are given as 1-based ranks, either as
        separate arguments or as a single list/tuple.
        '''
        if(self.query_text == ''):
            print('Cannot get feedback before a query is formulated.')
            return

        relevant_idx = list(args)
        if relevant_idx and isinstance(relevant_idx[0], (list, tuple)):
            relevant_idx = list(relevant_idx[0])
        if not relevant_idx:
            print('No relevant documents were given.')
            return

        query = self._query_terms(self.query_text)
        self.recompute_weights(relevant_idx, query)

        self.answer_query(self.query_text)

    def read_document(self, rank_num):
        '''
        Print the full text of the document at 1-based rank ``rank_num`` of
        the last ranking.
        '''
        if(self.query_text == ''):
            print('Cannot select a document before a query is formulated.')
            return

        if not 1 <= rank_num <= len(self.ranked):
            print(f'No document at rank {rank_num}: the ranking has {len(self.ranked)} documents.')
            return

        # The text and the score come from the same (rank_num - 1) entry.
        self._print_article(rank_num - 1, snippet=False)

    def show_more(self):
        '''
        Print the next PAGE_SIZE ranked documents, or the remaining ones when
        fewer are left.
        '''
        if(self.N_retrieved >= len(self.ranked)):
            print('No more documents available')
            return

        stop = min(self.N_retrieved + self.PAGE_SIZE, len(self.ranked))
        for i in range(self.N_retrieved, stop):
            self._print_article(i)

        self.N_retrieved = stop
        



# Example of usage

# articles = import_dataset()
# bim  = BIM(articles)
# bim.answer_query('Italy and Great Britain fight the enemy')
# bim.relevance_feedback(5,6,8)
# bim.show_more()
# bim.read_document(2)

In [None]:
# Strip surrounding whitespace from every answer: a stray space makes
# strptime() reject the date and would have to match literally inside the
# hashtag column.
u_base_hashtag = input("Enter base hashtag: ").strip()
u_time = input("Enter time: ").strip()          # expected as YYYY-MM-DD
u_location = input("Enter Location: ").strip()  # NOTE(review): not used by any visible cell

Enter base hashtag: hijab
Enter time: 2022-02-19
Enter Location: India


In [None]:
tweet_query = []

# Named `date_format` rather than `format` so the builtin format() is not
# shadowed for the rest of the notebook.
date_format = '%Y-%m-%d'
u_present_date = datetime.datetime.strptime(u_time, date_format)
u_prev_date = u_present_date - datetime.timedelta(days=1)
u_next_date = u_present_date + datetime.timedelta(days=1)

# The query window is the requested day plus the day before and the day after.
query_days = [str(day.date()) for day in (u_present_date, u_prev_date, u_next_date)]

# regex=False: the hashtag is matched literally, so characters such as '+' or
# '(' in the user's input cannot be read as a pattern.
# na=False: rows without hashtags count as "no match" instead of yielding NaN,
# which cannot be used in a boolean mask.
df_query = df.loc[
    df['hashtags'].str.contains(u_base_hashtag, regex=False, na=False)
    & df['Date_Only'].isin(query_days)
]

In [None]:
# Flatten the per-tweet token lists into one token stream. The list is rebuilt
# from scratch, so re-running this cell does not append the same tokens again.
tweet_query = [token for tweet in df_query['Preprocessed_Data'] for token in tweet]

In [None]:
kw_extractor = yake.KeywordExtractor(top=100, stopwords=None)
keywords = kw_extractor.extract_keywords(' '.join(tweet_query))

# Break every extracted key phrase into single words and keep each word once,
# in first-seen order (dict.fromkeys preserves insertion order).
tweet_keywords = list(dict.fromkeys(
    word
    for phrase, _score in keywords
    for word in phrase.split()
))

print(tweet_keywords)


['hijab', 'karnataka', 'india', 'muslim', 'islam', 'hijabisfundamentalright', 'hijabisourright', 'hijabcontroversy', 'started', 'abaya', 'hijabrow', 'http', 'persecution', 'wear', 'allah', 'woman', 'karnatakahijabcontroversy', 'wearing', 'school', 'world', 'modestfashion', 'quran', 'hijabisindividualright', 'judge', 'modest', 'deen', 'prophetmuhammad', 'dua', 'makkah', 'college', 'hijabban', 'religion', 'china', 'remove', 'islamophobia', 'girl', 'hindu', 'education', 'islamist', 'hijabisourpride', 'hijaboruniform', 'student', 'permission', 'beard', 'freedom', 'asian', 'allowed', 'public', 'saudi', 'hijabplot', 'hijabaurkitab', 'class', 'niqab', 'muslimmen', 'leader']


In [None]:
# Run the retrieval: load the articles, build the BIM engine over them and
# query it with the keywords extracted from the tweets, joined by spaces.

articles = import_dataset()
bim  = BIM(articles)
bim.answer_query(" ".join(tweet_keywords))
# Optional follow-up calls on the same engine:
# bim.relevance_feedback(5,6,8)
# bim.show_more()
# bim.read_document(2)

0


IndexError: ignored