# Ranking

Second Assignement of Learning with Massive Data. 


In [4]:
import json
import numpy as np
import string

path_name_documents = './Databases/prova/gigi.jsonl'
path_name_query = './Databases/prova/query.jsonl'


def readFile(path_name):
    # Load the JSONL file into a list
    with open(path_name, 'r') as f:
        lines = f.readlines()

    # Convert each JSON object into a dictionary
    dicts = [json.loads(line) for line in lines]

    # Convert the dictionaries into arrays and stack them vertically
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])

    # Convert the arrays into a list of lists
    text = arrays.tolist()
    
    return text

documents = readFile(path_name_documents)
#print(documents)
query = readFile(path_name_query)
#print(query)


# Tokenize

In [5]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Perform stemming or lemmatization on filtered tokens
    
    filtered_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    filtered_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return filtered_tokens
    
 
    

def tokenize(path_name):
    
    with open(path_name, "r") as f:
        data = f.readlines()

        # Create an empty list to store the tokenized documents
        tokenized_docs = []

        # Loop through each line in the JSONL file
        for line in data:
            # Parse the JSON string into a Python dictionary
            doc = json.loads(line)

            # Extract the text from the dictionary
            text = doc['text']
            text = text.lower()  # Convert to lowercase
            #text = re.sub(r'\d+', '', text)  # Remove all numbers
            text = text.translate(str.maketrans('', '', string.punctuation))  # Remove all punctuation

            # Tokenize the text using NLTK
            tokens = word_tokenize(text)
            tokensStemLem = stemmingLemming(tokens)

            # Add the tokenized document to the list
            tokenized_docs.append(tokensStemLem)

        # Print the tokenized documents
    return tokenized_docs

tokenized_docs = tokenize(path_name_documents)


tokenized_query = tokenize(path_name_query)




# TF

In [3]:
import collections
import nltk
import math


def calculateTF(tokenized_docs):
    term_freqs = []
    for doc in tokenized_docs:
        doc_freq = collections.Counter(doc) #number of repetition for each word
       
        total_terms = len(doc) #length for each document
        
        for term in doc_freq:
            doc_freq[term] /= float(total_terms)
        term_freqs.append(doc_freq)

    return term_freqs



def calculateTF2(tokenized_docs):
    term_freqs = []
    for doc in tokenized_docs:
        doc_freq = {}
        total_terms = len(doc)
        for term in doc:
            doc_freq[term] = doc_freq.get(term, 0) + 1
        for term in doc_freq:
            doc_freq[term] /= float(total_terms)
        term_freqs.append(doc_freq)

    return term_freqs



def calculateTFbm25(tokenized_docs):
    term_freqs = []
    k1 = 1.5 # parameter for controlling term frequency normalization
    b = 0.75 # parameter for controlling document length normalization
    avgdl = sum(len(doc) for doc in tokenized_docs) / len(tokenized_docs) # average document length

    for doc in tokenized_docs:
        doc_freq = collections.Counter(doc) #number of repetition for each word
        total_terms = len(doc) #length for each document
        doc_len_norm = ((1 - b) + b * (total_terms / avgdl)) # document length normalization factor

        for term in doc_freq:
            tf = doc_freq[term] / total_terms # term frequency
            tf_norm = ((k1 + 1) * tf) / (k1 * ((1 - b) + b * (total_terms / avgdl)) + tf) # normalized term frequency with BM25 weighting
            doc_freq[term] = tf_norm

        term_freqs.append(doc_freq)

    return term_freqs

    
#term_freqs = calculateTF2(tokenized_docs)
term_freqs = calculateTFbm25(tokenized_docs)

print(term_freqs)

[Counter({'in': 0.05852609374515299, 'ho': 0.05215886798668843, 'of': 0.05215886798668843, 'and': 0.045758344661276214, 'to': 0.045758344661276214, 'the': 0.045758344661276214, 'a': 0.032856355034394685, 'co': 0.032856355034394685, 'heme': 0.026354356675486374, 'it': 0.026354356675486374, 'play': 0.01981799656161261, 'student': 0.01324700157864003, 'oxygenas': 0.01324700157864003, 'an': 0.01324700157864003, 'stress': 0.01324700157864003, 'role': 0.01324700157864003, 'by': 0.01324700157864003, 'antiinflammatori': 0.01324700157864003, 'effect': 0.01324700157864003, 'for': 0.01324700157864003, 'with': 0.01324700157864003, 'pulmonari': 0.01324700157864003, 'stude': 0.006641095710391907, 'studi': 0.006641095710391907, 'induc': 0.006641095710391907, 'protein': 0.006641095710391907, 'confer': 0.006641095710391907, 'cytoprotect': 0.006641095710391907, 'against': 0.006641095710391907, 'oxid': 0.006641095710391907, 'vitro': 0.006641095710391907, 'vivo': 0.006641095710391907, 'addit': 0.006641095

# IDF

In [23]:
import math
import collections
import nltk
from collections import defaultdict

def calculateIDF(tokenized_docs,term_freqs):
    
    #dictionary with the frequency of each term in all documents
    all_terms = [term for doc in tokenized_docs for term in doc]
    
    df = collections.Counter(all_terms)

    # Calculate IDF for each term
    N = len(tokenized_docs)
    
    idf = {}
    
    for term in df:
        n = len([doc for doc in tokenized_docs if term in doc])
        idf[term] = math.log(N / float(df[term] + 1)) #math.log(1 + (N - n + 0.5)/(n + 0.5))
    return idf




def calculateIDFBM25(tokenized_docs):
    #dictionary with the frequency of each term in all documents
    all_terms = [term for doc in tokenized_docs for term in doc]
    
    df = defaultdict(int)

    # Calculate DF for each term
    for term in all_terms:
        df[term] += 1
    # Calculate IDF for each term
    N = len(tokenized_docs)
    
    idf = {}
    
    for term in df:
        n = len([doc for doc in tokenized_docs if term in doc])
        idf[term] = math.log((N - n + 0.5)/(n + 0.5)+1)
    
    return idf




#idf = calculateIDFBM25(tokenized_docs)
idf = calculateIDF(tokenized_docs,term_freqs)
print(idf)





{'play': -0.6931471805599453, 'student': -0.40546510810816444, 'stude': 0.0, 'studi': 0.0, 'heme': -0.916290731874155, 'oxygenase1': 0.0, 'ho1': -1.5040773967762742, 'an': -0.40546510810816444, 'induc': 0.0, 'stress': -0.40546510810816444, 'protein': 0.0, 'confer': 0.0, 'cytoprotect': 0.0, 'against': 0.0, 'oxid': 0.0, 'in': -1.791759469228055, 'vitro': 0.0, 'and': -1.791759469228055, 'vivo': 0.0, 'addit': 0.0, 'to': -1.3862943611198906, 'it': -0.916290731874155, 'physiolog': 0.0, 'role': -0.40546510810816444, 'degrad': 0.0, 'may': 0.0, 'influenc': 0.0, 'a': -1.3862943611198906, 'number': 0.0, 'of': -2.0794415416798357, 'cellular': 0.0, 'process': 0.0, 'includ': 0.0, 'growth': 0.0, 'inflamm': 0.0, 'apoptosi': 0.0, 'by': -0.916290731874155, 'virtu': 0.0, 'antiinflammatori': -0.40546510810816444, 'effect': -0.40546510810816444, 'limit': 0.0, 'tissu': 0.0, 'damag': 0.0, 'respons': 0.0, 'proinflammatori': 0.0, 'stimulu': 0.0, 'prevent': 0.0, 'allograft': 0.0, 'reject': 0.0, 'after': 0.0, 't

# Step successivi



# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculateTFIDF(tokenized_docs):
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the tokenized documents into a TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform([' '.join(doc) for doc in tokenized_docs])

    # Get the feature names (tokens)
    feature_names = vectorizer.get_feature_names()

    # Return the TF-IDF matrix and the feature names
    return tfidf_matrix, feature_names,vectorizer
 
    
def calculateTFIDFQ(tokenized_query,vectorizer):
    # Initialize the TfidfVectorizer
    #vectorizer = TfidfVectorizer()

    # Fit and transform the tokenized documents into a TF-IDF matrix
    tfidf_matrix = vectorizer.transform([' '.join(doc) for doc in tokenized_query])

    # Get the feature names (tokens)
    feature_names = vectorizer.get_feature_names()

    # Return the TF-IDF matrix and the feature names
    return tfidf_matrix, feature_names

    

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)
tfidf_matrix_query, feature_names_query  = calculateTFIDFQ( tokenized_query,vectorizer)

print("TFIDF- DOCUMENTS:")
#print(tfidf_matrix_docs.toarray())
print(tfidf_matrix_docs)

print("TFIDF- QUERY:")
#print(tfidf_matrix_query.toarray())
print(tfidf_matrix_query)




TFIDF- DOCUMENTS:
  (0, 153)	0.03366972268896855
  (0, 74)	0.0473216204390231
  (0, 109)	0.03366972268896855
  (0, 50)	0.0473216204390231
  (0, 144)	0.0473216204390231
  (0, 169)	0.0473216204390231
  (0, 42)	0.0473216204390231
  (0, 21)	0.0473216204390231
  (0, 184)	0.0473216204390231
  (0, 43)	0.0473216204390231
  (0, 165)	0.0473216204390231
  (0, 66)	0.0473216204390231
  (0, 68)	0.0473216204390231
  (0, 100)	0.03366972268896855
  (0, 129)	0.0946432408780462
  (0, 69)	0.0473216204390231
  (0, 36)	0.0473216204390231
  (0, 150)	0.0473216204390231
  (0, 30)	0.0473216204390231
  (0, 60)	0.0473216204390231
  (0, 55)	0.0473216204390231
  (0, 54)	0.0473216204390231
  (0, 93)	0.0473216204390231
  (0, 29)	0.0473216204390231
  (0, 61)	0.0473216204390231
  :	:
  (1, 174)	0.0883131559753662
  (1, 31)	0.0883131559753662
  (1, 134)	0.0883131559753662
  (1, 143)	0.0883131559753662
  (1, 47)	0.0883131559753662
  (1, 181)	0.1766263119507324
  (1, 49)	0.0883131559753662
  (1, 26)	0.1766263119507324
  (

# Dot Product of Sparse Vectors

In [93]:
import numpy as np


def calculate_Dot_Product(tfidf_matrix_docs,tfidf_matrix_query):
    
    for i in len
    results = np.dot(tfidf_matrix_docs,tfidf_matrix_query[0])
    return results

dot_result = calculate_Dot_Product(tfidf_matrix_docs,tfidf_matrix_query)
print(dot_result)

SyntaxError: invalid syntax (3995525270.py, line 7)

In [6]:
import math

def bm25_tf_idf(tokenized_docs, term_freqs, idf):
    k1 = 1.2
    b = 0.75
    N = len(tokenized_docs)
    avg_doc_len = sum(len(doc) for doc in tokenized_docs) / N
    scores = []
    for i in range(N):
        score = 0
        doc = tokenized_docs[i]
        doc_len = len(doc)
        for term, freq in term_freqs[i].items():
            if term in idf:
                idf_val = idf[term]
                tf = ((k1 + 1) * freq) / (k1 * ((1 - b) + (b * (doc_len / avg_doc_len))) + freq)
                score += tf * idf_val
        scores.append(score)
    return scores

score = bm25_tf_idf(tokenized_docs, term_freqs, idf)

print(score)

[1.0687327120513277, 3.0164868921587815]


# Sparse Vector

# Dense Vector

In [14]:
from transformers import AutoTokenizer, AutoModel

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/minilm-l6-hybrid-512")
model = AutoModel.from_pretrained("microsoft/minilm-l6-hybrid-512")

# Define function to convert an array of text into dense vectors
def create_dense_vectors(inputs):
    # Tokenize the texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    # Pass the inputs through the model
    with torch.no_grad():
        outputs = model(**tokenized_docs, output_hidden_states=True)

    # Get the last hidden state of the model as the dense vector
    dense_vectors = outputs.hidden_states[-1][:, 0, :].numpy()

    print(dense_vectors.tolist())
    return dense_vectors.tolist()



OSError: microsoft/minilm-l6-hybrid-512 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

# Prova

In [44]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


# Define the documents and queries
docs = [" ", "covid can kill people","covid19 is a bluf", "This is the third document","exist a vaccine to save the people from covid","we mus us vaccine cause covid kill people"]
query = "vaccine for covid19"

# Initialize the vectorizer and compute the TF-IDF matrix for the documents
vectorizer = TfidfVectorizer()
tfidf_matrix_docs = vectorizer.fit_transform(docs)

# Compute the TF-IDF matrix for the query
tfidf_matrix_query = vectorizer.transform([query])

# Compute the dot product between the documents and the query
dot_product = np.dot(tfidf_matrix_docs, tfidf_matrix_query.T)

# Get the top-k documents based on the dot product
k = 1
top_k_indices = np.argsort(dot_product, axis=0)[-k:]
top_k_docs = [docs[i] for i in top_k_indices]

# Compute the average TF-IDF weights for the top-k terms in the top-k documents
top_k_terms = set()
for doc in top_k_docs:
    doc_tfidf = vectorizer.transform([doc])
    doc_weights = np.array(doc_tfidf.mean(axis=0))[0]
    top_k_terms.update(np.argsort(doc_weights)[-k:])

# Expand the query with the top-k terms
expanded_query = query + " " + " ".join([vectorizer.get_feature_names()[i] for i in top_k_terms])

# Compute the TF-IDF matrix for the expanded query
tfidf_matrix_expanded_query = vectorizer.transform([expanded_query])

# Compute the dot product between the documents and the expanded query
dot_product_expanded = np.dot(tfidf_matrix_docs, tfidf_matrix_expanded_query.T)

# Get the top-k documents based on the expanded query
top_k_indices_expanded = np.argsort(dot_product_expanded, axis=0)[-k:]
top_k_docs_expanded = [docs[i] for i in top_k_indices_expanded]

# Print the results
print("Original query: ", query)
print("Top-k documents based on original query: ", top_k_docs)
print("Expanded query: ", expanded_query)
print("Top-k documents based on expanded query: ", top_k_docs_expanded)


Original query:  vaccine for covid19
Top-k documents based on original query:  [' ']
Expanded query:  vaccine for covid19 we
Top-k documents based on expanded query:  [' ']
