# Ranking

Second Assignement of Learning with Massive Data. 


In [4]:
import json
import numpy as np
import string

path_name_documents = './Databases/prova/gigi.jsonl'
path_name_query = './Databases/prova/query.jsonl'


def readFile(path_name):
    # Load the JSONL file into a list
    with open(path_name, 'r') as f:
        lines = f.readlines()

    # Convert each JSON object into a dictionary
    dicts = [json.loads(line) for line in lines]

    # Convert the dictionaries into arrays and stack them vertically
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])

    # Convert the arrays into a list of lists
    text = arrays.tolist()
    
    return text

documents = readFile(path_name_documents)
#print(documents)
query = readFile(path_name_query)
#print(query)


# Tokenize

In [5]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Perform stemming or lemmatization on filtered tokens
    
    filtered_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    filtered_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return filtered_tokens
    
 
    

def tokenize(path_name):
    
    with open(path_name, "r") as f:
        data = f.readlines()

        # Create an empty list to store the tokenized documents
        tokenized_docs = []

        # Loop through each line in the JSONL file
        for line in data:
            # Parse the JSON string into a Python dictionary
            doc = json.loads(line)

            # Extract the text from the dictionary
            text = doc['text']
            text = text.lower()  # Convert to lowercase
            #text = re.sub(r'\d+', '', text)  # Remove all numbers
            text = text.translate(str.maketrans('', '', string.punctuation))  # Remove all punctuation

            # Tokenize the text using NLTK
            tokens = word_tokenize(text)
            tokensStemLem = stemmingLemming(tokens)

            # Add the tokenized document to the list
            tokenized_docs.append(tokensStemLem)

        # Print the tokenized documents
    return tokenized_docs

tokenized_docs = tokenize(path_name_documents)


tokenized_query = tokenize(path_name_query)




# TF

In [82]:
import collections
import nltk
import math


def calculateTF(tokenized_docs):
    term_freqs = []
    for doc in tokenized_docs:
        doc_freq = collections.Counter(doc) #number of repetition for each word
       
        total_terms = len(doc) #length for each document
        
        for term in doc_freq:
            doc_freq[term] /= float(total_terms)
        term_freqs.append(doc_freq)

    return term_freqs



def calculateTF2(tokenized_docs):
    term_freqs = []
    for doc in tokenized_docs:
        doc_freq = {}
        total_terms = len(doc)
        for term in doc:
            doc_freq[term] = doc_freq.get(term, 0) + 1
        for term in doc_freq:
            doc_freq[term] /= float(total_terms)
        term_freqs.append(doc_freq)

    return term_freqs



def calculateTFbm25(tokenized_docs):
    term_freqs = []
    k1 = 1.5 # parameter for controlling term frequency normalization
    b = 0.75 # parameter for controlling document length normalization
    avgdl = sum(len(doc) for doc in tokenized_docs) / len(tokenized_docs) # average document length

    for doc in tokenized_docs:
        doc_freq = collections.Counter(doc) #number of repetition for each word
        total_terms = len(doc) #length for each document
        doc_len_norm = ((1 - b) + b * (total_terms / avgdl)) # document length normalization factor

        for term in doc_freq:
            tf = doc_freq[term] / total_terms # term frequency
            tf_norm = ((k1 + 1) * tf) / (k1 * ((1 - b) + b * (total_terms / avgdl)) + tf) # normalized term frequency with BM25 weighting
            doc_freq[term] = tf_norm

        term_freqs.append(doc_freq)

    return term_freqs

    
#term_freqs = calculateTF2(tokenized_docs)
term_freqs = calculateTFbm25(tokenized_docs)


# IDF

In [83]:
import math
import collections
import nltk
from collections import defaultdict

def calculateIDF(tokenized_docs,term_freqs):
    
    #dictionary with the frequency of each term in all documents
    all_terms = [term for doc in tokenized_docs for term in doc]
    
    df = collections.Counter(all_terms)

    # Calculate IDF for each term
    N = len(tokenized_docs)
    
    idf = {}
    
    for term in df:
        n = len([doc for doc in tokenized_docs if term in doc])
        idf[term] = math.log(N / float(df[term] + 1)) #math.log(1 + (N - n + 0.5)/(n + 0.5))
    return idf




def calculateIDFBM25(tokenized_docs):
    #dictionary with the frequency of each term in all documents
    all_terms = [term for doc in tokenized_docs for term in doc]
    
    df = defaultdict(int)

    # Calculate DF for each term
    for term in all_terms:
        df[term] += 1
    # Calculate IDF for each term
    N = len(tokenized_docs)
    
    idf = {}
    
    for term in df:
        n = len([doc for doc in tokenized_docs if term in doc])
        idf[term] = math.log((N - n + 0.5)/(n + 0.5)+1)
    
    return idf




#idf = calculateIDFBM25(tokenized_docs)
idf = calculateIDF(tokenized_docs,term_freqs)






# Step successivi



# TF-IDF 

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

def calculateTFIDF(tokenized_docs):
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the tokenized documents into a TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform([' '.join(doc) for doc in tokenized_docs])

    # Get the feature names (tokens)
    feature_names = vectorizer.get_feature_names()

    # Return the TF-IDF matrix and the feature names
    return tfidf_matrix, feature_names,vectorizer
 
    
def calculateTFIDFQ(tokenized_query,vectorizer):
    # Initialize the TfidfVectorizer
    #vectorizer = TfidfVectorizer()

    # Fit and transform the tokenized documents into a TF-IDF matrix
    tfidf_matrix = vectorizer.transform([' '.join(doc) for doc in tokenized_query])

    # Get the feature names (tokens)
    feature_names = vectorizer.get_feature_names()

    # Return the TF-IDF matrix and the feature names
    return tfidf_matrix, feature_names

    

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)
tfidf_matrix_query, feature_names_query  = calculateTFIDFQ( tokenized_query,vectorizer)

print("TFIDF- DOCUMENTS:")
#print(tfidf_matrix_docs.toarray())
print(tfidf_matrix_docs)

print("TFIDF- QUERY:")
print(tfidf_matrix_query.toarray())
#print(tfidf_matrix_query)




TFIDF- DOCUMENTS:
  (0, 153)	0.03366972268896855
  (0, 74)	0.0473216204390231
  (0, 109)	0.03366972268896855
  (0, 50)	0.0473216204390231
  (0, 144)	0.0473216204390231
  (0, 169)	0.0473216204390231
  (0, 42)	0.0473216204390231
  (0, 21)	0.0473216204390231
  (0, 184)	0.0473216204390231
  (0, 43)	0.0473216204390231
  (0, 165)	0.0473216204390231
  (0, 66)	0.0473216204390231
  (0, 68)	0.0473216204390231
  (0, 100)	0.03366972268896855
  (0, 129)	0.0946432408780462
  (0, 69)	0.0473216204390231
  (0, 36)	0.0473216204390231
  (0, 150)	0.0473216204390231
  (0, 30)	0.0473216204390231
  (0, 60)	0.0473216204390231
  (0, 55)	0.0473216204390231
  (0, 54)	0.0473216204390231
  (0, 93)	0.0473216204390231
  (0, 29)	0.0473216204390231
  (0, 61)	0.0473216204390231
  :	:
  (1, 174)	0.0883131559753662
  (1, 31)	0.0883131559753662
  (1, 134)	0.0883131559753662
  (1, 143)	0.0883131559753662
  (1, 47)	0.0883131559753662
  (1, 181)	0.1766263119507324
  (1, 49)	0.0883131559753662
  (1, 26)	0.1766263119507324
  (

# Dot Product for Sparse Vectors

In [93]:
import numpy as np


def calculate_Dot_Product(tfidf_matrix_docs, tfidf_matrix_query):
    matrix_docs = tfidf_matrix_docs.toarray()
    matrix_query = tfidf_matrix_query.toarray()
    num_queries = matrix_query.shape[0]
    num_docs = matrix_docs.shape[0]
    sparse_score_results = np.empty(num_queries, dtype=np.ndarray)
    
    for query in range(num_queries):
        sparse_score_docs = np.empty(num_docs)
        for doc in range(num_docs):
            dot_result = np.dot(matrix_query[query], matrix_docs[doc])
            sparse_score_docs[doc] = dot_result
        message = f"query: {query}"
        sparse_score_results[query] = np.array([message] + sparse_score_docs.tolist())
        
    for i in range(num_queries):
        print(sparse_score_results[i][0])
        print(sparse_score_results[i][1:])
        print()
        
    return sparse_score_results



dot_result = calculate_Dot_Product(tfidf_matrix_docs,tfidf_matrix_query)

query: 0
['0.25330518925234474' '0.47194902090194']

query: 1
['0.4366901266404883' '0.29403954155250905']

query: 2
['0.05464230059791856' '0.050987624375363076']

query: 3
['0.0473216204390231' '0.0']

query: 4
['0.19597462672499155' '0.056481157909239896']

query: 5
['0.2332726692999068' '0.25499733860601864']

query: 6
['0.23423007096418416' '0.06244683145816671']

query: 7
['0.38444687571856695' '0.3983050336478946']



In [96]:
import numpy as np
import pandas as pd

def tabella(tfidf_matrix_docs, tfidf_matrix_query):
    matrix_docs = tfidf_matrix_docs.toarray()
    matrix_query = tfidf_matrix_query.toarray()
    num_queries = matrix_query.shape[0]
    num_docs = matrix_docs.shape[0]
    sparse_score_results = np.empty((num_queries, num_docs+1), dtype=np.object)
    
    for query in range(num_queries):
        sparse_score_docs = np.empty(num_docs)
        for doc in range(num_docs):
            dot_result = np.dot(matrix_query[query], matrix_docs[doc])
            sparse_score_docs[doc] = dot_result
        message = f"query: {query}"
        sparse_score_results[query, 0] = message
        sparse_score_results[query, 1:] = sparse_score_docs
    
    df = pd.DataFrame(sparse_score_results, columns=["Query"] + ["Doc " + str(i+1) for i in range(num_docs)])
    return df

df = tabella(tfidf_matrix_docs, tfidf_matrix_query)
print("SCORES SPARSE VECTORS")
print(df)

SCORES SPARSE VECTORS
      Query     Doc 1     Doc 2
0  query: 0  0.253305  0.471949
1  query: 1   0.43669   0.29404
2  query: 2  0.054642  0.050988
3  query: 3  0.047322       0.0
4  query: 4  0.195975  0.056481
5  query: 5  0.233273  0.254997
6  query: 6   0.23423  0.062447
7  query: 7  0.384447  0.398305


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  sparse_score_results = np.empty((num_queries, num_docs+1), dtype=np.object)


# Sparse Vector

# Dense Vector

In [14]:
import torch
from transformers import AutoTokenizer, AutoModel

# Step 1: Load the MiniLM-L6-v2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("allenai/all-MiniLM-L6-v2")


# Step 2: Convert the text into tokens using the tokenizer
text = "This is an example sentence."
encoded_input = tokenizer(text, return_tensors="pt")

# Step 3: Pass the encoded input through the MiniLM-L6-v2 model
with torch.no_grad():
    model_output = model(**encoded_input)

# Step 4: Extract the dense vector from the model output
dense_vector = model_output.last_hidden_state.mean(dim=1).squeeze()

print(dense_vector)




OSError: microsoft/minilm-l6-hybrid-512 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

# INSERISCI ARGOMENTO