# Documents Similarity

## Read Files

In [1]:
path_name_documents = './Databases/prova/prova.jsonl'
#path_name_documents = './Databases/prova/prova10.jsonl'
#path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova5000.jsonl' 
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    # Load the JSONL file into a list
    with open(path_name, 'r') as f:
        lines = f.readlines()

    # Convert each JSON object into a dictionary
    dicts = [json.loads(line) for line in lines]

    # Convert the dictionaries into arrays and stack them vertically
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])

    # Convert the arrays into a list of lists
    text = arrays.tolist()
    
    return text

documents = readFile(path_name_documents)


In [3]:
import time

def time_it(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Execution time: {end_time - start_time:.5f} seconds")
        return result
    return wrapper

## Tokenized

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Perform stemming or lemmatization on filtered tokens
    
    filtered_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    filtered_tokens = [stemmer.stem(token) for token in filtered_tokens]

    return filtered_tokens
    
 
    

def tokenize(path_name):
    
    with open(path_name, "r") as f:
        data = f.readlines()

        # Create an empty list to store the tokenized documents
        tokenized_docs = []

        # Loop through each line in the JSONL file
        for line in data:
            # Parse the JSON string into a Python dictionary
            doc = json.loads(line)

            # Extract the text from the dictionary
            text = doc['text']
            text = text.lower()  # Convert to lowercase
            #text = re.sub(r'\d+', '', text)  # Remove all numbers
            text = text.translate(str.maketrans('', '', string.punctuation))  # Remove all punctuation

            # Tokenize the text using NLTK
            tokens = word_tokenize(text)
            tokensStemLem = stemmingLemming(tokens)

            # Add the tokenized document to the list
            tokenized_docs.append(tokensStemLem)

        # Print the tokenized documents
    return tokenized_docs


tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    
    vectorizer = TfidfVectorizer()
    # Fit and transform the tokenized documents into a TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform([' '.join(doc) for doc in tokenized_docs])

    # Get the feature names (tokens)
    feature_names = vectorizer.get_feature_names_out()

    # Return the TF-IDF matrix and the feature names
    return tfidf_matrix, feature_names,vectorizer
    
        

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    # calcoliamo la cosine similarity tra i documenti
    cos_sim = cosine_similarity(tfidf_matrix)

    # creiamo una tabella con le cosine similarity per ogni coppia di documenti
    sim_table = pd.DataFrame(cos_sim, columns=['Doc ' + str(i+1) for i in range(cos_sim.shape[0])], index=['Doc ' + str(i+1) for i in range(cos_sim.shape[0])])
    
    return sim_table, cos_sim

cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5
Doc 1,1.0,0.483146,0.323318,0.323318,0.0
Doc 2,0.483146,1.0,0.0,0.0,0.0
Doc 3,0.323318,0.0,1.0,1.0,0.0
Doc 4,0.323318,0.0,1.0,1.0,0.0
Doc 5,0.0,0.0,0.0,0.0,1.0


## Sequential algorithms

In [7]:
from scipy.sparse import coo_matrix

def extract_document_terms(tfidf_matrix):
    matrix_coo = coo_matrix(tfidf_matrix)
    data = matrix_coo.data
    row = matrix_coo.row
    col = matrix_coo.col

    doc_terms = []
    current_doc = -1
    terms = []

    for i in range(len(data)):
        doc_id = row[i]
        term_id = col[i]
        term_value = data[i]

        if doc_id != current_doc:
            if terms:
                doc_terms.append((current_doc+1, terms))
                terms = []
            current_doc = doc_id

        terms.append((term_id, term_value))

    if terms:
        doc_terms.append((current_doc, terms))

    return doc_terms

doc_info_list = extract_document_terms(tfidf_matrix_docs)
#print(doc_info_list)

In [8]:
def my_map(doc_info):
    mapped_doc = []

    doc_id, terms = doc_info
    
    max_term_id = max(terms, key=lambda x: x[0])[0]
    doc_terms = [(term_id, value) for term_id, value in terms]
    mapped_doc.append((doc_id, max_term_id, doc_terms))

    return mapped_doc

In [9]:
def apply_my_map(doc_info_list):
    total_mapped = []

    for doc_info in doc_info_list:
        mapped = my_map(doc_info)
        total_mapped.extend(mapped)

    return total_mapped



In [35]:
def my_reduce(docs, threshold):
    pairs = []
    docs_list = list(docs)  # Convert the docs iterator to a list

    n_docs = len(docs_list)
    print(n_docs)
    
    for i in range(n_docs - 1):
        for j in range(i + 1, n_docs):
            doc1_id, term_id, doc1 = docs_list[i]
            doc2_id, _, doc2 = docs_list[j]
            
            
            terms_1 = {t_id1: val1 for t_id1, val1 in doc1}
            
            terms_2 = {t_id2: val2 for t_id2, val2 in doc2}
            print("doc1 ",doc1_id)
            print("term1 ",terms_1)
            print("doc2 ",doc2_id)
            print("term2 ",terms_2)

            common_terms = set(terms_1).intersection(terms_2)
            
            print("com",common_terms)
            
            if not common_terms:
                continue

            max_term = max(common_terms)
            print("max ", max_term)
            print("term_id ", term_id)
            print()
            
            
            

            sim = 0.0

            for term in common_terms:
                sim += terms_1[term] * terms_2[term]

            if sim >= threshold:
                if doc2_id == n_docs:
                    doc2_id += 1
                pair = ((doc1_id, doc2_id), sim)
                pairs.append(pair)

    return pairs





In [17]:
mapped_list = apply_my_map(doc_info_list)

In [36]:
pairs = my_reduce(mapped_list,0.1)
pairs

5
doc1  1
term1  {6: 0.7694470729725092, 5: 0.6387105775654869}
doc2  2
term2  {2: 0.7782829228046183, 6: 0.6279137616509933}
com {6}
max  6
term_id  6

doc1  1
term1  {6: 0.7694470729725092, 5: 0.6387105775654869}
doc2  3
term2  {1: 0.6098184563533858, 7: 0.6098184563533858, 5: 0.5062044059286201}
com {5}
max  5
term_id  6

doc1  1
term1  {6: 0.7694470729725092, 5: 0.6387105775654869}
doc2  4
term2  {1: 0.6098184563533858, 7: 0.6098184563533858, 5: 0.5062044059286201}
com {5}
max  5
term_id  6

doc1  1
term1  {6: 0.7694470729725092, 5: 0.6387105775654869}
doc2  4
term2  {4: 0.5773502691896258, 0: 0.5773502691896258, 3: 0.5773502691896258}
com set()
doc1  2
term1  {2: 0.7782829228046183, 6: 0.6279137616509933}
doc2  3
term2  {1: 0.6098184563533858, 7: 0.6098184563533858, 5: 0.5062044059286201}
com set()
doc1  2
term1  {2: 0.7782829228046183, 6: 0.6279137616509933}
doc2  4
term2  {1: 0.6098184563533858, 7: 0.6098184563533858, 5: 0.5062044059286201}
com set()
doc1  2
term1  {2: 0.7782829

[((1, 2), 0.48314640598151465),
 ((1, 3), 0.32331810847686315),
 ((1, 4), 0.32331810847686315),
 ((3, 4), 1.0)]

## Execution

In [13]:
import os
os.environ['PYSPARK_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'

In [14]:
doc_info_list = extract_document_terms(tfidf_matrix_docs)