# Documents Similarity

## Read Files

In [1]:
# Path of the JSONL corpus used by the whole notebook.
# Uncomment one of the alternatives below to switch to a different dataset.
path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSONL file and return the values of every record as nested lists.

    Each non-blank line is parsed as one JSON object; the object's values
    (in key order) become one row of the result.

    Parameters
    ----------
    path_name : str
        Path of the JSONL file to read.

    Returns
    -------
    list[list]
        One inner list per record. Empty list when the file has no records.

    Notes
    -----
    The rows go through ``np.vstack``, so every record must hold the same
    number of values, and NumPy coerces the values to a common dtype.
    """
    rows = []
    # Be explicit about the encoding so the result does not depend on the
    # platform default (e.g. cp1252 on Windows).
    with open(path_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines, typically a trailing newline at end of file.
            if not line.strip():
                continue
            record = json.loads(line)
            rows.append(np.array(list(record.values())))

    # np.vstack raises on an empty sequence, so handle the empty file here.
    if not rows:
        return []

    # Stack the per-record arrays into one 2-D array, then back to nested lists.
    return np.vstack(rows).tolist()

# Load the raw records of the configured corpus as a list of lists.
# NOTE(review): `documents` is not referenced again in the visible part of the
# notebook (tokenize() re-reads the file itself) -- confirm it is still needed.
documents = readFile(path_name_documents)


In [3]:
import time

def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the original arguments and its
    return value is passed through unchanged. After each successful call a
    line ``Execution time: <seconds> seconds`` is printed.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same name, docstring and signature metadata as
        ``func``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__/__doc__ of the timed function
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() can jump and is coarser.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Execution time: {elapsed:.5f} seconds")
        return result
    return wrapper

## Tokenized

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# English stop-word set from the NLTK corpus.
# NOTE(review): this set is not used by tokenize() or stemmingLemming() in the
# visible code, so stop words are currently NOT filtered out -- confirm intent.
stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Normalise tokens by lemmatising and then stemming each of them.

    Parameters
    ----------
    filtered_tokens : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        One normalised token per input token, in the same order.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    # First pass: WordNet lemmatisation (no part-of-speech argument is given).
    lemmas = [wordnet.lemmatize(tok) for tok in filtered_tokens]

    # Second pass: Porter stemming applied to the lemmas.
    return list(map(porter.stem, lemmas))
    
 
    

def tokenize(path_name):
    """Tokenise the ``text`` field of every record in a JSONL file.

    For each non-blank line the JSON object is parsed, its ``'text'`` value
    is lower-cased, stripped of ASCII punctuation, split into tokens with
    NLTK's ``word_tokenize`` and normalised with ``stemmingLemming``.
    Digits are kept (number removal is deliberately disabled).

    Parameters
    ----------
    path_name : str
        Path of the JSONL file; every record must contain a ``'text'`` key.

    Returns
    -------
    list[list[str]]
        One list of normalised tokens per document, in file order.
    """
    # Build the punctuation-stripping table once, not once per document.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []
    # Be explicit about the encoding so tokenisation does not depend on the
    # platform default (e.g. cp1252 on Windows).
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines, typically a trailing newline at end of file.
            if not line.strip():
                continue

            doc = json.loads(line)
            text = doc['text'].lower().translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


# Tokenise every document of the configured corpus (one token list per document).
tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Fit a TF-IDF model on pre-tokenised documents.

    Parameters
    ----------
    tokenized_docs : iterable of iterable of str
        One token sequence per document.

    Returns
    -------
    tuple
        ``(tfidf_matrix, feature_names, vectorizer)``: the matrix returned by
        ``fit_transform``, the vocabulary from ``get_feature_names_out`` and
        the fitted ``TfidfVectorizer`` itself.
    """
    # The vectorizer works on strings, so glue every token list back together.
    corpus = list(map(' '.join, tokenized_docs))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    return tfidf_matrix, feature_names, vectorizer
    
        

# Build the TF-IDF representation of the corpus; the fitted vectorizer is kept
# alongside the matrix and the feature names.
tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

In [6]:
# Text dump of the TF-IDF matrix: each printed line is "(row, column)  weight".
print(tfidf_matrix_docs)

  (0, 950)	0.035053147631515756
  (0, 1636)	0.036605667094734645
  (0, 1293)	0.040517703702157036
  (0, 1964)	0.029942793834867684
  (0, 820)	0.036605667094734645
  (0, 838)	0.016053724382224493
  (0, 764)	0.05117107267586202
  (0, 548)	0.028546704572561518
  (0, 1611)	0.04311211015368888
  (0, 1809)	0.03033895676132729
  (0, 1414)	0.02940833019397519
  (0, 478)	0.026994185109342633
  (0, 1411)	0.026994185109342633
  (0, 2161)	0.04645688180567354
  (0, 2051)	0.04645688180567354
  (0, 658)	0.035053147631515756
  (0, 317)	0.035053147631515756
  (0, 603)	0.13937064541702063
  (0, 13)	0.10234214535172403
  (0, 468)	0.05117107267586202
  (0, 1838)	0.03368372841331195
  (0, 835)	0.04311211015368888
  (0, 1658)	0.04311211015368888
  (0, 25)	0.05117107267586202
  (0, 1659)	0.05117107267586202
  :	:
  (58, 1238)	0.04665116583828504
  (58, 1908)	0.09330233167657008
  (58, 1393)	0.06245547533281087
  (58, 1281)	0.04528434767860186
  (58, 1099)	0.05004395333652341
  (58, 619)	0.04064920365317616
 

## Cosine Similarity

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute pairwise cosine similarity between all documents.

    Parameters
    ----------
    tfidf_matrix
        Document-by-feature matrix accepted by ``cosine_similarity``.

    Returns
    -------
    tuple
        ``(sim_table, cos_sim)``: a DataFrame whose rows and columns are
        labelled ``'Doc 1'``, ``'Doc 2'``, ... and the raw similarity matrix.
    """
    # Pairwise cosine similarity between the documents.
    cos_sim = cosine_similarity(tfidf_matrix)

    # 1-based, human-readable labels shared by the rows and the columns.
    labels = ['Doc ' + str(k) for k in range(1, cos_sim.shape[0] + 1)]
    sim_table = pd.DataFrame(cos_sim, index=labels, columns=labels)

    return sim_table, cos_sim

# Compute the similarity matrix once; keep both the labelled table (for
# display) and the raw matrix (used by the algorithms below).
cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 50,Doc 51,Doc 52,Doc 53,Doc 54,Doc 55,Doc 56,Doc 57,Doc 58,Doc 59
Doc 1,1.0,0.116785,0.116682,0.065408,0.185082,0.063252,0.114855,0.131666,0.082319,0.10333,...,0.158555,0.113504,0.183942,0.0,0.0,0.0,0.0,0.0,0.108963,0.101689
Doc 2,0.116785,1.0,0.293539,0.222886,0.172103,0.138513,0.112282,0.165781,0.22244,0.171459,...,0.258898,0.114252,0.159238,0.0,0.0,0.0,0.0,0.0,0.161876,0.175653
Doc 3,0.116682,0.293539,1.0,0.181477,0.235975,0.105975,0.114559,0.155222,0.17226,0.145168,...,0.23306,0.160828,0.175988,0.0,0.0,0.0,0.0,0.0,0.185843,0.185424
Doc 4,0.065408,0.222886,0.181477,1.0,0.097654,0.06361,0.063011,0.093672,0.162503,0.115923,...,0.161168,0.089334,0.118018,0.0,0.0,0.0,0.0,0.0,0.094309,0.136627
Doc 5,0.185082,0.172103,0.235975,0.097654,1.0,0.096868,0.139804,0.1329,0.14999,0.128149,...,0.193689,0.128157,0.174898,0.0,0.0,0.0,0.0,0.0,0.179021,0.182024
Doc 6,0.063252,0.138513,0.105975,0.06361,0.096868,1.0,0.046924,0.094784,0.082026,0.094177,...,0.14122,0.067741,0.098841,0.0,0.0,0.0,0.0,0.0,0.125477,0.111974
Doc 7,0.114855,0.112282,0.114559,0.063011,0.139804,0.046924,1.0,0.135525,0.097209,0.099001,...,0.18463,0.084729,0.173973,0.0,0.0,0.0,0.0,0.0,0.090519,0.08134
Doc 8,0.131666,0.165781,0.155222,0.093672,0.1329,0.094784,0.135525,1.0,0.109231,0.147724,...,0.271639,0.089889,0.125835,0.0,0.0,0.0,0.0,0.0,0.152414,0.124312
Doc 9,0.082319,0.22244,0.17226,0.162503,0.14999,0.082026,0.097209,0.109231,1.0,0.118235,...,0.165711,0.099433,0.14594,0.0,0.0,0.0,0.0,0.0,0.139159,0.140329
Doc 10,0.10333,0.171459,0.145168,0.115923,0.128149,0.094177,0.099001,0.147724,0.118235,1.0,...,0.236668,0.102157,0.174861,0.0,0.0,0.0,0.0,0.0,0.16592,0.165466


## Sequential algorithms

In [8]:
# Two documents count as "similar" when their cosine similarity is strictly
# greater than this value (the functions below compare with `>`).
threshold = 0.2

### For each document create pairs of similar documents

In [9]:
@time_it
def sequential_pair_similar_documents(cos_sim, threshold):
    """List every pair of documents whose similarity exceeds ``threshold``.

    Only the strict upper triangle of ``cos_sim`` is scanned, so each pair is
    reported once and a document is never paired with itself.

    Parameters
    ----------
    cos_sim
        Square similarity matrix indexable as ``cos_sim[i, j]``.
    threshold : float
        Strict lower bound on the similarity.

    Returns
    -------
    list[tuple[int, int]]
        1-based ``(i, j)`` pairs with ``i < j``, in row-major order.
    """
    n_docs = cos_sim.shape[0]
    return [
        (row + 1, col + 1)
        for row in range(n_docs)
        for col in range(row + 1, n_docs)
        if cos_sim[row, col] > threshold
    ]


#sim_pairs = sequential_pair_similar_documents(cos_sim, threshold)
#print(sim_pairs)

In [10]:
@time_it
def sequential_pair_similar_documents2(cos_sim, threshold):
    """Vectorised search for document pairs whose similarity exceeds ``threshold``.

    Produces the same pairs, in the same row-major order, as
    ``sequential_pair_similar_documents``.

    Parameters
    ----------
    cos_sim
        Square similarity matrix.
    threshold : float
        Strict lower bound on the similarity.

    Returns
    -------
    list[tuple[int, int]]
        1-based ``(i, j)`` pairs with ``i < j``, as plain Python ints.
    """
    # Keep only the strict upper triangle (k=1) so the diagonal and the
    # mirrored lower half are discarded by NumPy rather than in a Python loop.
    above_threshold = np.triu(cos_sim > threshold, k=1)

    # .tolist() converts the indices to plain ints, matching the loop version.
    return [(i + 1, j + 1) for i, j in np.argwhere(above_threshold).tolist()]


#sim_pairs = sequential_pair_similar_documents2(cos_sim, threshold)
#print(sim_pairs)

### For each document calculate the number of similar documents

In [11]:
@time_it
def sequential_count_similar_documents(cos_sim, threshold):
    """Count, for every document, the later documents that are similar to it.

    For document ``i`` only documents ``j > i`` are inspected, so a similar
    pair is counted once, on the lower-numbered document.

    Parameters
    ----------
    cos_sim
        Square similarity matrix indexable as ``cos_sim[i, j]``.
    threshold : float
        Strict lower bound on the similarity.

    Returns
    -------
    tuple
        ``(num_similar, threshold)``: the list of counts (one per document)
        and the threshold passed in, unchanged.
    """
    n_docs = cos_sim.shape[0]
    num_similar = [
        sum(1 for col in range(row + 1, n_docs) if cos_sim[row, col] > threshold)
        for row in range(n_docs)
    ]
    return num_similar, threshold

# The function hands back the threshold it received, so `threshold` is rebound
# to the same value here.
num_similar, threshold = sequential_count_similar_documents(cos_sim, threshold)
#print(num_similar)


Execution time: 0.00101 seconds


### For each document create a list with similar documents

In [12]:
@time_it
def sequential_similar_documents(cos_sim, threshold):
    """Collect, for every document, the labels of the later similar documents.

    For document ``i`` only documents ``j > i`` are inspected.

    Parameters
    ----------
    cos_sim
        Square similarity matrix indexable as ``cos_sim[i, j]``.
    threshold : float
        Strict lower bound on the similarity.

    Returns
    -------
    list[list[str]]
        One list per document with labels of the form ``'Doc <j+1>'``.
    """
    n_docs = cos_sim.shape[0]
    return [
        [f"Doc {col+1}" for col in range(row + 1, n_docs) if cos_sim[row, col] > threshold]
        for row in range(n_docs)
    ]

# One list of similar-document labels per document, aligned with num_similar.
similar_docs = sequential_similar_documents(cos_sim, threshold)


Execution time: 0.00100 seconds


### Table of final results

In [13]:
def create_similar_table(similar_docs, num_similar, threshold):
    """Assemble the per-document results into a summary DataFrame.

    Parameters
    ----------
    similar_docs : list[list[str]]
        Labels of the similar documents, one list per document.
    num_similar : list[int]
        Number of similar documents, one entry per document.
    threshold : float
        Threshold used; repeated on every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Documents``, ``Number of similar documents``,
        ``Similar Documents`` (comma-separated labels) and ``Threshold``.

    Notes
    -----
    Sets the global pandas option ``display.max_rows`` to ``None`` as a side
    effect, so the whole table is shown when displayed.
    """
    columns = {
        "Documents": [f"Doc {k}" for k in range(1, len(num_similar) + 1)],
        "Number of similar documents": num_similar,
        "Similar Documents": list(map(", ".join, similar_docs)),
        "Threshold": threshold,
    }
    similar_table = pd.DataFrame(columns)

    # Global display setting: do not truncate rows when rendering.
    pd.set_option('display.max_rows', None)
    return similar_table

# Combine the counts and the lists of similar documents into one table and show it.
similar_table = create_similar_table(similar_docs, num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Similar Documents,Threshold
0,Doc 1,1,Doc 43,0.2
1,Doc 2,8,"Doc 3, Doc 4, Doc 9, Doc 30, Doc 33, Doc 34, D...",0.2
2,Doc 3,6,"Doc 5, Doc 16, Doc 30, Doc 34, Doc 40, Doc 50",0.2
3,Doc 4,0,,0.2
4,Doc 5,1,Doc 16,0.2
5,Doc 6,1,Doc 36,0.2
6,Doc 7,1,Doc 17,0.2
7,Doc 8,1,Doc 50,0.2
8,Doc 9,0,,0.2
9,Doc 10,7,"Doc 16, Doc 30, Doc 32, Doc 33, Doc 34, Doc 49...",0.2


# Clustering Experiments

In [14]:
def divide_documents(cos_sim_table, threshold):
    """Greedily group documents into clusters using the similarity table.

    Rows are visited in table order. Each document that is not yet assigned
    starts a new cluster and pulls in every still-unassigned document whose
    similarity to it is at least ``threshold``; the seed document itself is
    appended last.

    Parameters
    ----------
    cos_sim_table : pandas.DataFrame
        Square similarity table whose index and columns use the same labels.
    threshold : float
        Inclusive lower bound on the similarity (``>=``).

    Returns
    -------
    dict[str, list]
        ``'Cluster <n>'`` mapped to the list of document labels in it.
    """
    clusters = {}
    # Documents that already belong to some cluster.
    assigned_docs = set()

    for i, row in cos_sim_table.iterrows():
        if i in assigned_docs:
            continue

        cluster = []
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        for j, sim in row.items():
            # Skip the document itself and anything already clustered.
            if j != i and j not in assigned_docs and sim >= threshold:
                cluster.append(j)
                assigned_docs.add(j)

        # The seed document goes last in its own cluster.
        cluster.append(i)
        assigned_docs.add(i)
        clusters[f'Cluster {len(clusters)+1}'] = cluster

    return clusters


#clusters = divide_documents(cos_sim_table, 0.2)
#clusters


In [15]:
def find_similar_documents(cos_sim_table, doc_name, similarity_threshold):
    """Count the entries in a document's column that exceed a threshold.

    Parameters
    ----------
    cos_sim_table : pandas.DataFrame
        Similarity table with one column per document.
    doc_name : str
        Column label of the document of interest, e.g. ``'Doc 1'``.
    similarity_threshold : float
        Strict lower bound on the similarity.

    Returns
    -------
    int
        Number of values in the column greater than the threshold. Every
        entry of the column is counted, including the document's own.
    """
    # Similarity of the chosen document to every document in the table.
    scores = cos_sim_table[doc_name]

    return sum(1 for score in scores if score > similarity_threshold)

#similar_docs_count = find_similar_documents(cos_sim_table, 'Doc 1', 0.2)
#print(similar_docs_count-1)

# Parallel Algorithm

In [114]:
import os
import sys

# Run the Spark driver and workers with the interpreter that runs this kernel.
# This replaces a hard-coded, user-specific absolute path, so the notebook is
# portable and the driver and worker Python versions always match.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [115]:
# Same path as the one configured at the top of the notebook, re-declared here.
# NOTE(review): keep the two assignments in sync, or drop this one.
path_name_documents = './Databases/prova/prova50.jsonl'

In [62]:
def get_row_values_tfidf(tfidf_matrix, doc_id):
    """Extract the non-zero TF-IDF entries of one document.

    Parameters
    ----------
    tfidf_matrix
        Sparse document-by-feature matrix supporting row indexing and
        ``.toarray()``.
    doc_id : int
        Row index of the document.

    Returns
    -------
    tuple
        ``(doc_id, row_values, num_docs)`` where ``row_values`` is a list of
        ``(feature_index, weight)`` pairs for the non-zero weights, ordered
        by feature index, and ``num_docs`` is the number of rows in the matrix.
    """
    num_docs = tfidf_matrix.shape[0]

    # Densify only the requested row; converting the whole matrix just to
    # read one row costs memory proportional to documents x vocabulary.
    row_array = tfidf_matrix[doc_id].toarray().ravel()

    # Keep only the non-zero weights as (feature index, weight) pairs.
    row_values = [(index, value) for (index, value) in enumerate(row_array) if value != 0.0]

    return doc_id, row_values, num_docs


# Sample input for the map step: the non-zero TF-IDF entries of document 0.
doc_id, row_values, num_docs  = get_row_values_tfidf(tfidf_matrix_docs,0)





### Map Function

In [110]:
def my_map(doc_id, row_values):
    """Emit one record per entry of a document's row.

    Parameters
    ----------
    doc_id
        Identifier of the document.
    row_values : list of (key, value) pairs
        The document's row entries.

    Returns
    -------
    list
        One ``(key, (doc_id, row_values))`` tuple per entry of
        ``row_values``; every tuple carries the same full-row payload.
    """
    payload = (doc_id, row_values)

    emitted = []
    for key, _value in row_values:
        # The value is not needed here: the full row travels in the payload.
        emitted.append((key, payload))
    return emitted
    

# Run the map step locally on the sample document to inspect its output.
map_results = my_map(doc_id, row_values) 





[(2,
  (0,
   [(2, 0.04645688180567354),
    (13, 0.10234214535172403),
    (19, 0.09291376361134708),
    (24, 0.05117107267586202),
    (25, 0.05117107267586202),
    (37, 0.05117107267586202),
    (38, 0.05117107267586202),
    (53, 0.05117107267586202),
    (55, 0.040517703702157036),
    (56, 0.04645688180567354),
    (66, 0.04311211015368888),
    (70, 0.05117107267586202),
    (72, 0.05117107267586202),
    (75, 0.09291376361134708),
    (83, 0.15351321802758605),
    (96, 0.05117107267586202),
    (97, 0.05117107267586202),
    (98, 0.09291376361134708),
    (108, 0.04645688180567354),
    (109, 0.05117107267586202),
    (110, 0.05117107267586202),
    (113, 0.10234214535172403),
    (114, 0.04645688180567354),
    (117, 0.05117107267586202),
    (129, 0.05117107267586202),
    (166, 0.05117107267586202),
    (172, 0.04311211015368888),
    (177, 0.040517703702157036),
    (192, 0.031350605435416436),
    (221, 0.13068938924721452),
    (249, 0.05117107267586202),
    (272, 0.0

In [117]:
from pyspark import SparkContext

# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = SparkContext.getOrCreate()

# Single-record dataset: only the sample document extracted above.
data = [(doc_id, row_values)]
rdd = sc.parallelize(data)
# NOTE(review): my_map returns a list, so `map` yields one list per input
# record; if the reduce step expects individual (key, value) records,
# `flatMap` is probably what is wanted -- confirm against my_reduce.
my_map_result_rdd = rdd.map(lambda x: my_map(x[0], x[1]))
my_map_result_rdd.collect()

PythonRDD[64] at RDD at PythonRDD.scala:53


### Reduce Function

In [None]:

def my_reduce(map_results):