# Documents Similarity

## Read Files

In [1]:
# Path of the JSONL corpus; readFile() and tokenize() below are called with it.
# The commented-out lines are alternative datasets: enable exactly one.
path_name_documents = './Databases/prova/prova2.jsonl'
#path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSONL file and return the values of every record as a list of rows.

    Args:
        path_name: Path of a JSON Lines file (one JSON object per line).

    Returns:
        A list with one inner list per record, holding that record's values
        in key order. An empty list is returned when the file has no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If the records do not all have the same number of values
            (raised by ``np.vstack``).
    """
    # Stream the file rather than materialising every line first. Blank lines
    # (for example a trailing empty line) are not records and are skipped.
    # The encoding is explicit so the result does not depend on the platform
    # default.
    with open(path_name, 'r', encoding='utf-8') as f:
        dicts = [json.loads(line) for line in f if line.strip()]

    # np.vstack raises on an empty sequence, so an empty file is handled here.
    if not dicts:
        return []

    # Stack the per-record value arrays into one 2-D array, then convert it
    # back to nested Python lists. The NumPy round trip is kept so that rows
    # come out exactly as before (values share NumPy's common dtype).
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])
    return arrays.tolist()

# Raw record values of the corpus, one row per JSONL line.
# NOTE(review): `documents` is not read by any later cell visible in this notebook.
documents = readFile(path_name_documents)


In [3]:
import time

def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and return value as ``func``
        that prints ``Execution time: <seconds> seconds`` after each
        successful call. Nothing is printed when ``func`` raises.
    """
    import functools  # local import: no import cell of the notebook provides it

    @functools.wraps(func)  # keep the wrapped function's __name__ / __doc__
    def wrapper(*args, **kwargs):
        # perf_counter() is the high-resolution clock intended for measuring
        # short intervals; time.time() is wall-clock time and can be too coarse
        # to register fast calls (the notebook output shows 0.00000 seconds).
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Execution time: {elapsed:.5f} seconds")
        return result
    return wrapper

## Tokenized

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# English stop-word list from NLTK, stored as a set.
# NOTE(review): no visible cell reads `stop_words`; tokenize() does not filter
# stop words. Confirm whether filtering was intended.
stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Normalise tokens by lemmatising each one and then stemming the lemma.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        A new list with one normalised token per input token, in input order.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    # Lemmatise first, then stem the resulting lemma, token by token.
    return [porter.stem(wordnet.lemmatize(word)) for word in filtered_tokens]
    
 
    

def tokenize(path_name):
    """Tokenize the ``text`` field of every record in a JSONL file.

    Each text is lower-cased, stripped of the characters in
    ``string.punctuation``, split with NLTK's ``word_tokenize`` and then
    passed through ``stemmingLemming``. Digits are kept.

    Args:
        path_name: Path of a JSON Lines file whose records have a ``text`` key.

    Returns:
        A list of token lists, one per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` key.
    """
    # The translation table does not depend on the document, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []
    # Explicit encoding keeps the result independent of the platform default;
    # iterating the handle avoids loading every line into memory first.
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue

            doc = json.loads(line)
            text = doc['text'].lower()
            text = text.translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


# One list of normalised tokens per document; this is the input of calculateTFIDF().
tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Build a TF-IDF matrix from already tokenized documents.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, vectorizer)``: the matrix
        returned by ``fit_transform``, the names returned by
        ``get_feature_names_out`` and the fitted vectorizer itself.
    """
    # The vectorizer is fed strings, so each token list is re-joined with
    # single spaces first.
    # NOTE(review): TfidfVectorizer runs with default settings and therefore
    # tokenizes these strings again. Confirm that no token produced by
    # tokenize() is lost in that step.
    corpus = [' '.join(tokens) for tokens in tokenized_docs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    return tfidf_matrix, vectorizer.get_feature_names_out(), vectorizer
    
        

# TF-IDF matrix of the corpus (one row per document), its feature names and the fitted vectorizer.
tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

In [6]:
# Show the stored entries of the matrix as "(row, column)  value" lines.
print(tfidf_matrix_docs)

  (0, 14)	0.2772169812630457
  (0, 0)	0.2772169812630457
  (0, 13)	0.19724216536132688
  (0, 10)	0.2772169812630457
  (0, 5)	0.2772169812630457
  (0, 1)	0.2772169812630457
  (0, 9)	0.2772169812630457
  (0, 18)	0.19724216536132688
  (0, 7)	0.2772169812630457
  (0, 17)	0.2772169812630457
  (0, 4)	0.2772169812630457
  (0, 16)	0.2772169812630457
  (0, 19)	0.2772169812630457
  (0, 12)	0.2772169812630457
  (1, 3)	0.35327776613855977
  (1, 6)	0.35327776613855977
  (1, 2)	0.35327776613855977
  (1, 20)	0.35327776613855977
  (1, 15)	0.35327776613855977
  (1, 8)	0.35327776613855977
  (1, 11)	0.35327776613855977
  (1, 13)	0.2513600402461016
  (1, 18)	0.2513600402461016


## Cosine Similarity

In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute the pairwise cosine similarity between all documents.

    Args:
        tfidf_matrix: Document-by-feature matrix, one row per document.

    Returns:
        A tuple ``(sim_table, cos_sim)``: a DataFrame whose rows and columns
        are labelled ``Doc 1`` .. ``Doc N``, and the raw similarity matrix
        the table was built from.
    """
    # Cosine similarity of every document against every other document.
    cos_sim = cosine_similarity(tfidf_matrix)

    # The same 1-based labels are used for both axes of the table.
    labels = ['Doc ' + str(number) for number in range(1, cos_sim.shape[0] + 1)]
    sim_table = pd.DataFrame(cos_sim, columns=labels, index=labels)

    return sim_table, cos_sim

# Keep both the labelled table (for display) and the raw matrix (used by the sequential algorithms).
cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2
Doc 1,1.0,0.099158
Doc 2,0.099158,1.0


## Sequential algorithms

In [8]:
# Similarity cut-off: the sequential functions below treat a pair as similar when its score is strictly greater than this.
threshold = 0.2

### For each document create pairs of similar documents

In [9]:
@time_it
def sequential_pair_similar_documents(cos_sim, threshold):
    """List every pair of documents whose similarity exceeds ``threshold``.

    Args:
        cos_sim: Square similarity matrix indexed as ``cos_sim[i, j]``.
        threshold: A pair is kept when its similarity is strictly greater.

    Returns:
        A list of ``(i, j)`` tuples with 1-based document numbers and
        ``i < j``, so each unordered pair appears once.
    """
    n_docs = cos_sim.shape[0]
    # Only the cells above the diagonal are visited (second > first).
    return [
        (first + 1, second + 1)
        for first in range(n_docs)
        for second in range(first + 1, n_docs)
        if cos_sim[first, second] > threshold
    ]


#sim_pairs = sequential_pair_similar_documents(cos_sim, threshold)
#print(sim_pairs)

In [10]:
@time_it
def sequential_pair_similar_documents2(cos_sim, threshold):
    """Vectorised variant of ``sequential_pair_similar_documents``.

    Args:
        cos_sim: Square NumPy similarity matrix.
        threshold: A pair is kept when its similarity is strictly greater.

    Returns:
        A list of ``(i, j)`` tuples of plain ``int`` 1-based document numbers
        with ``i < j``, in row-major order.
    """
    # Keep only the cells strictly above the diagonal (k=1 drops the diagonal
    # itself). Self-pairs and mirrored pairs are discarded inside NumPy
    # instead of being filtered one by one in Python.
    above_threshold = np.triu(cos_sim > threshold, k=1)

    # argwhere yields (row, column) positions in row-major order. They are
    # converted to plain ints so the result matches the loop-based variant.
    return [(int(i) + 1, int(j) + 1) for i, j in np.argwhere(above_threshold)]


#sim_pairs = sequential_pair_similar_documents2(cos_sim, threshold)
#print(sim_pairs)

### For each document calculate the number of similar documents

In [11]:
@time_it
def sequential_count_similar_documents(cos_sim, threshold):
    """Count, for each document, the later documents that are similar to it.

    Only documents with a higher index are compared (``j > i``), so the last
    document always gets a count of 0.

    Args:
        cos_sim: Square similarity matrix indexed as ``cos_sim[i, j]``.
        threshold: A document counts when its similarity is strictly greater.

    Returns:
        A tuple ``(num_similar, threshold)``: the per-document counts and the
        threshold that was passed in, unchanged.
    """
    n_docs = cos_sim.shape[0]
    num_similar = [
        sum(1 for j in range(i + 1, n_docs) if cos_sim[i, j] > threshold)
        for i in range(n_docs)
    ]
    return num_similar, threshold

# The function returns the threshold it received, so `threshold` is rebound to the same value here.
num_similar, threshold = sequential_count_similar_documents(cos_sim, threshold)
#print(num_similar)


Execution time: 0.00000 seconds


### For each document create a list with similar documents

In [12]:
@time_it
def sequential_similar_documents(cos_sim, threshold):
    """List, for each document, the labels of the later documents similar to it.

    Only documents with a higher index are compared (``j > i``).

    Args:
        cos_sim: Square similarity matrix indexed as ``cos_sim[i, j]``.
        threshold: A document is listed when its similarity is strictly greater.

    Returns:
        A list with one entry per document; each entry is a list of
        ``"Doc N"`` labels (1-based).
    """
    n_docs = cos_sim.shape[0]
    return [
        [f"Doc {j+1}" for j in range(i + 1, n_docs) if cos_sim[i, j] > threshold]
        for i in range(n_docs)
    ]

# Per-document lists of "Doc N" labels, consumed by create_similar_table() below.
similar_docs = sequential_similar_documents(cos_sim, threshold)


Execution time: 0.00000 seconds


### Table of final results

In [13]:
def create_similar_table(similar_docs, num_similar, threshold):
    """Assemble the per-document results into one summary DataFrame.

    Args:
        similar_docs: One list of document labels per document.
        num_similar: One count per document; its length sets the row count.
        threshold: Value repeated on every row of the ``Threshold`` column.

    Returns:
        A DataFrame with the columns ``Documents``,
        ``Number of similar documents``, ``Similar Documents`` and
        ``Threshold``.
    """
    columns = {
        "Documents": [f"Doc {position}" for position, _ in enumerate(num_similar, start=1)],
        "Number of similar documents": num_similar,
        # Each list of labels is flattened into one comma-separated string.
        "Similar Documents": [", ".join(names) for names in similar_docs],
        "Threshold": threshold,
    }
    similar_table = pd.DataFrame(columns)

    # Side effect: lifts pandas' global row display limit for the whole session.
    pd.set_option('display.max_rows', None)

    return similar_table

# Final summary: one row per document with its count, its similar documents and the threshold used.
similar_table = create_similar_table(similar_docs, num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Similar Documents,Threshold
0,Doc 1,0,,0.2
1,Doc 2,0,,0.2


# Clustering Experiments

In [14]:
def divide_documents(cos_sim_table, threshold):
    """Greedily group documents into clusters using the similarity table.

    Rows are visited in order. Each document not yet assigned starts a new
    cluster, which collects every other unassigned document whose similarity
    to it is at least ``threshold``; the starting document is appended last.

    Args:
        cos_sim_table: Square DataFrame of similarities whose index and
            columns carry the same document labels.
        threshold: Minimum similarity (inclusive) for joining a cluster.

    Returns:
        A dict mapping ``'Cluster N'`` (1-based, in creation order) to the
        list of document labels in that cluster.
    """
    clusters = {}
    # Documents that already belong to some cluster.
    assigned_docs = set()

    for i, row in cos_sim_table.iterrows():
        if i not in assigned_docs:
            cluster = []
            # Series.items() replaces Series.iteritems(), which was removed
            # in pandas 2.0.
            for j, sim in row.items():
                # Skip the document itself and documents already clustered.
                if j != i and j not in assigned_docs and sim >= threshold:
                    cluster.append(j)
                    assigned_docs.add(j)
            cluster.append(i)
            assigned_docs.add(i)
            clusters[f'Cluster {len(clusters)+1}'] = cluster
    return clusters


#clusters = divide_documents(cos_sim_table, 0.2)
#clusters


In [15]:
def find_similar_documents(cos_sim_table, doc_name, similarity_threshold):
    """Count the entries of one similarity column that exceed a threshold.

    The document's own entry is not excluded, so it is part of the count
    whenever its self-similarity is above the threshold.

    Args:
        cos_sim_table: Similarity table, indexable by document label.
        doc_name: Column label of the document to inspect.
        similarity_threshold: An entry counts when it is strictly greater.

    Returns:
        The number of entries in the column above the threshold.
    """
    # Similarity column of the requested document.
    doc_sim_vector = cos_sim_table[doc_name]

    return sum(1 for score in doc_sim_vector if score > similarity_threshold)

#similar_docs_count = find_similar_documents(cos_sim_table, 'Doc 1', 0.2)
#print(similar_docs_count-1)

# Parallel Algorithm

In [16]:
import os
import sys

# Point both the Spark workers and the driver at the interpreter running this
# kernel. A hard-coded absolute path only works on one user's machine.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [17]:
# NOTE(review): the path is switched to another dataset here, but no later visible cell
# calls readFile()/tokenize() again; the cells below still use `tfidf_matrix_docs`
# built from the earlier path. Confirm whether the pipeline should be re-run.
path_name_documents = './Databases/prova/prova50.jsonl'

In [18]:
# Number of rows (documents) in the TF-IDF matrix.
num_docs = tfidf_matrix_docs.shape[0]

### Take only one row

In [52]:
def get_row_values_tfidf(tfidf_matrix, doc_id):
    """Return the non-zero entries of one row of a sparse TF-IDF matrix.

    Args:
        tfidf_matrix: Sparse matrix with one row per document; it must
            support row indexing and ``toarray()``.
        doc_id: Index of the row to extract.

    Returns:
        A tuple ``(doc_id, row_values, num_docs)`` where ``row_values`` is a
        list of ``(column_index, value)`` pairs for the non-zero entries in
        ascending column order, and ``num_docs`` is the row count.
    """
    num_docs = tfidf_matrix.shape[0]

    # Densify only the requested row. Converting the whole matrix just to
    # read one row costs memory proportional to documents x features.
    row_array = tfidf_matrix[doc_id].toarray().ravel()

    # Keep the (column, value) pairs of the non-zero entries.
    row_values = [(index, value) for (index, value) in enumerate(row_array) if value != 0.0]

    return doc_id, row_values, num_docs


# Extract row 0 as a demo. This also rebinds the notebook-level `doc_id`, `row_values` and `num_docs`.
doc_id, row_values, num_docs  = get_row_values_tfidf(tfidf_matrix_docs,0)
row_values





[(0, 0.2772169812630457),
 (1, 0.2772169812630457),
 (4, 0.2772169812630457),
 (5, 0.2772169812630457),
 (7, 0.2772169812630457),
 (9, 0.2772169812630457),
 (10, 0.2772169812630457),
 (12, 0.2772169812630457),
 (13, 0.19724216536132688),
 (14, 0.2772169812630457),
 (16, 0.2772169812630457),
 (17, 0.2772169812630457),
 (18, 0.19724216536132688),
 (19, 0.2772169812630457)]

### Take all rows

In [112]:
def get_all_row_values_tfidf(tfidf_matrix):
    """Return the non-zero entries of every row of a sparse TF-IDF matrix.

    Args:
        tfidf_matrix: Sparse matrix with one row per document; it must
            support row indexing and ``toarray()``.

    Returns:
        A list of ``(doc_id, row_values)`` tuples in row order, where
        ``row_values`` is a list of ``(column_index, value)`` pairs for the
        non-zero entries of that row in ascending column order.
    """
    num_docs = tfidf_matrix.shape[0]
    row_values_list = []
    for doc_id in range(num_docs):
        # Densify one row at a time. Converting the whole matrix inside the
        # loop repeated the full conversion once per document.
        row_array = tfidf_matrix[doc_id].toarray().ravel()
        row_values = [(index, value) for (index, value) in enumerate(row_array) if value != 0.0]
        row_values_list.append((doc_id, row_values))
    return row_values_list


# (doc_id, row_values) pairs for every document of the corpus.
all_rows_values_list = get_all_row_values_tfidf(tfidf_matrix_docs)
#all_rows_values_list

### Map Function

In [58]:
def my_map(doc_id, row_values):
    """Emit one ``(term_index, (doc_id, row_values))`` pair per row entry.

    Args:
        doc_id: Identifier of the document.
        row_values: List of ``(term_index, value)`` pairs of that document.

    Returns:
        A list with one pair per entry of ``row_values``. Every pair is keyed
        by the entry's term index and carries the same ``(doc_id, row_values)``
        payload, i.e. the whole document.
    """
    payload = (doc_id, row_values)
    return [(term_index, payload) for term_index, _weight in row_values]

# Map output for the single document extracted above (`doc_id`, `row_values`).
map_results_raw = my_map(doc_id, row_values) 
map_results_raw

[(0,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.2772169812630457),
    (9, 0.2772169812630457),
    (10, 0.2772169812630457),
    (12, 0.2772169812630457),
    (13, 0.19724216536132688),
    (14, 0.2772169812630457),
    (16, 0.2772169812630457),
    (17, 0.2772169812630457),
    (18, 0.19724216536132688),
    (19, 0.2772169812630457)])),
 (1,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.2772169812630457),
    (9, 0.2772169812630457),
    (10, 0.2772169812630457),
    (12, 0.2772169812630457),
    (13, 0.19724216536132688),
    (14, 0.2772169812630457),
    (16, 0.2772169812630457),
    (17, 0.2772169812630457),
    (18, 0.19724216536132688),
    (19, 0.2772169812630457)])),
 (4,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.277

In [59]:

def execution_map(num_docs,tfidf_matrix):
    """Run ``my_map`` sequentially on the first ``num_docs`` rows of a matrix.

    Args:
        num_docs: Number of rows to process, starting from row 0.
        tfidf_matrix: Sparse TF-IDF matrix the rows are read from.

    Returns:
        A list with one ``my_map`` result (itself a list) per processed row.
    """
    map_results = []
    for i in range(num_docs):
        # Read from the matrix passed in, not from a notebook-level global.
        # The row count returned by the helper is discarded so it cannot
        # overwrite the `num_docs` argument.
        doc_id, row_values, _ = get_row_values_tfidf(tfidf_matrix, i)
        map_results.append(my_map(doc_id, row_values))
    return map_results

# Sequential map over the whole corpus: one list of emitted pairs per document.
map_results = execution_map(num_docs,tfidf_matrix_docs)    
map_results

[[(0,
   (0,
    [(0, 0.2772169812630457),
     (1, 0.2772169812630457),
     (4, 0.2772169812630457),
     (5, 0.2772169812630457),
     (7, 0.2772169812630457),
     (9, 0.2772169812630457),
     (10, 0.2772169812630457),
     (12, 0.2772169812630457),
     (13, 0.19724216536132688),
     (14, 0.2772169812630457),
     (16, 0.2772169812630457),
     (17, 0.2772169812630457),
     (18, 0.19724216536132688),
     (19, 0.2772169812630457)])),
  (1,
   (0,
    [(0, 0.2772169812630457),
     (1, 0.2772169812630457),
     (4, 0.2772169812630457),
     (5, 0.2772169812630457),
     (7, 0.2772169812630457),
     (9, 0.2772169812630457),
     (10, 0.2772169812630457),
     (12, 0.2772169812630457),
     (13, 0.19724216536132688),
     (14, 0.2772169812630457),
     (16, 0.2772169812630457),
     (17, 0.2772169812630457),
     (18, 0.19724216536132688),
     (19, 0.2772169812630457)])),
  (4,
   (0,
    [(0, 0.2772169812630457),
     (1, 0.2772169812630457),
     (4, 0.2772169812630457),
     

20


### Reduce Function

In [110]:
def my_reduce(t, doc_list, threshold):
    """Return the document pairs with the largest value overlap that are similar.

    Every ordered pair of distinct ids is examined. The overlap of a pair is
    the number of distinct values the two documents have in common. Only
    pairs reaching the largest overlap seen so far are scored with
    ``calculate_similarity``; when a strictly larger overlap appears, the
    results collected so far are dropped.

    Args:
        t: Not used by this function.
        doc_list: Iterable of ``(doc_id, document)`` pairs.
        threshold: Minimum similarity (inclusive) for a pair to be reported.

    Returns:
        A list of ``[id1, id2, similarity]`` entries. Both orderings of a
        pair can appear.
    """
    best_overlap = 0
    matches = []

    for left_id, left_doc in doc_list:
        for right_id, right_doc in doc_list:
            # A document is never compared with itself.
            if left_id == right_id:
                continue

            shared = len(set(left_doc) & set(right_doc))
            if shared < best_overlap:
                continue

            score = calculate_similarity(left_doc, right_doc)
            if shared > best_overlap:
                # New best overlap: earlier results no longer qualify.
                best_overlap = shared
                matches = []
            if score >= threshold:
                matches.append([left_id, right_id, score])

    return matches


In [113]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(doc1, doc2):
    """Return the cosine similarity between two documents given as vectors.

    Args:
        doc1: First document as a sequence of numbers.
        doc2: Second document as a sequence of numbers.

    Returns:
        The single similarity score taken from the pairwise result.
    """
    first = np.array(doc1)
    second = np.array(doc2)

    # Each vector is wrapped as a one-row batch; the only cell of the
    # resulting pairwise matrix is the score for this pair.
    pairwise = cosine_similarity([first], [second])
    return pairwise[0][0]


In [123]:
# Toy input for the three-argument my_reduce: each entry is [doc_id, vector].
t = 2
doc_list = [
    [1, [0, 0, 1, 0]],
    [2, [2, 0, 0, 3]],
    [3, [0, 4, 0, 5]],
    [4, [0, 4, 0, 5]]
]
# Rebinds the notebook-level `threshold` (same value as before).
threshold = 0.2

result = my_reduce(t, doc_list, threshold)
print(result)

[[3, 4, 1.0], [4, 3, 1.0]]


In [55]:
from pyspark import SparkContext

# Input data definition

sc = SparkContext.getOrCreate()

# Build the RDD from the per-document (doc_id, row_values) pairs
data = get_all_row_values_tfidf(tfidf_matrix_docs)
rdd = sc.parallelize(data)

# Apply my_map to every element of the RDD and flatten the emitted lists
my_map_result_rdd = rdd.flatMap(lambda x: my_map(x[0], x[1]))
my_map_result_rdd.collect()


[(0,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.2772169812630457),
    (9, 0.2772169812630457),
    (10, 0.2772169812630457),
    (12, 0.2772169812630457),
    (13, 0.19724216536132688),
    (14, 0.2772169812630457),
    (16, 0.2772169812630457),
    (17, 0.2772169812630457),
    (18, 0.19724216536132688),
    (19, 0.2772169812630457)])),
 (1,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.2772169812630457),
    (9, 0.2772169812630457),
    (10, 0.2772169812630457),
    (12, 0.2772169812630457),
    (13, 0.19724216536132688),
    (14, 0.2772169812630457),
    (16, 0.2772169812630457),
    (17, 0.2772169812630457),
    (18, 0.19724216536132688),
    (19, 0.2772169812630457)])),
 (4,
  (0,
   [(0, 0.2772169812630457),
    (1, 0.2772169812630457),
    (4, 0.2772169812630457),
    (5, 0.2772169812630457),
    (7, 0.277

In [82]:
def my_reduce(t, values):
    """Report similar document pairs whose largest shared element equals ``t``.

    NOTE(review): this definition reuses the name of the three-argument
    ``my_reduce`` and replaces it once the cell runs. It also calls ``sim``,
    which no visible cell defines, and reads ``threshold`` from the notebook
    namespace instead of taking it as a parameter.

    Args:
        t: Key to compare against the largest element shared by a pair.
        values: Iterable of ``(doc_id, document)`` pairs.

    Returns:
        A list of ``[id1, id2, similarity]`` entries. Both orderings of a
        pair can appear.
    """
    def distinct_pairs():
        # Every ordered pair of entries with different ids.
        for id1, d1 in values:
            for id2, d2 in values:
                if id1 != id2:
                    yield id1, d1, id2, d2

    output = []
    for id1, d1, id2, d2 in distinct_pairs():
        shared = set(d1) & set(d2)
        largest_shared = max(shared) if shared else None
        if largest_shared == t:
            score = sim(d1, d2)
            if score >= threshold:
                output.append([id1, id2, score])
    return output



2


In [50]:
def b(document):
    """Return the largest first element found among the entries of ``document``.

    Args:
        document: Non-empty sequence of indexable entries such as
            ``(term, score)`` pairs.

    Returns:
        The maximum of ``entry[0]`` over all entries.
    """
    return max(entry[0] for entry in document)

def Map(doc_id, document):
    """Emit ``(term, [doc_id, document])`` for each term greater than ``b(document)``.

    NOTE(review): ``b`` as defined in this notebook returns the largest term
    of the same document, so the condition cannot hold and the result is
    always empty (the usage example prints ``[]``). Confirm the intended
    boundary.

    Args:
        doc_id: Identifier of the document.
        document: Sequence of ``(term, score)`` pairs.

    Returns:
        A list of ``(term, [doc_id, document])`` pairs.
    """
    return [
        (term, [doc_id, document])
        for term, score in document
        if term > b(document)
    ]

def Reduce(t, values):
    """Report similar document pairs whose largest shared element equals ``t``.

    NOTE(review): ``sim`` is not defined in any visible cell, and
    ``threshold`` is read from the notebook namespace rather than passed in.

    Args:
        t: Key to compare against the largest element shared by a pair.
        values: Iterable of ``(doc_id, document)`` pairs.

    Returns:
        A list of ``[id1, id2, similarity]`` entries. Both orderings of a
        pair can appear.
    """
    output = []
    for id1, d1 in values:
        for id2, d2 in values:
            # A document is never paired with itself.
            if id1 == id2:
                continue

            common = set(d1) & set(d2)
            top_common = max(common) if common else None
            if top_common != t:
                continue

            similarity = sim(d1, d2)
            if similarity >= threshold:
                output.append([id1, id2, similarity])
    return output

# Usage example
input_data = [
    (0, [(0, 0.2772169812630457), (1, 0.2772169812630457)]),
    (1, [(2, 0.35327776613855977), (3, 0.35327776613855977)])
]

output_map = []
output_reduce = []

# Map phase: echo each document, then collect the pairs it emits.
for doc_id, document in input_data:
    print(doc_id)
    print(document)
    output_map.extend(Map(doc_id, document))

# Reduce phase: each emitted (term, value) pair is passed to Reduce on its own.
# NOTE(review): Map emits `[doc_id, document]` as the value, while Reduce iterates
# `values` as a sequence of (id, document) pairs. Confirm that a group-by-term
# step is not missing here.
for term, values in output_map:
    output_reduce.extend(Reduce(term, values))

print(output_reduce)



0
[(0, 0.2772169812630457), (1, 0.2772169812630457)]
1
[(2, 0.35327776613855977), (3, 0.35327776613855977)]
[]
