# Document Similarity

## Read Files

In [1]:
path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSON Lines file and return its records as a list of lists.

    Every non-blank line is parsed as one JSON object, and the values of that
    object (in key order) form one row of the result.

    Args:
        path_name: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one inner list per record. Rows are stacked with
        ``np.vstack`` before conversion, so every record must have the same
        number of fields and mixed-type values are coerced by NumPy to a
        common dtype. An empty list is returned when the file has no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If records have different numbers of fields.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path_name, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) are skipped because
        # json.loads rejects them.
        dicts = [json.loads(line) for line in f if line.strip()]

    if not dicts:
        # np.vstack raises on an empty sequence, so handle "no records" here.
        return []

    # One 1-D array per record, stacked into a 2-D array, then back to lists.
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])
    return arrays.tolist()

documents = readFile(path_name_documents)


## Tokenized

In [3]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Lemmatize every token with WordNet, then stem the result with Porter.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        A new list with one normalised token per input token, in input order.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    # Lemmatization runs first and its output is what gets stemmed.
    return [porter.stem(wordnet.lemmatize(word)) for word in filtered_tokens]
    
 
    

def tokenize(path_name):
    """Tokenize the ``text`` field of every record in a JSON Lines file.

    For each non-blank line the ``text`` value is lower-cased, stripped of the
    characters in ``string.punctuation``, split with NLTK's ``word_tokenize``
    and normalised with ``stemmingLemming``. Digits are kept.

    Args:
        path_name: Path of the ``.jsonl`` file; each record needs a ``text`` key.

    Returns:
        A list with one list of normalised tokens per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` key.
    """
    # The translation table does not depend on the document: build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path_name, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            if not line.strip():
                # Blank lines are skipped because json.loads rejects them.
                continue

            doc = json.loads(line)
            text = doc['text'].lower().translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Build a TF-IDF matrix from already-tokenized documents.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, vectorizer)``: the fitted
        TF-IDF matrix, the vocabulary terms from
        ``get_feature_names_out()``, and the fitted ``TfidfVectorizer``.
    """
    # TfidfVectorizer consumes strings, so each token list is re-joined
    # with single spaces before fitting.
    corpus = [' '.join(tokens) for tokens in tokenized_docs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    return tfidf_matrix, vectorizer.get_feature_names_out(), vectorizer
    
        

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [29]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute pairwise cosine similarity between the rows of a matrix.

    Args:
        tfidf_matrix: Matrix with one row per document.

    Returns:
        A tuple ``(sim_table, cos_sim)``: a DataFrame whose rows and columns
        are labelled ``Doc 1`` .. ``Doc N``, and the raw similarity array it
        was built from.
    """
    cos_sim = cosine_similarity(tfidf_matrix)

    # The same 1-based labels name both axes of the table.
    labels = ['Doc ' + str(n) for n in range(1, cos_sim.shape[0] + 1)]
    sim_table = pd.DataFrame(cos_sim, columns=labels, index=labels)

    return sim_table, cos_sim

cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 50,Doc 51,Doc 52,Doc 53,Doc 54,Doc 55,Doc 56,Doc 57,Doc 58,Doc 59
Doc 1,1.0,0.116785,0.116682,0.065408,0.185082,0.063252,0.114855,0.131666,0.082319,0.10333,...,0.158555,0.113504,0.183942,0.0,0.0,0.0,0.0,0.0,0.108963,0.101689
Doc 2,0.116785,1.0,0.293539,0.222886,0.172103,0.138513,0.112282,0.165781,0.22244,0.171459,...,0.258898,0.114252,0.159238,0.0,0.0,0.0,0.0,0.0,0.161876,0.175653
Doc 3,0.116682,0.293539,1.0,0.181477,0.235975,0.105975,0.114559,0.155222,0.17226,0.145168,...,0.23306,0.160828,0.175988,0.0,0.0,0.0,0.0,0.0,0.185843,0.185424
Doc 4,0.065408,0.222886,0.181477,1.0,0.097654,0.06361,0.063011,0.093672,0.162503,0.115923,...,0.161168,0.089334,0.118018,0.0,0.0,0.0,0.0,0.0,0.094309,0.136627
Doc 5,0.185082,0.172103,0.235975,0.097654,1.0,0.096868,0.139804,0.1329,0.14999,0.128149,...,0.193689,0.128157,0.174898,0.0,0.0,0.0,0.0,0.0,0.179021,0.182024
Doc 6,0.063252,0.138513,0.105975,0.06361,0.096868,1.0,0.046924,0.094784,0.082026,0.094177,...,0.14122,0.067741,0.098841,0.0,0.0,0.0,0.0,0.0,0.125477,0.111974
Doc 7,0.114855,0.112282,0.114559,0.063011,0.139804,0.046924,1.0,0.135525,0.097209,0.099001,...,0.18463,0.084729,0.173973,0.0,0.0,0.0,0.0,0.0,0.090519,0.08134
Doc 8,0.131666,0.165781,0.155222,0.093672,0.1329,0.094784,0.135525,1.0,0.109231,0.147724,...,0.271639,0.089889,0.125835,0.0,0.0,0.0,0.0,0.0,0.152414,0.124312
Doc 9,0.082319,0.22244,0.17226,0.162503,0.14999,0.082026,0.097209,0.109231,1.0,0.118235,...,0.165711,0.099433,0.14594,0.0,0.0,0.0,0.0,0.0,0.139159,0.140329
Doc 10,0.10333,0.171459,0.145168,0.115923,0.128149,0.094177,0.099001,0.147724,0.118235,1.0,...,0.236668,0.102157,0.174861,0.0,0.0,0.0,0.0,0.0,0.16592,0.165466


## For each document, count the documents whose similarity exceeds a given threshold

In [34]:
threshold = 0.2


### Sequential Algorithm

In [35]:
def sequential_pair_similar_documents(cos_sim, threshold):
    """List the document pairs whose similarity is strictly above a threshold.

    Args:
        cos_sim: Square array indexable as ``cos_sim[i, j]``.
        threshold: Similarity value a pair must exceed to be reported.

    Returns:
        A list of ``(i, j)`` tuples with 1-based document numbers and
        ``i < j``, ordered by ``i`` and then ``j``.
    """
    size = cos_sim.shape[0]

    # Only the upper triangle is scanned, so each pair appears once and a
    # document is never paired with itself.
    return [
        (row + 1, col + 1)
        for row in range(size)
        for col in range(row + 1, size)
        if cos_sim[row, col] > threshold
    ]


sim_pairs = sequential_pair_similar_documents(cos_sim, threshold)
print(sim_pairs)

[(1, 43), (2, 3), (2, 4), (2, 9), (2, 30), (2, 33), (2, 34), (2, 36), (2, 50), (3, 5), (3, 16), (3, 30), (3, 34), (3, 40), (3, 50), (5, 16), (6, 36), (7, 17), (8, 50), (10, 16), (10, 30), (10, 32), (10, 33), (10, 34), (10, 49), (10, 50), (11, 28), (11, 31), (12, 20), (12, 30), (12, 36), (13, 20), (14, 16), (16, 17), (16, 20), (16, 21), (16, 27), (16, 32), (16, 33), (16, 34), (16, 36), (16, 46), (16, 50), (16, 52), (16, 58), (16, 59), (20, 23), (20, 27), (20, 30), (20, 36), (21, 32), (21, 34), (21, 50), (23, 36), (23, 39), (23, 58), (27, 36), (28, 31), (28, 36), (29, 39), (30, 36), (30, 41), (30, 50), (30, 58), (32, 34), (32, 42), (32, 49), (32, 50), (33, 34), (33, 50), (34, 42), (34, 44), (34, 49), (34, 50), (34, 52), (36, 50), (36, 59), (40, 50), (40, 59), (41, 58), (42, 49), (43, 52), (45, 50), (46, 58), (49, 50), (50, 52), (50, 59), (58, 59)]


In [36]:
def sequential_count_similar_documents(cos_sim_table, threshold):
    """Count, per document, the later documents that are similar to it.

    Only documents with a higher position than the current one are
    considered (the part of each row right of the diagonal), so every similar
    pair is counted once, on the document that comes first.

    Args:
        cos_sim_table: Square DataFrame of pairwise similarities.
        threshold: Similarity value that must be strictly exceeded.

    Returns:
        A tuple ``(num_similar, threshold)`` where ``num_similar`` is a list
        of ints, one per row of the table, and ``threshold`` is passed
        through unchanged.
    """
    # Extract the values once: scalar .iloc lookups inside a double Python
    # loop are very slow on large tables.
    values = cos_sim_table.to_numpy()
    num_docs = values.shape[0]

    num_similar = [
        int((values[i, i + 1:num_docs] > threshold).sum())
        for i in range(num_docs)
    ]
    return num_similar, threshold

num_similar, threshold = sequential_count_similar_documents(cos_sim_table, threshold)
print(num_similar)


[1, 8, 6, 0, 1, 1, 1, 1, 0, 7, 2, 3, 1, 1, 0, 13, 0, 0, 0, 4, 3, 0, 3, 0, 0, 0, 1, 2, 1, 4, 0, 4, 2, 5, 0, 2, 0, 0, 0, 2, 1, 1, 1, 0, 1, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [38]:
def sequential_similar_documents(cos_sim_table, threshold):
    """List, per document, the later documents that are similar to it.

    Only documents with a higher position than the current one are
    considered, so every similar pair is reported once, on the document that
    comes first.

    Args:
        cos_sim_table: Square DataFrame of pairwise similarities.
        threshold: Similarity value that must be strictly exceeded.

    Returns:
        A list with one entry per row of the table; each entry is a list of
        ``"Doc <n>"`` labels (1-based positions) of the similar documents.
    """
    # Extract the values once: scalar .iloc lookups inside a double Python
    # loop are very slow on large tables.
    values = cos_sim_table.to_numpy()
    num_docs = values.shape[0]

    return [
        [f"Doc {j+1}" for j in range(i + 1, num_docs) if values[i, j] > threshold]
        for i in range(num_docs)
    ]
similar_docs = sequential_similar_documents(cos_sim_table, threshold)
similar_docs

[['Doc 43'],
 ['Doc 3', 'Doc 4', 'Doc 9', 'Doc 30', 'Doc 33', 'Doc 34', 'Doc 36', 'Doc 50'],
 ['Doc 5', 'Doc 16', 'Doc 30', 'Doc 34', 'Doc 40', 'Doc 50'],
 [],
 ['Doc 16'],
 ['Doc 36'],
 ['Doc 17'],
 ['Doc 50'],
 [],
 ['Doc 16', 'Doc 30', 'Doc 32', 'Doc 33', 'Doc 34', 'Doc 49', 'Doc 50'],
 ['Doc 28', 'Doc 31'],
 ['Doc 20', 'Doc 30', 'Doc 36'],
 ['Doc 20'],
 ['Doc 16'],
 [],
 ['Doc 17',
  'Doc 20',
  'Doc 21',
  'Doc 27',
  'Doc 32',
  'Doc 33',
  'Doc 34',
  'Doc 36',
  'Doc 46',
  'Doc 50',
  'Doc 52',
  'Doc 58',
  'Doc 59'],
 [],
 [],
 [],
 ['Doc 23', 'Doc 27', 'Doc 30', 'Doc 36'],
 ['Doc 32', 'Doc 34', 'Doc 50'],
 [],
 ['Doc 36', 'Doc 39', 'Doc 58'],
 [],
 [],
 [],
 ['Doc 36'],
 ['Doc 31', 'Doc 36'],
 ['Doc 39'],
 ['Doc 36', 'Doc 41', 'Doc 50', 'Doc 58'],
 [],
 ['Doc 34', 'Doc 42', 'Doc 49', 'Doc 50'],
 ['Doc 34', 'Doc 50'],
 ['Doc 42', 'Doc 44', 'Doc 49', 'Doc 50', 'Doc 52'],
 [],
 ['Doc 50', 'Doc 59'],
 [],
 [],
 [],
 ['Doc 50', 'Doc 59'],
 ['Doc 58'],
 ['Doc 49'],
 ['Doc 52'],
 

In [39]:
import pandas as pd

def create_similar_table(similar_docs, num_similar, threshold):
    """Assemble the per-document similarity summary as a DataFrame.

    Args:
        similar_docs: One list of similar-document labels per document.
        num_similar: One count of similar documents per document.
        threshold: Threshold value, repeated on every row.

    Returns:
        A DataFrame with the columns ``Documents``, ``Number of similar
        documents``, ``Similar Documents`` and ``Threshold``.
    """
    columns = {
        # Row labels are 1-based and sized from the counts list.
        "Documents": ["Doc " + str(n) for n in range(1, len(num_similar) + 1)],
        "Number of similar documents": num_similar,
        "Similar Documents": similar_docs,
        "Threshold": threshold,
    }
    return pd.DataFrame(columns)

similar_table = create_similar_table(similar_docs, num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Similar Documents,Threshold
0,Doc 1,1,[Doc 43],0.2
1,Doc 2,8,"[Doc 3, Doc 4, Doc 9, Doc 30, Doc 33, Doc 34, ...",0.2
2,Doc 3,6,"[Doc 5, Doc 16, Doc 30, Doc 34, Doc 40, Doc 50]",0.2
3,Doc 4,0,[],0.2
4,Doc 5,1,[Doc 16],0.2
5,Doc 6,1,[Doc 36],0.2
6,Doc 7,1,[Doc 17],0.2
7,Doc 8,1,[Doc 50],0.2
8,Doc 9,0,[],0.2
9,Doc 10,7,"[Doc 16, Doc 30, Doc 32, Doc 33, Doc 34, Doc 4...",0.2


# Clustering Experiments

In [None]:
def divide_documents(cos_sim_table, threshold):
    """Greedily group documents into clusters using a similarity threshold.

    Rows are visited in table order. Each document that is not yet assigned
    starts a new cluster, which absorbs every other unassigned document whose
    similarity to it is at least ``threshold``. The seed document is appended
    last.

    Args:
        cos_sim_table: DataFrame of pairwise similarities whose index and
            column labels identify the documents.
        threshold: Minimum similarity (inclusive) for joining a cluster.

    Returns:
        A dict mapping ``"Cluster <k>"`` (1-based, in creation order) to the
        list of document labels in that cluster.
    """
    clusters = {}
    # Labels already placed in some cluster; each document is used once.
    assigned_docs = set()

    for i, row in cos_sim_table.iterrows():
        if i in assigned_docs:
            continue

        cluster = []
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent and also works on older versions.
        for j, sim in row.items():
            # Skip the document itself and anything already clustered.
            if j != i and j not in assigned_docs and sim >= threshold:
                cluster.append(j)
                assigned_docs.add(j)

        cluster.append(i)
        assigned_docs.add(i)
        clusters[f'Cluster {len(clusters)+1}'] = cluster

    return clusters


clusters = divide_documents(cos_sim_table, 0.2)
clusters


In [None]:
def find_similar_documents(cos_sim_table, doc_name, similarity_threshold):
    """Count the entries in one document's column that exceed a threshold.

    Args:
        cos_sim_table: DataFrame of pairwise similarities.
        doc_name: Column label of the document of interest.
        similarity_threshold: Value that must be strictly exceeded.

    Returns:
        The number of entries in the ``doc_name`` column above the threshold.
        The document's own entry is part of that column and is not excluded
        here.
    """
    column = cos_sim_table[doc_name]
    return sum(1 for score in column if score > similarity_threshold)

similar_docs_count = find_similar_documents(cos_sim_table, 'Doc 1', 0.2)
print(similar_docs_count-1)

# Parallel Algorithm

In [None]:
import os
import sys

# Point the Spark driver and its Python workers at the interpreter that is
# running this notebook, instead of an absolute path that exists on only one
# machine. Using the same interpreter for both also keeps their Python
# versions in agreement.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("document-similarity").getOrCreate()


In [None]:
path_name_documents = './Databases/prova/prova50.jsonl'

In [None]:
#conf = SparkConf().setAppName("MyApp")
#sc = SparkContext(conf=conf)

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def load_documents(spark, input_path):
    """Load documents from a JSONL file as a Spark RDD.

    The file must hold one JSON document per line, each with an ``_id`` and a
    ``text`` key.

    Args:
        spark: Object exposing ``sparkContext.textFile`` (a SparkSession).
        input_path: Path of the JSONL file.

    Returns:
        An RDD of ``(_id, text)`` tuples, one per input line.
    """
    def to_id_text(line):
        # Parse one line and keep only the identifier and the text.
        record = json.loads(line)
        return (record["_id"], record["text"])

    # Read the JSONL file as an RDD of raw lines.
    lines = spark.sparkContext.textFile(input_path)
    return lines.map(to_id_text)

documents = load_documents(spark, path_name_documents)
documents_list = documents.collect()
print(documents_list)

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def preprocess_document(document, hashingTF, idfModel):
    """Turn one ``(id, text)`` document into a TF-IDF vector.

    The text is split on whitespace, hashed into term frequencies with
    ``hashingTF`` and re-weighted with ``idfModel``.

    Args:
        document: Sequence whose element at index 1 is the document text.
        hashingTF: Object whose ``transform`` accepts a list of tokens.
        idfModel: Object whose ``transform`` accepts the output of
            ``hashingTF.transform`` and returns a vector with ``toArray()``.

    Returns:
        A ``SparseVector`` holding the non-zero TF-IDF weights at their
        original feature indices, or ``None`` when every weight is zero.
        Keeping the indices (rather than packing the non-zero values into a
        shorter vector) is what makes vectors of different documents share
        the same dimensions, so ``dot`` between them is meaningful.
    """
    # NOTE(review): transform() is called here with a plain token list and a
    # single vector, which looks like the RDD-based pyspark.mllib API rather
    # than the DataFrame-based pyspark.ml transformers — confirm which
    # objects callers pass.
    text = document[1]

    word_counts = hashingTF.transform(text.split())
    word_tfidf = idfModel.transform(word_counts)

    weights = word_tfidf.toArray()
    nonzero = [(i, float(v)) for i, v in enumerate(weights) if v != 0.0]

    if not nonzero:
        return None

    # (index, value) pairs plus the full size preserve feature alignment.
    return SparseVector(len(weights), nonzero)

def calculate_similarity(documents):
    """Compute the cosine similarity between every pair of documents.

    Args:
        documents: Object with a ``collect()`` method returning
            ``(doc_id, vector)`` pairs, where each vector offers ``dot`` and
            ``norm`` (or is ``None``).

    Returns:
        A list of ``(id_doc_1, id_doc_2, cosine_similarity)`` tuples, one per
        unordered pair of documents whose vectors are both present. Pairs
        involving a zero-norm vector get similarity ``0.0``.
    """
    similarities = []

    # Each collected element is an (id, vector) pair: unpack it so the vector
    # maths runs on the vectors and the ids end up in the result.
    for (id1, vec1), (id2, vec2) in combinations(documents.collect(), 2):
        if vec1 is None or vec2 is None:
            continue

        denominator = vec1.norm(2) * vec2.norm(2)
        if denominator == 0:
            # A zero vector has no direction; report no similarity instead of
            # dividing by zero.
            cosine_sim = 0.0
        else:
            cosine_sim = vec1.dot(vec2) / denominator

        similarities.append((id1, id2, cosine_sim))

    return similarities


def main():
    """Run the Spark TF-IDF pipeline and compute pairwise cosine similarity.

    Reads ``path_name_documents`` with the module-level ``spark`` session,
    builds hashed TF-IDF vectors for every document and hands the
    ``(id, vector)`` pairs to ``calculate_similarity``.

    Returns:
        Whatever ``calculate_similarity`` returns for the vectorised
        documents.
    """
    # Load the documents from the JSONL file as (id, text) pairs.
    documents = load_documents(spark, path_name_documents)

    # pyspark.ml transformers work on DataFrames, and HashingTF needs an
    # array-of-strings input column, so split the text into tokens first.
    # NOTE(review): the column types are inferred from the data — confirm
    # this is adequate for the real input.
    tokens_df = spark.createDataFrame(
        documents.map(lambda d: (d[0], d[1].split())),
        ["_id", "text"],
    )

    # Bag-of-words representation via feature hashing.
    hashingTF = HashingTF(numFeatures=10000, inputCol="text", outputCol="word_count")
    word_counts = hashingTF.transform(tokens_df)

    # Fit the IDF weights on the whole corpus and apply them.
    idf = IDF(inputCol="word_count", outputCol="features")
    idfModel = idf.fit(word_counts)
    tfidf_df = idfModel.transform(word_counts)

    # Back to (id, vector) pairs, dropping documents with no non-zero weight.
    preprocessed_documents = (
        tfidf_df.select("_id", "features").rdd
        .map(lambda row: (row["_id"], row["features"]))
        .filter(lambda d: d[1].numNonzeros() > 0)
    )

    # Cosine similarity between all pairs of documents.
    return calculate_similarity(preprocessed_documents)


# Keep the result in a variable so it is available to later cells without
# dumping the full list as cell output.
similarities = main()


In [None]:
from pyspark import SparkConf
from pyspark import SparkContext

import os
import sys

# Use the interpreter running this notebook for both the Spark driver and its
# Python workers, rather than an absolute path tied to one machine.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = SparkConf().setAppName("MyApp")
# Only one SparkContext can be active at a time; getOrCreate reuses the
# running one (e.g. the context behind an existing SparkSession) instead of
# raising.
sc = SparkContext.getOrCreate(conf=conf)


def sq(x):
    """Return ``x`` multiplied by itself."""
    squared_value = x * x
    return squared_value

# The integers 0..19, distributed as an RDD.
data = list(range(20))
myrdd = sc.parallelize(data)

# map() yields a new RDD; collect() is the action that brings results back.
squared = myrdd.map(sq)
print(squared.collect())