# Documents Similarity

## Read Files

In [1]:
#path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova30000.jsonl'
path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSONL file and return its records as a list of value lists.

    Every non-blank line is parsed as one JSON object. The values of each
    object are stacked into a 2-D NumPy array and converted back to nested
    Python lists, so each inner list holds one record's values in key order.

    Args:
        path_name: Path of the JSONL file to read.

    Returns:
        A list with one inner list of values per record, or an empty list
        when the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If the records do not all have the same number of values
            (``np.vstack`` cannot stack ragged rows).
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path_name, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # instead of being handed to json.loads, which would raise on them.
        dicts = [json.loads(line) for line in f if line.strip()]

    # np.vstack raises on an empty sequence, so handle "no records" explicitly.
    if not dicts:
        return []

    # One row per record; NumPy coerces mixed value types to a common dtype.
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])

    return arrays.tolist()

documents = readFile(path_name_documents)


In [3]:
import time

def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Execution time: <seconds> seconds`` and returns
        whatever ``func`` returned. The wrapper keeps the name and docstring
        of ``func``.
    """
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() can jump and is coarse.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Execution time: {elapsed:.5f} seconds")
        return result
    return wrapper

## Tokenized

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Lemmatize every token and then stem the resulting lemma.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        A new list with one normalized token per input token, in input order.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    # Lemmatization runs first; the Porter stemmer is applied to its output.
    return [porter.stem(wordnet.lemmatize(tok)) for tok in filtered_tokens]
    
 
    

def tokenize(path_name):
    """Tokenize and normalize the ``text`` field of every record in a JSONL file.

    For each non-blank line the JSON object is parsed, its ``text`` value is
    lower-cased and stripped of ASCII punctuation (digits are kept), split
    into tokens with ``word_tokenize`` and passed through ``stemmingLemming``.

    Args:
        path_name: Path of the JSONL file to read.

    Returns:
        A list with one list of normalized tokens per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` key.
    """
    # The punctuation-removal table does not depend on the line, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []

    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline); json.loads would raise.
            if not line.strip():
                continue

            doc = json.loads(line)

            text = doc['text'].lower()
            text = text.translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Build a TF-IDF matrix from tokenized documents.

    Each token list is joined back into a single space-separated string and
    the resulting strings are fitted with a default ``TfidfVectorizer``.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, vectorizer)``: the fitted
        matrix, the vectorizer's output feature names and the fitted
        vectorizer itself.
    """
    # The vectorizer works on strings, so rebuild one string per document.
    corpus = [' '.join(tokens) for tokens in tokenized_docs]

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(corpus)
    vocabulary = tfidf.get_feature_names_out()

    return matrix, vocabulary, tfidf
    
        

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute pairwise cosine similarity between the rows of a matrix.

    Args:
        tfidf_matrix: Matrix with one row per document.

    Returns:
        A tuple ``(sim_table, cos_sim)``: a DataFrame whose rows and columns
        are labelled ``Doc 1``, ``Doc 2``, ... and the raw similarity array
        it was built from.
    """
    scores = cosine_similarity(tfidf_matrix)

    # 1-based document labels, used for both the rows and the columns.
    labels = ['Doc ' + str(pos + 1) for pos in range(scores.shape[0])]
    table = pd.DataFrame(scores, columns=labels, index=labels)

    return table, scores

cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 1991,Doc 1992,Doc 1993,Doc 1994,Doc 1995,Doc 1996,Doc 1997,Doc 1998,Doc 1999,Doc 2000
Doc 1,1.000000,0.088140,0.076180,0.041710,0.135476,0.037048,0.079712,0.083859,0.046190,0.066090,...,0.052495,0.091297,0.074900,0.077853,0.055493,0.056206,0.155063,0.047254,0.098762,0.059326
Doc 2,0.088140,1.000000,0.216128,0.149980,0.125037,0.082548,0.081243,0.109634,0.165225,0.113318,...,0.136774,0.053780,0.086105,0.117846,0.103599,0.098679,0.084447,0.076612,0.107566,0.091730
Doc 3,0.076180,0.216128,1.000000,0.112982,0.164268,0.056795,0.076408,0.092951,0.092219,0.082467,...,0.088033,0.051083,0.078266,0.097425,0.101088,0.104249,0.106681,0.069464,0.115833,0.086805
Doc 4,0.041710,0.149980,0.112982,1.000000,0.063064,0.033588,0.038800,0.051888,0.088788,0.060705,...,0.094869,0.032597,0.046291,0.042923,0.047268,0.044550,0.043281,0.057974,0.074586,0.058088
Doc 5,0.135476,0.125037,0.164268,0.063064,1.000000,0.057647,0.097862,0.090730,0.087590,0.084105,...,0.101804,0.060967,0.101729,0.085532,0.103503,0.093017,0.073879,0.071080,0.152637,0.081881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc 1996,0.056206,0.098679,0.104249,0.044550,0.093017,0.050826,0.054918,0.069194,0.051968,0.064365,...,0.058417,0.054911,0.065844,0.067180,0.158831,1.000000,0.065041,0.064478,0.089159,0.082480
Doc 1997,0.155063,0.084447,0.106681,0.043281,0.073879,0.040495,0.064603,0.072925,0.048578,0.068262,...,0.052454,0.066905,0.070492,0.057624,0.063972,0.065041,1.000000,0.050453,0.091349,0.063624
Doc 1998,0.047254,0.076612,0.069464,0.057974,0.071080,0.048398,0.043551,0.061967,0.047952,0.049643,...,0.055534,0.038722,0.048710,0.054898,0.062131,0.064478,0.050453,1.000000,0.099591,0.050479
Doc 1999,0.098762,0.107566,0.115833,0.074586,0.152637,0.153502,0.071356,0.092917,0.074554,0.097069,...,0.085829,0.070660,0.104924,0.115240,0.085430,0.089159,0.091349,0.099591,1.000000,0.091079


## Sequential algorithms

In [7]:
threshold = 0.2

### For each document create pairs of similar documents

In [25]:
@time_it
def sequential_pair_similar_documents(cos_sim, threshold):
    """List every pair of documents whose similarity exceeds ``threshold``.

    Args:
        cos_sim: Square similarity array indexed as ``cos_sim[i, j]``.
        threshold: Similarity value a pair must strictly exceed.

    Returns:
        A list of ``(i, j)`` tuples with 1-based document numbers and
        ``i < j``, ordered by ``i`` and then by ``j``.
    """
    size = cos_sim.shape[0]

    # Only cells above the diagonal are visited, so each pair appears once
    # and a document is never paired with itself.
    return [
        (row + 1, col + 1)
        for row in range(size)
        for col in range(row + 1, size)
        if cos_sim[row, col] > threshold
    ]


sim_pairs = sequential_pair_similar_documents(cos_sim, threshold)
print(sim_pairs)

Execution time: 0.33400 seconds
[(1, 80), (1, 214), (1, 339), (1, 365), (1, 416), (1, 464), (1, 529), (1, 598), (1, 637), (1, 639), (1, 674), (1, 689), (1, 755), (1, 779), (1, 889), (1, 917), (1, 924), (1, 936), (1, 996), (1, 998), (1, 1015), (1, 1061), (1, 1071), (1, 1077), (1, 1085), (1, 1115), (1, 1146), (1, 1149), (1, 1175), (1, 1185), (1, 1197), (1, 1209), (1, 1229), (1, 1507), (1, 1513), (1, 1564), (1, 1574), (1, 1720), (1, 1745), (1, 1805), (1, 1929), (1, 1937), (2, 3), (2, 147), (2, 162), (2, 182), (2, 253), (2, 319), (2, 406), (2, 559), (2, 614), (2, 694), (2, 718), (2, 940), (2, 1348), (2, 1448), (2, 1531), (2, 1860), (3, 162), (3, 174), (3, 182), (3, 487), (3, 683), (3, 1629), (3, 1786), (5, 16), (5, 98), (5, 162), (5, 458), (5, 558), (5, 885), (5, 914), (5, 954), (5, 1117), (5, 1294), (5, 1606), (5, 1748), (5, 1860), (5, 1939), (6, 140), (6, 1686), (7, 668), (7, 1194), (7, 1680), (8, 178), (8, 397), (8, 591), (10, 130), (10, 136), (10, 237), (10, 240), (10, 604), (10, 1109)

In [28]:
@time_it
def sequential_pair_similar_documents2(cos_sim, threshold):
    """List similar document pairs using a vectorized threshold test.

    Args:
        cos_sim: Square similarity array.
        threshold: Similarity value a pair must strictly exceed.

    Returns:
        A list of ``(i, j)`` tuples with 1-based document numbers and
        ``i < j``.
    """
    # Indices of every cell above the threshold, over the whole matrix.
    hits = np.argwhere(cos_sim > threshold)

    pairs = []
    for row, col in hits:
        # Drop the diagonal and the lower triangle so each pair is kept once.
        if row < col:
            pairs.append((row + 1, col + 1))

    return pairs


sim_pairs = sequential_pair_similar_documents2(cos_sim, threshold)
print(sim_pairs)

Execution time: 0.04300 seconds
[(1, 80), (1, 214), (1, 339), (1, 365), (1, 416), (1, 464), (1, 529), (1, 598), (1, 637), (1, 639), (1, 674), (1, 689), (1, 755), (1, 779), (1, 889), (1, 917), (1, 924), (1, 936), (1, 996), (1, 998), (1, 1015), (1, 1061), (1, 1071), (1, 1077), (1, 1085), (1, 1115), (1, 1146), (1, 1149), (1, 1175), (1, 1185), (1, 1197), (1, 1209), (1, 1229), (1, 1507), (1, 1513), (1, 1564), (1, 1574), (1, 1720), (1, 1745), (1, 1805), (1, 1929), (1, 1937), (2, 3), (2, 147), (2, 162), (2, 182), (2, 253), (2, 319), (2, 406), (2, 559), (2, 614), (2, 694), (2, 718), (2, 940), (2, 1348), (2, 1448), (2, 1531), (2, 1860), (3, 162), (3, 174), (3, 182), (3, 487), (3, 683), (3, 1629), (3, 1786), (5, 16), (5, 98), (5, 162), (5, 458), (5, 558), (5, 885), (5, 914), (5, 954), (5, 1117), (5, 1294), (5, 1606), (5, 1748), (5, 1860), (5, 1939), (6, 140), (6, 1686), (7, 668), (7, 1194), (7, 1680), (8, 178), (8, 397), (8, 591), (10, 130), (10, 136), (10, 237), (10, 240), (10, 604), (10, 1109)

### For each document calculate the number of similar documents

In [9]:
@time_it
def sequential_count_similar_documents(cos_sim, threshold):
    """Count, for each document, the later documents that are similar to it.

    Args:
        cos_sim: Square similarity array indexed as ``cos_sim[i, j]``.
        threshold: Similarity value that must be strictly exceeded.

    Returns:
        A tuple ``(num_similar, threshold)``. ``num_similar[i]`` is the number
        of documents ``j > i`` with ``cos_sim[i, j] > threshold``; the
        threshold is passed through unchanged.
    """
    size = cos_sim.shape[0]

    # Only columns to the right of the diagonal are counted for each row.
    counts = [
        sum(1 for col in range(row + 1, size) if cos_sim[row, col] > threshold)
        for row in range(size)
    ]
    return counts, threshold

num_similar, threshold = sequential_count_similar_documents(cos_sim, threshold)
#print(num_similar)


Execution time: 0.33500 seconds


### For each document create a list with similar documents

In [10]:
@time_it
def sequential_similar_documents(cos_sim, threshold):
    """Collect, for each document, the labels of later similar documents.

    Args:
        cos_sim: Square similarity array indexed as ``cos_sim[i, j]``.
        threshold: Similarity value that must be strictly exceeded.

    Returns:
        A list with one entry per document; entry ``i`` is the list of
        ``"Doc <j+1>"`` labels for every ``j > i`` whose similarity to
        document ``i`` exceeds the threshold.
    """
    size = cos_sim.shape[0]

    # Each row only looks at the columns to the right of the diagonal.
    return [
        [f"Doc {col+1}" for col in range(row + 1, size) if cos_sim[row, col] > threshold]
        for row in range(size)
    ]
similar_docs = sequential_similar_documents(cos_sim, threshold)


Execution time: 0.33700 seconds


### Table of final results

In [22]:
def create_similar_table(similar_docs, num_similar, threshold):
    """Assemble the per-document results into one summary DataFrame.

    Args:
        similar_docs: One list of document labels per document.
        num_similar: One similar-document count per document.
        threshold: Threshold value repeated in every row.

    Returns:
        A DataFrame with the columns ``Documents``, ``Number of similar
        documents``, ``Similar Documents`` (labels joined with ``", "``) and
        ``Threshold``.

    Side effects:
        Sets the pandas option ``display.max_rows`` to ``None``.
    """
    columns = {
        "Documents": [f"Doc {n}" for n in range(1, len(num_similar) + 1)],
        "Number of similar documents": num_similar,
        "Similar Documents": [", ".join(labels) for labels in similar_docs],
        "Threshold": threshold,
    }
    summary = pd.DataFrame(columns)

    # Lift the row limit so the complete table is displayed.
    pd.set_option('display.max_rows', None)

    return summary

similar_table = create_similar_table(similar_docs, num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Similar Documents,Threshold
0,Doc 1,42,"Doc 80, Doc 214, Doc 339, Doc 365, Doc 416, Do...",0.2
1,Doc 2,16,"Doc 3, Doc 147, Doc 162, Doc 182, Doc 253, Doc...",0.2
2,Doc 3,7,"Doc 162, Doc 174, Doc 182, Doc 487, Doc 683, D...",0.2
3,Doc 4,0,,0.2
4,Doc 5,14,"Doc 16, Doc 98, Doc 162, Doc 458, Doc 558, Doc...",0.2
5,Doc 6,2,"Doc 140, Doc 1686",0.2
6,Doc 7,3,"Doc 668, Doc 1194, Doc 1680",0.2
7,Doc 8,3,"Doc 178, Doc 397, Doc 591",0.2
8,Doc 9,0,,0.2
9,Doc 10,11,"Doc 130, Doc 136, Doc 237, Doc 240, Doc 604, D...",0.2


# Clustering Experiments

In [12]:
def divide_documents(cos_sim_table, threshold):
    """Greedily group documents into clusters from a similarity table.

    Rows are visited in table order. Each document that is not yet assigned
    starts a new cluster, which receives every still-unassigned document
    whose similarity to it is at least ``threshold``, followed by the
    document itself.

    Args:
        cos_sim_table: Square DataFrame of similarities whose index and
            column labels identify the documents.
        threshold: Minimum similarity (inclusive) for joining a cluster.

    Returns:
        A dict mapping ``"Cluster <n>"`` (numbered from 1 in creation order)
        to the list of document labels in that cluster.
    """
    clusters = {}
    # Documents that already belong to some cluster.
    assigned_docs = set()

    for i, row in cos_sim_table.iterrows():
        if i not in assigned_docs:
            cluster = []
            # Series.items() replaces iteritems(), which was deprecated and
            # then removed in pandas 2.0.
            for j, sim in row.items():
                # Skip the document itself and anything already clustered.
                if j != i and j not in assigned_docs and sim >= threshold:
                    cluster.append(j)
                    assigned_docs.add(j)
            # The seed document goes last in its own cluster.
            cluster.append(i)
            assigned_docs.add(i)
            clusters[f'Cluster {len(clusters)+1}'] = cluster
    return clusters


clusters = divide_documents(cos_sim_table, 0.2)
#clusters


  for j, sim in row.iteritems():


In [13]:
def find_similar_documents(cos_sim_table, doc_name, similarity_threshold):
    """Count the entries in one document's column that exceed a threshold.

    Args:
        cos_sim_table: Similarity table with one column per document.
        doc_name: Label of the column to inspect.
        similarity_threshold: Value an entry must strictly exceed.

    Returns:
        The number of entries in the column above the threshold. No entry is
        excluded, so the document's own similarity is counted too when it
        exceeds the threshold.
    """
    column = cos_sim_table[doc_name]

    # Tally matching entries one by one instead of building a list.
    return sum(1 for score in column if score > similarity_threshold)

similar_docs_count = find_similar_documents(cos_sim_table, 'Doc 1', 0.2)
print(similar_docs_count-1)

42


# Parallel Algorithm

In [14]:
import os
os.environ['PYSPARK_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'


In [15]:
from pyspark.sql import SparkSession

#spark = SparkSession.builder.appName("document-similarity").getOrCreate()


In [16]:
path_name_documents = './Databases/prova/prova50.jsonl'

In [17]:
#conf = SparkConf().setAppName("MyApp")
#sc = SparkContext(conf=conf)

In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def load_documents(spark, input_path):
    """Load documents from a JSONL file as a Spark RDD.

    The file must contain one JSON document per line, each with the keys
    ``_id`` and ``text``.

    Args:
        spark: Object exposing ``sparkContext.textFile``.
        input_path: Path of the JSONL file.

    Returns:
        The RDD of ``(_id, text)`` tuples, one per input line.
    """
    def _id_and_text(record):
        # Keep only the identifier and the text of a parsed document.
        return (record["_id"], record["text"])

    # Read the file as an RDD of raw lines, then parse and project each line.
    lines = spark.sparkContext.textFile(input_path)
    parsed = lines.map(json.loads)
    return parsed.map(_id_and_text)

#documents = load_documents(spark, path_name_documents)
#documents_list = documents.collect()
#print(documents_list)

In [19]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def preprocess_document(document, hashingTF, idfModel):
    """Turn one document into a vector of its non-zero TF-IDF weights.

    The text is split on whitespace, passed through ``hashingTF.transform``
    and then through ``idfModel.transform``.

    Args:
        document: Sequence whose element at index 1 is the document text.
        hashingTF: Object whose ``transform`` accepts the list of words.
        idfModel: Object whose ``transform`` accepts the hashed term counts
            and returns something with ``toArray()``.

    Returns:
        A ``DenseVector`` holding the non-zero weights in index order, or
        ``None`` when every weight is zero.
    """
    text = document[1]

    term_counts = hashingTF.transform(text.split())
    weights = idfModel.transform(term_counts).toArray()

    # NOTE(review): only the non-zero values are kept and their positions are
    # discarded, so vectors of different documents may differ in length.
    nonzero = [value for value in weights if value != 0.0]

    if not nonzero:
        return None
    return DenseVector(nonzero)

def calculate_similarity(documents):
    """Compute the cosine similarity of every pair of documents.

    Args:
        documents: Object whose ``collect()`` returns ``(doc_id, vector)``
            pairs. Each vector must provide ``dot(other)`` and ``norm(2)``.

    Returns:
        A list (not an RDD) of ``(id_doc_1, id_doc_2, cosine_similarity)``
        tuples, one per unordered pair, in the order produced by
        ``itertools.combinations``. Pairs in which either vector is ``None``
        are skipped; a pair with a zero-norm vector gets similarity ``0.0``.
    """
    similarities = []

    # Elements are (id, vector) pairs, so unpack them: calling dot()/norm()
    # on the pair itself would fail because tuples have no such methods.
    for (id1, vec1), (id2, vec2) in combinations(documents.collect(), 2):
        if vec1 is None or vec2 is None:
            continue

        norm_product = vec1.norm(2) * vec2.norm(2)
        # Avoid dividing by zero when one of the vectors is all zeros.
        cosine_sim = vec1.dot(vec2) / norm_product if norm_product else 0.0
        similarities.append((id1, id2, cosine_sim))

    return similarities


def main():
    """Run the Spark pipeline: load documents, fit IDF, compare all pairs.

    Reads the module-level names ``spark`` and ``path_name_documents``.
    The computed similarities are assigned to a local variable only; the
    function has no return statement.

    NOTE(review): ``spark`` is neither a parameter nor a local here, so a
    session with that name must exist at module level before this is called.
    """

    # Load the documents from the JSONL file
    documents = load_documents(spark, path_name_documents)

    # Create a HashingTF object to represent documents as a bag of words
    hashingTF = HashingTF(numFeatures=10000, inputCol="text", outputCol="word_count")

    # Compute the IDF representation of the documents
    # NOTE(review): `documents` is the RDD returned by load_documents; confirm
    # that this HashingTF/IDF API accepts an RDD rather than a DataFrame.
    word_counts = hashingTF.transform(documents)
    idf = IDF(inputCol="word_count", outputCol="features")
    idfModel = idf.fit(word_counts)

    # Pre-process every document, dropping those without a vector
    preprocessed_documents = documents.map(lambda d: (d[0], preprocess_document(d, hashingTF, idfModel))) \
                                       .filter(lambda d: d[1] is not None)

    # Compute the cosine similarity between all pairs of documents
    similarities = calculate_similarity(preprocessed_documents)



#main()


In [20]:
from pyspark import SparkConf
from pyspark import SparkContext

import os

def fu():
    """Smoke-test a local Spark setup by squaring the integers 0..19.

    Sets the ``PYSPARK_PYTHON`` and ``PYSPARK_DRIVER_PYTHON`` environment
    variables to a fixed interpreter path, creates a ``SparkContext`` named
    "MyApp", maps a squaring function over a parallelized list and prints the
    collected result.

    NOTE(review): the interpreter path is machine-specific, and the
    SparkContext is not stopped in the code visible here.
    """
    # Point both the workers and the driver at the same Python interpreter.
    os.environ['PYSPARK_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'
    os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/lita4/anaconda3/python.exe'

    conf = SparkConf().setAppName("MyApp")
    sc = SparkContext(conf=conf)


    def sq(x):
        # Function applied to every element of the RDD.
        return x*x

    data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    myrdd = sc.parallelize(data)
    squared = myrdd.map(sq) # this creates a new RDD
    print(squared.collect()) # careful: collect() is an action