# Documents Similarity

## Read Files

In [3]:
# Input corpus in JSON Lines format (one JSON object per line).
# Uncomment one of the alternatives below to run on a different sample file.
path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSON Lines file and return the values of every record.

    Each non-blank line is parsed as one JSON object. The values of each
    object are passed through a NumPy array before being converted back to
    Python lists, so values of mixed types inside a record are coerced to a
    common type by NumPy.

    Args:
        path_name: Path to the JSONL file.

    Returns:
        A list with one inner list per record, holding that record's values
        in key order. An empty list when the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If the records do not all have the same number of values
            (raised by ``np.vstack``).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(path_name, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. padding at the end of the file) are not records
        dicts = [json.loads(line) for line in f if line.strip()]

    # np.vstack rejects an empty sequence, so handle "no records" explicitly
    if not dicts:
        return []

    # One row per record, then back to plain nested lists
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])
    return arrays.tolist()

# Load the values of every record from the selected JSONL file
documents = readFile(path_name_documents)


## Tokenization

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# English stop-word set from the NLTK corpus.
# NOTE(review): no code visible in this notebook references stop_words, so
# stop words are not actually removed during tokenization — confirm intent.
stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Normalize tokens by lemmatizing each one and then stemming the lemma.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        A new list with one normalized token per input token, in input order.
    """
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    # WordNet lemmatization first, Porter stemming on its result
    normalized = []
    for tok in filtered_tokens:
        lemma = wordnet.lemmatize(tok)
        normalized.append(porter.stem(lemma))
    return normalized
    
 
    

def tokenize(path_name):
    """Read a JSONL corpus and return one list of normalized tokens per document.

    For every non-blank line the ``"text"`` field is lower-cased, stripped of
    ASCII punctuation, tokenized with NLTK's ``word_tokenize`` and passed
    through ``stemmingLemming``. Digits are left in place.

    Args:
        path_name: Path to a JSON Lines file whose records have a ``"text"`` key.

    Returns:
        A list of token lists, one per document, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    # Build the punctuation-removal table once instead of once per document
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []

    # JSON text is UTF-8; do not depend on the platform's default encoding
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not records and would make json.loads fail
            if not line.strip():
                continue

            doc = json.loads(line)

            # Lower-case, then drop every punctuation character
            text = doc['text'].lower().translate(strip_punctuation)

            # Tokenize with NLTK and normalize each token
            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


# Tokenize every document of the selected corpus
tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Fit a TF-IDF model on the tokenized documents.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, vectorizer)``: the matrix
        returned by ``fit_transform``, the array returned by
        ``get_feature_names_out`` and the fitted ``TfidfVectorizer``.
    """
    # TfidfVectorizer consumes strings, so each token list is re-joined.
    # NOTE(review): the vectorizer is created with default settings, so it
    # applies its own tokenization to the joined strings — confirm intended.
    corpus = [' '.join(tokens) for tokens in tokenized_docs]

    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(corpus)
    vocabulary = tfidf.get_feature_names_out()

    return matrix, vocabulary, tfidf
    
        

# Fit TF-IDF on the tokenized corpus; keep the matrix, the vocabulary and the fitted vectorizer
tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute the pairwise cosine similarity of all documents as a labeled table.

    Args:
        tfidf_matrix: Matrix with one row per document.

    Returns:
        A square DataFrame whose rows and columns are both labeled
        "Doc 1", "Doc 2", ... and whose cells are the cosine similarities.
    """
    # Cosine similarity between every pair of rows
    pairwise = cosine_similarity(tfidf_matrix)

    # The same 1-based labels are used for rows and columns
    labels = [f"Doc {pos}" for pos in range(1, pairwise.shape[0] + 1)]

    return pd.DataFrame(pairwise, index=labels, columns=labels)

# Build the labeled similarity table and show it as the cell output
cos_sim_table = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 50,Doc 51,Doc 52,Doc 53,Doc 54,Doc 55,Doc 56,Doc 57,Doc 58,Doc 59
Doc 1,1.0,0.116785,0.116682,0.065408,0.185082,0.063252,0.114855,0.131666,0.082319,0.10333,...,0.158555,0.113504,0.183942,0.0,0.0,0.0,0.0,0.0,0.108963,0.101689
Doc 2,0.116785,1.0,0.293539,0.222886,0.172103,0.138513,0.112282,0.165781,0.22244,0.171459,...,0.258898,0.114252,0.159238,0.0,0.0,0.0,0.0,0.0,0.161876,0.175653
Doc 3,0.116682,0.293539,1.0,0.181477,0.235975,0.105975,0.114559,0.155222,0.17226,0.145168,...,0.23306,0.160828,0.175988,0.0,0.0,0.0,0.0,0.0,0.185843,0.185424
Doc 4,0.065408,0.222886,0.181477,1.0,0.097654,0.06361,0.063011,0.093672,0.162503,0.115923,...,0.161168,0.089334,0.118018,0.0,0.0,0.0,0.0,0.0,0.094309,0.136627
Doc 5,0.185082,0.172103,0.235975,0.097654,1.0,0.096868,0.139804,0.1329,0.14999,0.128149,...,0.193689,0.128157,0.174898,0.0,0.0,0.0,0.0,0.0,0.179021,0.182024
Doc 6,0.063252,0.138513,0.105975,0.06361,0.096868,1.0,0.046924,0.094784,0.082026,0.094177,...,0.14122,0.067741,0.098841,0.0,0.0,0.0,0.0,0.0,0.125477,0.111974
Doc 7,0.114855,0.112282,0.114559,0.063011,0.139804,0.046924,1.0,0.135525,0.097209,0.099001,...,0.18463,0.084729,0.173973,0.0,0.0,0.0,0.0,0.0,0.090519,0.08134
Doc 8,0.131666,0.165781,0.155222,0.093672,0.1329,0.094784,0.135525,1.0,0.109231,0.147724,...,0.271639,0.089889,0.125835,0.0,0.0,0.0,0.0,0.0,0.152414,0.124312
Doc 9,0.082319,0.22244,0.17226,0.162503,0.14999,0.082026,0.097209,0.109231,1.0,0.118235,...,0.165711,0.099433,0.14594,0.0,0.0,0.0,0.0,0.0,0.139159,0.140329
Doc 10,0.10333,0.171459,0.145168,0.115923,0.128149,0.094177,0.099001,0.147724,0.118235,1.0,...,0.236668,0.102157,0.174861,0.0,0.0,0.0,0.0,0.0,0.16592,0.165466


## For each document, count the documents whose similarity exceeds a given threshold

### Sequential Algorithm

In [7]:
def sequential_count_similar_documents(cos_sim_table, threshold):
    """Count, for every document, how many other documents exceed a similarity threshold.

    Args:
        cos_sim_table: DataFrame of pairwise similarities; row ``i`` and
            column ``i`` refer to the same document (by position).
        threshold: A similarity must be strictly greater than this value to count.

    Returns:
        A tuple ``(num_similar, threshold)`` where ``num_similar`` is a list
        with one integer count per row of the table, and ``threshold`` is the
        value that was passed in.
    """
    # One vectorised comparison replaces a Python-level .iloc lookup per cell
    above = cos_sim_table.to_numpy() > threshold
    counts = above.sum(axis=1)

    # A document must not count itself: remove the positional diagonal hits
    own_hits = above.diagonal()
    counts[:own_hits.size] -= own_hits

    # tolist() yields plain Python ints, as the loop-based version did
    return counts.tolist(), threshold

# Count, per document, the other documents with similarity above 0.2
num_similar, threshold = sequential_count_similar_documents(cos_sim_table, 0.2)
print(num_similar)


[1, 8, 7, 1, 2, 1, 1, 1, 1, 7, 2, 3, 1, 1, 0, 17, 2, 0, 0, 7, 4, 0, 4, 0, 0, 0, 3, 3, 1, 9, 2, 7, 5, 12, 0, 11, 0, 0, 2, 3, 2, 3, 2, 1, 1, 2, 0, 0, 5, 16, 0, 4, 0, 0, 0, 0, 0, 6, 5]


In [8]:
import pandas as pd

def create_similar_table(num_similar, threshold):
    """Arrange per-document similarity counts into a summary table.

    Args:
        num_similar: Sequence of counts, one per document, in document order.
        threshold: Threshold value repeated on every row.

    Returns:
        A DataFrame with the columns "Documents" ("Doc 1", "Doc 2", ...),
        "Number of similar documents" and "Threshold".
    """
    total = len(num_similar)
    columns = {
        "Documents": ["Doc " + str(pos) for pos in range(1, total + 1)],
        "Number of similar documents": num_similar,
        "Threshold": threshold,
    }
    return pd.DataFrame(columns)


# Summarize the counts in a table and show it as the cell output
similar_table = create_similar_table(num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Threshold
0,Doc 1,1,0.2
1,Doc 2,8,0.2
2,Doc 3,7,0.2
3,Doc 4,1,0.2
4,Doc 5,2,0.2
5,Doc 6,1,0.2
6,Doc 7,1,0.2
7,Doc 8,1,0.2
8,Doc 9,1,0.2
9,Doc 10,7,0.2


# Clustering Experiments

In [9]:
def divide_documents(cos_sim_table, threshold):
    """Greedily group documents into clusters around seed documents.

    Rows are visited in table order. Each document that is not yet assigned
    becomes the seed of a new cluster, which collects every still-unassigned
    document whose similarity to the seed is at least ``threshold``. Members
    are compared with the seed only, not with each other.

    Args:
        cos_sim_table: Square DataFrame of pairwise similarities whose index
            and column labels identify the documents.
        threshold: Minimum similarity (inclusive) to join the seed's cluster.

    Returns:
        A dict mapping "Cluster 1", "Cluster 2", ... to lists of document
        labels; within each list the seed document comes last.
    """
    # Clusters found so far, keyed by their generated name
    clusters = {}
    # Labels of the documents that already belong to a cluster
    assigned_docs = set()

    for i, row in cos_sim_table.iterrows():
        # A document that already joined a cluster cannot seed a new one
        if i in assigned_docs:
            continue

        cluster = []
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed
        for j, sim in row.items():
            # Take documents that are not the seed, are still free and are similar enough
            if j != i and j not in assigned_docs and sim >= threshold:
                cluster.append(j)
                assigned_docs.add(j)

        # The seed itself closes the cluster
        cluster.append(i)
        assigned_docs.add(i)
        clusters[f'Cluster {len(clusters)+1}'] = cluster

    return clusters


# Cluster the documents with a similarity threshold of 0.2 and show the result
clusters = divide_documents(cos_sim_table, 0.2)
clusters


{'Cluster 1': ['Doc 43', 'Doc 1'],
 'Cluster 2': ['Doc 3',
  'Doc 4',
  'Doc 9',
  'Doc 30',
  'Doc 33',
  'Doc 34',
  'Doc 36',
  'Doc 50',
  'Doc 2'],
 'Cluster 3': ['Doc 16', 'Doc 5'],
 'Cluster 4': ['Doc 6'],
 'Cluster 5': ['Doc 17', 'Doc 7'],
 'Cluster 6': ['Doc 8'],
 'Cluster 7': ['Doc 32', 'Doc 49', 'Doc 10'],
 'Cluster 8': ['Doc 28', 'Doc 31', 'Doc 11'],
 'Cluster 9': ['Doc 20', 'Doc 12'],
 'Cluster 10': ['Doc 13'],
 'Cluster 11': ['Doc 14'],
 'Cluster 12': ['Doc 15'],
 'Cluster 13': ['Doc 18'],
 'Cluster 14': ['Doc 19'],
 'Cluster 15': ['Doc 21'],
 'Cluster 16': ['Doc 22'],
 'Cluster 17': ['Doc 39', 'Doc 58', 'Doc 23'],
 'Cluster 18': ['Doc 24'],
 'Cluster 19': ['Doc 25'],
 'Cluster 20': ['Doc 26'],
 'Cluster 21': ['Doc 27'],
 'Cluster 22': ['Doc 29'],
 'Cluster 23': ['Doc 35'],
 'Cluster 24': ['Doc 37'],
 'Cluster 25': ['Doc 38'],
 'Cluster 26': ['Doc 59', 'Doc 40'],
 'Cluster 27': ['Doc 41'],
 'Cluster 28': ['Doc 42'],
 'Cluster 29': ['Doc 44'],
 'Cluster 30': ['Doc 45'],
 '

In [10]:
def find_similar_documents(cos_sim_table, doc_name, similarity_threshold):
    """Count the entries of one document's column that exceed a threshold.

    Every entry of the column is considered, including the document's own
    similarity with itself.

    Args:
        cos_sim_table: DataFrame of pairwise similarities.
        doc_name: Column label of the document of interest.
        similarity_threshold: An entry must be strictly greater than this to count.

    Returns:
        The number of entries in the column above the threshold.
    """
    # Similarities of the requested document against every document
    column = cos_sim_table[doc_name]

    above = 0
    for value in column:
        if value > similarity_threshold:
            above += 1
    return above

# find_similar_documents also counts the document's own entry in its column,
# so 1 is subtracted before printing to report only the other documents
similar_docs_count = find_similar_documents(cos_sim_table, 'Doc 1', 0.2)
print(similar_docs_count-1)

1


# Parallel Algorithm

In [1]:
import os
import sys

# Make PySpark's driver and workers use the interpreter that runs this
# notebook, instead of an absolute path that exists on one machine only
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [2]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that already exists
spark = SparkSession.builder.appName("document-similarity").getOrCreate()


In [3]:
# Input corpus for the Spark section; re-declared here because this section's
# execution counts restart at 1 (presumably it was run in a fresh kernel)
path_name_documents = './Databases/prova/prova50.jsonl'

In [4]:
#conf = SparkConf().setAppName("MyApp")
#sc = SparkContext(conf=conf)

In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def load_documents(spark, input_path):
    """
    Load the documents of a JSONL file into a Spark RDD.
    The JSONL file must contain one document per line.
    Returns an RDD of (_id, text) tuples.
    """

    def to_id_text(line):
        # Parse one JSON line and keep only its "_id" and "text" fields
        record = json.loads(line)
        return (record["_id"], record["text"])

    # Read the JSONL file as an RDD of raw lines, then parse each line
    raw_lines = spark.sparkContext.textFile(input_path)
    return raw_lines.map(to_id_text)

# Load the corpus as an RDD, bring every (_id, text) pair to the driver and print them all.
# NOTE(review): this prints the full text of every document; for larger
# inputs consider showing only a few elements, e.g. documents.take(3).
documents = load_documents(spark, path_name_documents)
documents_list = documents.collect()
print(documents_list)

[('ug7v899j', 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract infections, and 2 (5%) with bronchiolitis. Cough (82.5%), fever (75%), and malaise

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import SparseVector, DenseVector
from itertools import combinations
import json


def preprocess_document(document, hashingTF, idfModel):
    """
    Build the TF-IDF vector of a single document.

    The text is split on whitespace, turned into hashed term frequencies by
    ``hashingTF`` and re-weighted by ``idfModel``.

    Args:
        document: Pair whose element at index 1 is the document text.
        hashingTF: Object whose ``transform`` is called with the list of terms.
        idfModel: Object whose ``transform`` is called with the term-frequency
            vector and whose result exposes ``toArray()``.

    Returns:
        A DenseVector holding the TF-IDF weight of every feature position
        (zeros included), or None when the document has no non-zero weight.

    NOTE(review): ``transform`` is called here with a plain list and with a
    single vector. The ``pyspark.ml.feature`` HashingTF/IDF imported in this
    notebook are DataFrame transformers — confirm which objects callers are
    expected to pass.
    """

    # Extract the text from the document
    text = document[1]

    # Hashed term frequencies, then TF-IDF weights, over the full feature space
    word_counts = hashingTF.transform(text.split())
    word_tfidf = idfModel.transform(word_counts)
    weights = word_tfidf.toArray()

    # A document without any non-zero weight cannot be compared meaningfully
    if not any(v != 0.0 for v in weights):
        return None

    # Keep every feature position. Returning only the non-zero values would
    # discard their indices, so position k would mean a different term in
    # each document and vectors would differ in length, which breaks the
    # dot product used for cosine similarity.
    return DenseVector(weights)

def calculate_similarity(documents):
    """
    Compute the cosine similarity between every pair of documents.

    Args:
        documents: Object with a ``collect()`` method (e.g. a Spark RDD) that
            returns ``(doc_id, vector)`` pairs. Each vector must provide
            ``dot(other)`` and ``norm(2)``. Missing entries and entries whose
            vector is None are skipped.

    Returns:
        A list (built on the driver, not an RDD) of tuples
        ``(id_doc_1, id_doc_2, cosine_similarity)``, one per unordered pair.
        The similarity is 0.0 when either vector has zero norm.
    """

    # Unpack each (id, vector) pair and compute every norm once, not once per pair
    entries = []
    for entry in documents.collect():
        if entry is None:
            continue
        doc_id, vector = entry
        if vector is None:
            continue
        entries.append((doc_id, vector, vector.norm(2)))

    similarities = []
    for (id1, vec1, norm1), (id2, vec2, norm2) in combinations(entries, 2):
        denominator = norm1 * norm2
        # A zero vector has no direction; report 0.0 instead of dividing by zero
        cosine_sim = vec1.dot(vec2) / denominator if denominator else 0.0
        similarities.append((id1, id2, cosine_sim))

    return similarities


def main():
    """
    Run the Spark pipeline: load the documents, build their TF-IDF vectors
    and compute the cosine similarity of every pair.

    Reads the module-level ``spark`` session and ``path_name_documents``.

    Returns:
        The value returned by ``calculate_similarity`` for the
        ``(_id, tfidf_vector)`` pairs of all documents that have at least one
        non-zero TF-IDF weight.
    """

    # Load the (_id, text) pairs from the JSONL file
    documents = load_documents(spark, path_name_documents)

    # pyspark.ml transformers operate on DataFrames, and HashingTF needs an
    # array-of-terms input column, so split each text on whitespace first
    tokenized = documents.map(lambda d: (d[0], d[1].split()))
    documents_df = spark.createDataFrame(tokenized, ["_id", "text"])

    # Hashed bag-of-words representation of every document
    hashingTF = HashingTF(numFeatures=10000, inputCol="text", outputCol="word_count")
    word_counts = hashingTF.transform(documents_df)

    # Fit IDF on the whole corpus, then apply it to obtain the TF-IDF vectors
    idf = IDF(inputCol="word_count", outputCol="features")
    idfModel = idf.fit(word_counts)
    tfidf = idfModel.transform(word_counts)

    # Back to an RDD of (_id, vector), dropping documents with no non-zero weight
    preprocessed_documents = tfidf.select("_id", "features").rdd \
                                  .map(lambda row: (row["_id"], row["features"])) \
                                  .filter(lambda d: d[1].numNonzeros() > 0)

    # Compute the cosine similarity between all pairs of documents
    return calculate_similarity(preprocessed_documents)



# Run the pipeline; as the last expression of the cell, whatever main() returns is displayed
main()


In [None]:
from pyspark import SparkConf
from pyspark import SparkContext

import os

import sys

# Make PySpark's driver and workers use the interpreter that runs this
# notebook, instead of an absolute path that exists on one machine only
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = SparkConf().setAppName("MyApp")
# Only one SparkContext may be active per process, and constructing a second
# one raises an error. getOrCreate reuses the active context (e.g. the one
# behind an existing SparkSession) and applies `conf` only when it has to
# create a new context.
sc = SparkContext.getOrCreate(conf=conf)


def sq(x):
    """Return x multiplied by itself."""
    product = x * x
    return product

# The integers 0 through 19
data = list(range(20))
myrdd = sc.parallelize(data)
# map() is a transformation: it returns a new RDD
squared = myrdd.map(sq)
# collect() is an action: it brings the results back to the driver
print(squared.collect())