# Document Similarity

## Read Files

In [1]:
# Path of the JSON Lines corpus that the loading and tokenizing cells read.
path_name_documents = './Databases/prova/prova50.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSON Lines file and return one list of field values per record.

    Args:
        path_name: Path of a file holding one JSON object per line.

    Returns:
        list: One inner list per record, holding that record's values in key
        order. An empty list when the file contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.

    NOTE(review): the values pass through NumPy arrays, so records with
    differing numbers of fields will not stack, and mixed value types inside a
    record may be coerced to a common dtype — confirm this is acceptable.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(path_name, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a stray newline at the end of the file) are not
            # valid JSON, so skip them instead of letting json.loads fail.
            if line.strip():
                records.append(json.loads(line))

    # np.vstack rejects an empty sequence; an empty file maps to an empty result.
    if not records:
        return []

    # Stack the per-record value arrays into one 2-D array, then hand back plain lists.
    arrays = np.vstack([np.array(list(d.values())) for d in records])
    return arrays.tolist()

# Load every record of the corpus as a list of its field values.
documents = readFile(path_name_documents)


## Tokenization

In [3]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


# English stop-word set taken from NLTK's stopwords corpus.
# NOTE(review): nothing in the visible cells reads `stop_words` — confirm
# whether stop-word removal was intended somewhere in the pipeline.
stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Lemmatize each token and then stem the lemmatized form.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        list: One processed token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Both steps are applied to every token: lemmatization first, stemming second.
    return [stemmer.stem(lemmatizer.lemmatize(word)) for word in filtered_tokens]
    
 
    

def tokenize(path_name):
    """Tokenize the 'text' field of every record in a JSON Lines file.

    Each text is lowercased, stripped of the characters in
    ``string.punctuation``, split with NLTK's ``word_tokenize`` and then passed
    through ``stemmingLemming``. No stop-word filtering is applied here.

    Args:
        path_name: Path of a file holding one JSON object per line; every
            object must have a 'text' key.

    Returns:
        list: One list of processed tokens per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'text' key.
    """
    # The punctuation-stripping table does not depend on the line, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a stray trailing newline) are not valid JSON; skip them.
            if not line.strip():
                continue

            doc = json.loads(line)

            # Normalize: lowercase, then drop punctuation characters.
            text = doc['text'].lower()
            text = text.translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


# Tokenize the same corpus file: one token list per record.
tokenized_docs = tokenize(path_name_documents)


In [4]:
# Prints every token list in full, so the output grows with the corpus size.
print(tokenized_docs)

[['object', 'thi', 'retrospect', 'chart', 'review', 'describ', 'the', 'epidemiolog', 'and', 'clinic', 'featur', 'of', '40', 'patient', 'with', 'cultureproven', 'mycoplasma', 'pneumonia', 'infect', 'at', 'king', 'abdulaziz', 'univers', 'hospit', 'jeddah', 'saudi', 'arabia', 'method', 'patient', 'with', 'posit', 'm', 'pneumonia', 'cultur', 'from', 'respiratori', 'specimen', 'from', 'januari', '1997', 'through', 'decemb', '1998', 'were', 'identifi', 'through', 'the', 'microbiolog', 'record', 'chart', 'of', 'patient', 'were', 'review', 'result', '40', 'patient', 'were', 'identifi', '33', '825', 'of', 'whom', 'requir', 'admiss', 'most', 'infect', '925', 'were', 'communityacquir', 'the', 'infect', 'affect', 'all', 'age', 'group', 'but', 'wa', 'most', 'common', 'in', 'infant', '325', 'and', 'preschool', 'child', '225', 'it', 'occur', 'yearround', 'but', 'wa', 'most', 'common', 'in', 'the', 'fall', '35', 'and', 'spring', '30', 'more', 'than', 'threequart', 'of', 'patient', '775', 'had', 'comor

# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Fit a TF-IDF model on pre-tokenized documents.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        tuple: ``(tfidf_matrix, feature_names, vectorizer)`` — the matrix
        returned by ``fit_transform``, the names returned by
        ``get_feature_names_out`` and the fitted ``TfidfVectorizer``.

    NOTE(review): the token lists are re-joined into strings and handed to a
    ``TfidfVectorizer`` with default settings, which applies its own
    tokenization — confirm that the resulting vocabulary is the intended one.
    """
    # The vectorizer is fed strings, so rebuild each document as space-separated text.
    corpus = [' '.join(tokens) for tokens in tokenized_docs]

    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(corpus)
    vocabulary = vectorizer.get_feature_names_out()

    return weights, vocabulary, vectorizer
    
        

# Fit TF-IDF on the tokenized corpus; keep the fitted vectorizer next to the matrix and feature names.
tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [23]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Build a labelled table of pairwise cosine similarities.

    Args:
        tfidf_matrix: Matrix with one row per document, as accepted by
            ``cosine_similarity``.

    Returns:
        pandas.DataFrame: Square table whose rows and columns are both
        labelled 'Doc 1', 'Doc 2', ... in row order of the input.
    """
    # Cosine similarity between every pair of documents.
    pairwise = cosine_similarity(tfidf_matrix)

    # The same 1-based labels are used for both axes of the table.
    labels = [f'Doc {number}' for number in range(1, pairwise.shape[0] + 1)]

    return pd.DataFrame(pairwise, columns=labels, index=labels)

# Pairwise cosine similarity between all documents; the table is shown as the cell output.
cos_sim_table = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 50,Doc 51,Doc 52,Doc 53,Doc 54,Doc 55,Doc 56,Doc 57,Doc 58,Doc 59
Doc 1,1.0,0.116785,0.116682,0.065408,0.185082,0.063252,0.114855,0.131666,0.082319,0.10333,...,0.158555,0.113504,0.183942,0.0,0.0,0.0,0.0,0.0,0.108963,0.101689
Doc 2,0.116785,1.0,0.293539,0.222886,0.172103,0.138513,0.112282,0.165781,0.22244,0.171459,...,0.258898,0.114252,0.159238,0.0,0.0,0.0,0.0,0.0,0.161876,0.175653
Doc 3,0.116682,0.293539,1.0,0.181477,0.235975,0.105975,0.114559,0.155222,0.17226,0.145168,...,0.23306,0.160828,0.175988,0.0,0.0,0.0,0.0,0.0,0.185843,0.185424
Doc 4,0.065408,0.222886,0.181477,1.0,0.097654,0.06361,0.063011,0.093672,0.162503,0.115923,...,0.161168,0.089334,0.118018,0.0,0.0,0.0,0.0,0.0,0.094309,0.136627
Doc 5,0.185082,0.172103,0.235975,0.097654,1.0,0.096868,0.139804,0.1329,0.14999,0.128149,...,0.193689,0.128157,0.174898,0.0,0.0,0.0,0.0,0.0,0.179021,0.182024
Doc 6,0.063252,0.138513,0.105975,0.06361,0.096868,1.0,0.046924,0.094784,0.082026,0.094177,...,0.14122,0.067741,0.098841,0.0,0.0,0.0,0.0,0.0,0.125477,0.111974
Doc 7,0.114855,0.112282,0.114559,0.063011,0.139804,0.046924,1.0,0.135525,0.097209,0.099001,...,0.18463,0.084729,0.173973,0.0,0.0,0.0,0.0,0.0,0.090519,0.08134
Doc 8,0.131666,0.165781,0.155222,0.093672,0.1329,0.094784,0.135525,1.0,0.109231,0.147724,...,0.271639,0.089889,0.125835,0.0,0.0,0.0,0.0,0.0,0.152414,0.124312
Doc 9,0.082319,0.22244,0.17226,0.162503,0.14999,0.082026,0.097209,0.109231,1.0,0.118235,...,0.165711,0.099433,0.14594,0.0,0.0,0.0,0.0,0.0,0.139159,0.140329
Doc 10,0.10333,0.171459,0.145168,0.115923,0.128149,0.094177,0.099001,0.147724,0.118235,1.0,...,0.236668,0.102157,0.174861,0.0,0.0,0.0,0.0,0.0,0.16592,0.165466


## For each document, count the documents whose similarity exceeds a given threshold

### Sequential Algorithm

In [68]:
def sequential_count_similar_documents(cos_sim_table, threshold):
    """Count, for each row, how many off-diagonal entries exceed ``threshold``.

    Args:
        cos_sim_table: 2-D table of pairwise similarities; a pandas DataFrame
            or anything ``np.asarray`` turns into a 2-D array.
        threshold: Value an entry must be strictly greater than to be counted.

    Returns:
        tuple: ``(num_similar, threshold)`` where ``num_similar`` is a list
        with one int per row and ``threshold`` is passed through unchanged.
    """
    # One vectorized comparison replaces a Python-level lookup per cell.
    above = np.asarray(cos_sim_table) > threshold

    # Entry (i, i) compares a document with itself, so it is never counted.
    np.fill_diagonal(above, False)

    return above.sum(axis=1).tolist(), threshold

# Per document, count the other documents whose similarity is above 0.2.
num_similar, threshold = sequential_count_similar_documents(cos_sim_table, 0.2)



In [69]:
import pandas as pd

def create_similar_table(num_similar, threshold):
    """Tabulate the per-document similar-document counts.

    Args:
        num_similar: Sequence with one count per document.
        threshold: Threshold value repeated on every row of the table.

    Returns:
        pandas.DataFrame: Columns "Documents" ('Doc 1', 'Doc 2', ...),
        "Number of similar documents" and "Threshold".
    """
    # Labels are 1-based and follow the order of the counts.
    labels = ["Doc " + str(position) for position in range(1, len(num_similar) + 1)]

    columns = {
        "Documents": labels,
        "Number of similar documents": num_similar,
        "Threshold": threshold,
    }
    return pd.DataFrame(columns)


# Put the counts and the threshold into a table; it is shown as the cell output.
similar_table = create_similar_table(num_similar, threshold)
similar_table

Unnamed: 0,Documents,Number of similar documents,Threshold
0,Doc 1,1,0.2
1,Doc 2,8,0.2
2,Doc 3,7,0.2
3,Doc 4,1,0.2
4,Doc 5,2,0.2
5,Doc 6,1,0.2
6,Doc 7,1,0.2
7,Doc 8,1,0.2
8,Doc 9,1,0.2
9,Doc 10,7,0.2


# Clustering Experiments

In [67]:
from scipy.cluster.hierarchy import linkage, fcluster
import pandas as pd

def cluster_documents(cos_sim_table, threshold, method='ward'):
    """Assign each document to a flat cluster via hierarchical clustering.

    Args:
        cos_sim_table: DataFrame of pairwise similarities whose index holds
            the document labels.
        threshold: Cut-off passed to ``fcluster`` with ``criterion='distance'``.
        method: Linkage method passed to ``linkage``.

    Returns:
        pandas.DataFrame: Columns 'documento' (label) and 'cluster' (cluster
        id), sorted by cluster and then by label.

    NOTE(review): the similarity table is handed to ``linkage`` as a 2-D
    input, which SciPy documents as a set of observation vectors rather than a
    distance matrix — confirm that clustering the rows this way, and the scale
    of ``threshold``, are what is intended.
    """
    # Build the hierarchical merge tree.
    merge_tree = linkage(cos_sim_table, method=method)

    # Cut the tree at the threshold to obtain flat cluster ids.
    labels = fcluster(merge_tree, t=threshold, criterion='distance')

    # Pair every document with its cluster id, ordered by cluster and then document.
    assignments = {'documento': cos_sim_table.index, 'cluster': labels}
    return pd.DataFrame(assignments).sort_values(by=['cluster', 'documento'])

# Cluster the documents with a cut-off of 0.9 and Ward linkage; the result is shown as the cell output.
df_clusters = cluster_documents(cos_sim_table, 0.9, method='ward')
df_clusters

Unnamed: 0,documento,cluster
14,Doc 15,1
52,Doc 53,1
53,Doc 54,1
54,Doc 55,1
55,Doc 56,1
56,Doc 57,1
31,Doc 32,2
48,Doc 49,3
33,Doc 34,4
49,Doc 50,5


In [60]:
def dic_clusters(df_clusters):
    """Group document labels by cluster id.

    Args:
        df_clusters: DataFrame with a 'cluster' column and a 'documento' column.

    Returns:
        dict: Maps each cluster id, in order of first appearance, to the list
        of 'documento' values belonging to it, in frame order.
    """
    cluster_ids = df_clusters['cluster']

    # One entry per distinct cluster id, holding the documents assigned to it.
    return {
        cluster_id: list(df_clusters.loc[cluster_ids == cluster_id, 'documento'])
        for cluster_id in cluster_ids.unique()
    }

# Map each cluster id to its documents; the dictionary is shown as the cell output.
cluster_dict = dic_clusters(df_clusters)
cluster_dict

{1: ['Doc 15', 'Doc 53', 'Doc 54', 'Doc 55', 'Doc 56', 'Doc 57'],
 2: ['Doc 32'],
 3: ['Doc 49'],
 4: ['Doc 34'],
 5: ['Doc 50'],
 6: ['Doc 42'],
 7: ['Doc 44'],
 8: ['Doc 1'],
 9: ['Doc 43'],
 10: ['Doc 17'],
 11: ['Doc 52'],
 12: ['Doc 7'],
 13: ['Doc 19'],
 14: ['Doc 51'],
 15: ['Doc 26'],
 16: ['Doc 18'],
 17: ['Doc 38'],
 18: ['Doc 5'],
 19: ['Doc 16'],
 20: ['Doc 2'],
 21: ['Doc 3'],
 22: ['Doc 40'],
 23: ['Doc 59'],
 24: ['Doc 4'],
 25: ['Doc 9'],
 26: ['Doc 10'],
 27: ['Doc 33'],
 28: ['Doc 21'],
 29: ['Doc 47'],
 30: ['Doc 13'],
 31: ['Doc 20'],
 32: ['Doc 14'],
 33: ['Doc 22'],
 34: ['Doc 48'],
 35: ['Doc 23'],
 36: ['Doc 39'],
 37: ['Doc 29'],
 38: ['Doc 46'],
 39: ['Doc 24'],
 40: ['Doc 41'],
 41: ['Doc 58'],
 42: ['Doc 25'],
 43: ['Doc 35'],
 44: ['Doc 37'],
 45: ['Doc 8'],
 46: ['Doc 45'],
 47: ['Doc 11'],
 48: ['Doc 31'],
 49: ['Doc 28'],
 50: ['Doc 27', 'Doc 36'],
 51: ['Doc 12'],
 52: ['Doc 30'],
 53: ['Doc 6']}