# Document Similarity

## Read Files

In [1]:
# Path of the JSONL corpus used by the rest of the notebook.
# Keep exactly one assignment active; the commented-out lines are
# alternative datasets that can be switched in.
#path_name_documents = './Databases/prova/prova.jsonl'
#path_name_documents = './Databases/prova/prova10.jsonl'
path_name_documents = './Databases/prova/prova50.jsonl'
#path_name_documents = './Databases/prova/prova5000.jsonl' 
#path_name_documents = './Databases/prova/prova30000.jsonl'
#path_name_documents = './Databases/prova/prova2000.jsonl'

In [2]:
import json
import numpy as np
import string

def readFile(path_name):
    """Load a JSON Lines file and return its records as a list of rows.

    Every non-blank line is parsed as one JSON object and the object's values
    (in key order) become one row. Rows are stacked with NumPy before being
    converted back to nested lists, so all records must have the same number
    of fields, and NumPy picks a common element type when the values of a
    record have mixed types.

    Args:
        path_name: Path of the ``.jsonl`` file to read.

    Returns:
        A list of lists, one inner list per record; ``[]`` when the file
        contains no records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If the records do not all have the same number of fields.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path_name, 'r', encoding='utf-8') as f:
        # Blank lines (typically a trailing newline) are skipped instead of
        # being handed to json.loads, which would raise on them.
        dicts = [json.loads(line) for line in f if line.strip()]

    # np.vstack refuses an empty sequence, so handle "no records" explicitly.
    if not dicts:
        return []

    # One row per record, stacked into a 2-D array and returned as plain lists.
    arrays = np.vstack([np.array(list(d.values())) for d in dicts])
    return arrays.tolist()

documents = readFile(path_name_documents)


In [3]:
import time

def time_it(func):
    """Decorator that prints the duration of every call to ``func``.

    The wrapped function's return value is passed through unchanged, and its
    name and docstring are preserved on the wrapper.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that calls ``func`` with the same arguments, prints
        ``Execution time: <seconds> seconds`` and returns ``func``'s result.
    """
    import functools  # local import so this cell needs no extra top-level import

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval is not affected by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Execution time: {elapsed:.5f} seconds")
        return result
    return wrapper

## Tokenization

In [4]:
import json
import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


stop_words = set(stopwords.words('english'))

def stemmingLemming(filtered_tokens):
    """Normalise tokens by lemmatizing each one and then stemming the lemma.

    Args:
        filtered_tokens: Iterable of token strings.

    Returns:
        A new list with one normalised token per input token, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Lemmatize first, then stem the result, in a single pass over the tokens.
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in filtered_tokens]
    
 
    

def tokenize(path_name):
    """Tokenize the ``text`` field of every record in a JSON Lines file.

    For each non-blank line the JSON object is parsed, its ``'text'`` value is
    lower-cased, ASCII punctuation (``string.punctuation``) is removed, the
    result is split with NLTK's ``word_tokenize`` and the tokens are passed
    through ``stemmingLemming``. Digits are kept.

    Args:
        path_name: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one list of normalised tokens per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` field.
    """
    # Build the punctuation-stripping table once rather than once per document.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    tokenized_docs = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path_name, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not a record.
            if not line.strip():
                continue

            doc = json.loads(line)
            text = doc['text'].lower().translate(strip_punctuation)

            tokens = word_tokenize(text)
            tokenized_docs.append(stemmingLemming(tokens))

    return tokenized_docs


tokenized_docs = tokenize(path_name_documents)


# Sparse Vectors

## TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer


def calculateTFIDF(tokenized_docs):
    """Build a TF-IDF matrix from already-tokenized documents.

    Each token list is joined back into a single space-separated string and
    the strings are fed to a ``TfidfVectorizer`` created with default
    settings.
    NOTE(review): with default settings the vectorizer applies its own
    analysis to the joined text - confirm it does not alter the tokens.

    Args:
        tokenized_docs: Iterable of token lists, one per document.

    Returns:
        A tuple ``(tfidf_matrix, feature_names, vectorizer)``: the fitted
        document-term matrix, the feature names reported by the vectorizer,
        and the fitted vectorizer itself.
    """
    joined_docs = [' '.join(tokens) for tokens in tokenized_docs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(joined_docs)

    return tfidf_matrix, vectorizer.get_feature_names_out(), vectorizer
    
        

tfidf_matrix_docs, feature_names_docs,vectorizer  = calculateTFIDF(tokenized_docs)

## Cosine Similarity

In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def similarity(tfidf_matrix):
    """Compute the pairwise cosine similarity between all documents.

    Args:
        tfidf_matrix: Document-term matrix with one row per document.

    Returns:
        A tuple ``(sim_table, cos_sim)``: a DataFrame whose rows and columns
        are labelled ``Doc 1`` ... ``Doc N``, and the underlying similarity
        array.
    """
    # Cosine similarity of every document against every other document.
    cos_sim = cosine_similarity(tfidf_matrix)

    # Same human-readable labels for both axes of the table.
    labels = [f'Doc {n}' for n in range(1, cos_sim.shape[0] + 1)]
    sim_table = pd.DataFrame(cos_sim, index=labels, columns=labels)

    return sim_table, cos_sim

cos_sim_table, cos_sim = similarity(tfidf_matrix_docs)
cos_sim_table


Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10
Doc 1,1.0,0.147732,0.169516,0.095925,0.202873,0.091815,0.152647,0.21087,0.131717,0.141807
Doc 2,0.147732,1.0,0.352289,0.24431,0.213008,0.193674,0.152062,0.226987,0.270377,0.21899
Doc 3,0.169516,0.352289,1.0,0.236616,0.312709,0.166103,0.176712,0.246031,0.263255,0.206556
Doc 4,0.095925,0.24431,0.236616,1.0,0.132364,0.096492,0.091451,0.141086,0.207767,0.159223
Doc 5,0.202873,0.213008,0.312709,0.132364,1.0,0.142466,0.177512,0.168843,0.204114,0.165182
Doc 6,0.091815,0.193674,0.166103,0.096492,0.142466,1.0,0.081061,0.139366,0.131576,0.137394
Doc 7,0.152647,0.152062,0.176712,0.091451,0.177512,0.081061,1.0,0.156811,0.148365,0.140428
Doc 8,0.21087,0.226987,0.246031,0.141086,0.168843,0.139366,0.156811,1.0,0.171873,0.194598
Doc 9,0.131717,0.270377,0.263255,0.207767,0.204114,0.131576,0.148365,0.171873,1.0,0.179779
Doc 10,0.141807,0.21899,0.206556,0.159223,0.165182,0.137394,0.140428,0.194598,0.179779,1.0


## Sequential algorithms

In [7]:
threshold = 0.1

In [8]:
from scipy.sparse import coo_matrix

def extract_document_terms(tfidf_matrix):
    """Group the non-zero entries of a sparse matrix by document (row).

    The matrix is converted to COO form and its entries are walked in storage
    order; consecutive entries that share a row form one document. Entries
    are therefore expected to be grouped by row (presumably the case when the
    input is a CSR matrix - confirm for other inputs).

    Args:
        tfidf_matrix: Sparse (or array-like) document-term matrix.

    Returns:
        A list of ``(doc_id, terms)`` tuples where ``doc_id`` is the 1-based
        row number and ``terms`` is a list of ``(term_id, value)`` pairs.
        Rows without any stored entry do not appear in the result.
    """
    matrix_coo = coo_matrix(tfidf_matrix)

    doc_terms = []
    current_doc = -1
    terms = []

    for doc_id, term_id, term_value in zip(matrix_coo.row, matrix_coo.col, matrix_coo.data):
        if doc_id != current_doc:
            # Row changed: flush the terms collected for the previous row.
            if terms:
                doc_terms.append((current_doc + 1, terms))
                terms = []
            current_doc = doc_id

        terms.append((term_id, term_value))

    # Flush the last document with the same 1-based id as every other
    # document; appending the raw 0-based row here would give it the id of
    # the document before it.
    if terms:
        doc_terms.append((current_doc + 1, terms))

    return doc_terms

doc_info_list = extract_document_terms(tfidf_matrix_docs)
#print(doc_info_list)

[(1, [(263, 0.03498995282435991), (458, 0.0411602263846608), (358, 0.03498995282435991), (563, 0.03333635031782873), (236, 0.0411602263846608), (239, 0.027216327480976066), (223, 0.0411602263846608), (170, 0.03498995282435991), (453, 0.0411602263846608), (516, 0.03498995282435991), (400, 0.030612074062599096), (146, 0.0411602263846608), (399, 0.024441800502298213), (624, 0.0411602263846608), (591, 0.0411602263846608), (197, 0.03498995282435991), (90, 0.03498995282435991), (184, 0.1234806791539824), (2, 0.0823204527693216), (141, 0.0411602263846608), (520, 0.0411602263846608), (238, 0.0411602263846608), (470, 0.0411602263846608), (5, 0.0411602263846608), (471, 0.0411602263846608), (594, 0.0411602263846608), (19, 0.0411602263846608), (12, 0.0411602263846608), (0, 0.0411602263846608), (4, 0.0411602263846608), (607, 0.0411602263846608), (26, 0.0411602263846608), (434, 0.03498995282435991), (574, 0.05481458082599199), (379, 0.0411602263846608), (319, 0.03498995282435991), (273, 0.0411602263

In [9]:
def apply_my_map(doc_info_list):
    """Run ``my_map`` on every document and concatenate the results.

    Args:
        doc_info_list: Iterable of document records accepted by ``my_map``.

    Returns:
        A single flat list containing every item returned by ``my_map``,
        in document order.
    """
    return [record for doc_info in doc_info_list for record in my_map(doc_info)]

# NOTE(review): the first definition of my_map in this notebook appears in a
# later cell, so on a fresh kernel (Restart & Run All) this call fails with a
# NameError unless that cell has been executed first.
mapped_list = apply_my_map(doc_info_list)

In [10]:
def my_map(doc_info):
    """Annotate one document with the largest term id it contains.

    Args:
        doc_info: A ``(doc_id, terms)`` pair where ``terms`` is a non-empty
            sequence of ``(term_id, value)`` pairs.

    Returns:
        A one-element list ``[(doc_id, max_term_id, terms_copy)]`` where
        ``terms_copy`` is a fresh list of ``(term_id, value)`` tuples.

    Raises:
        ValueError: If ``terms`` is empty.
    """
    doc_id, terms = doc_info

    largest_term_id = max(entry[0] for entry in terms)
    terms_copy = [(tid, weight) for tid, weight in terms]

    return [(doc_id, largest_term_id, terms_copy)]




In [11]:
def my_reduce(docs, threshold):
    """Find document pairs whose term-weight dot product reaches a threshold.

    Every pair ``(i, j)`` with ``i < j`` is examined. The score of a pair is
    the sum, over the terms both documents contain, of the product of their
    weights. A pair is reported only when the largest shared term id equals
    the ``term_id`` stored with the first document of the pair.
    NOTE(review): because of that filter, a pair above the threshold is
    dropped whenever the largest shared term is not the first document's own
    ``term_id`` - confirm this is intended.

    Args:
        docs: Sequence of ``(doc_id, term_id, terms)`` tuples where ``terms``
            is an iterable of ``(term_id, weight)`` pairs.
        threshold: Minimum score for a pair to be reported.

    Returns:
        A list of ``((doc1_id, doc2_id), score)`` tuples, using the ids
        exactly as they appear in ``docs``.
    """
    pairs = []

    n_docs = len(docs)
    print(n_docs)

    # Build each document's term -> weight lookup once, not once per pair.
    term_maps = [dict(doc_terms) for _, _, doc_terms in docs]

    for i in range(n_docs - 1):
        doc1_id, term_id, _ = docs[i]
        terms_1 = term_maps[i]

        for j in range(i + 1, n_docs):
            doc2_id = docs[j][0]
            terms_2 = term_maps[j]

            common_terms = set(terms_1).intersection(terms_2)
            if not common_terms:
                continue

            if term_id != max(common_terms):
                continue

            sim = 0.0
            for term in common_terms:
                sim += terms_1[term] * terms_2[term]

            if sim >= threshold:
                # Report the ids unchanged: rewriting an id because it happens
                # to equal the number of documents mislabels that document.
                pairs.append(((doc1_id, doc2_id), sim))

    return pairs


# Use the notebook-level threshold instead of repeating its value here.
pairs = my_reduce(mapped_list, threshold)


## Execution

In [15]:
import os
import sys

# Run the Spark driver and workers with the interpreter that is executing
# this notebook, rather than a machine-specific absolute path.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [16]:
doc_info_list = extract_document_terms(tfidf_matrix_docs)

In [50]:
from pyspark import SparkConf, SparkContext

def my_map(doc_info):
    """Map step: tag a document with the largest term id it contains.

    Args:
        doc_info: A ``(doc_id, terms)`` pair where ``terms`` is a non-empty
            sequence of ``(term_id, value)`` pairs.

    Returns:
        A one-element list ``[(doc_id, max_term_id, terms_copy)]``, suitable
        for ``flatMap``; ``terms_copy`` is a new list of
        ``(term_id, value)`` tuples.

    Raises:
        ValueError: If ``terms`` is empty.
    """
    doc_id, terms = doc_info

    highest_term = max(pair[0] for pair in terms)
    term_weights = [(tid, weight) for tid, weight in terms]

    return [(doc_id, highest_term, term_weights)]

def my_reduce(docs1, docs2, threshold):
    """Score one ordered pair of mapped documents.

    The score is the sum, over the terms both documents contain, of the
    product of their weights. The pair is reported only when the two ids
    differ, the documents share at least one term, the largest shared term id
    equals the ``term_id`` stored with ``docs1``, and the score reaches
    ``threshold``.

    Args:
        docs1: ``(doc_id, term_id, terms)`` for the first document, where
            ``terms`` is an iterable of ``(term_id, weight)`` pairs.
        docs2: Same structure for the second document.
        threshold: Minimum score for the pair to be reported.

    Returns:
        ``[((doc1_id, doc2_id), score)]`` when the pair qualifies, otherwise
        an empty list.
    """
    left_id, left_max_term, left_terms = docs1
    right_id, _, right_terms = docs2

    # A document is never paired with itself.
    if left_id == right_id:
        return []

    weights_left = dict(left_terms)
    weights_right = dict(right_terms)

    shared = set(weights_left).intersection(weights_right)
    if not shared or left_max_term != max(shared):
        return []

    score = 0.0
    for term in shared:
        score += weights_left[term] * weights_right[term]

    return [((left_id, right_id), score)] if score >= threshold else []







conf = SparkConf().setAppName("MyApp").setMaster("local[8]").set("spark.executor.memory", "16g")
sc = SparkContext(conf=conf)

# Always stop the context, even when a stage fails: a context that is left
# running prevents a new one from being created when this cell is re-run.
try:
    print("INPUT")
    input_rdd = sc.parallelize(doc_info_list)

    print("MAP")
    mapped_rdd = input_rdd.flatMap(my_map)

    print("REDUCE")
    threshold = 0.2 # Replace with the desired threshold value
    # NOTE(review): the cartesian product pairs every document with every
    # document in both orders, so the same two documents can be reported
    # twice, as (a, b) and as (b, a).
    reduced_pairs = mapped_rdd.cartesian(mapped_rdd).flatMap(lambda x: my_reduce(x[0], x[1],threshold))
    results = reduced_pairs.collect()

    for result in sorted(results):
        print(result)
finally:
    sc.stop()


INPUT
MAP
REDUCE
((4, 2), 0.24430977586273817)
((4, 3), 0.23661553124674634)
((4, 9), 0.20776676160449167)
((5, 1), 0.20287344196646448)
((5, 2), 0.21300829193208734)
((5, 3), 0.3127092108260041)
((5, 9), 0.20411435820862162)
((8, 1), 0.21087036194132178)
((9, 2), 0.27037729625719414)
((9, 3), 0.26325494003905425)
((9, 4), 0.20776676160449167)
((9, 5), 0.20411435820862162)


INPUT
MAP
REDUCE
((4, 9), 0.20776676160449167)
((5, 9), 0.20411435820862162)
