In [1]:
import re
import pandas as pd
from rdflib import RDF, RDFS, XSD, Graph, Literal, Namespace, URIRef
from rdflib.namespace import OWL
from sentence_transformers import SentenceTransformer, util
import ast
import gc 
import os
import time


def sanitize_for_uri(value) -> str:
    """
    Generic sanitization function for URIs

    :param value: value to sanitize

    :return: sanitized value
    """
    return re.sub(r"[^a-zA-Z0-9_]", "", str(value))

UNICA = Namespace("https://github.com/tail-unica/kgeats/")
SCHEMA = Namespace("https://schema.org/")

dizionario_hum = {}
dizionario_off = {}

hum_file = "../csv_file/pp_recipes_normalized_by_pipeline.csv"
off_file = "../csv_file/off_normalized_final.csv"
hum_off_file = "../csv_file/file_off_hummus_filtered_90.csv"
file_output_nt =  "../csv_file/ontology_merge.nt"

chunksize = 100000
cont_chunk = 0

for df_off_chunk in pd.read_csv(off_file, sep="\t", on_bad_lines="skip", chunksize=chunksize, low_memory=False, usecols=["product_name_normalized", "code"]):
    print(f"Processing rows off from {chunksize * cont_chunk} to {chunksize * (cont_chunk+1)}")
    
    for idx, row in df_off_chunk.iterrows():
        if(row["product_name_normalized"] != None and row["product_name_normalized"] != ""):
            id = URIRef(value=UNICA[f"Recipe_off_{row["code"]}"])
            if id != None:
                if row["product_name_normalized"] not in dizionario_off:
                    dizionario_off[row["product_name_normalized"]] = [id]
                else: 
                    dizionario_off[row["product_name_normalized"]].append(id)
    cont_chunk += 1

cont_chunk = 0
for df_hum_chunk in pd.read_csv(hum_file, sep=";", on_bad_lines="skip", chunksize=chunksize, low_memory=False, usecols=["title_normalized", "recipe_id"]):
    print(f"Processing rows hummus from {chunksize * cont_chunk} to {chunksize * (cont_chunk+1)}")
    
    for idx, row in df_hum_chunk.iterrows():
        if(row["title_normalized"] != None and row["title_normalized"] != ""):
            id = URIRef(UNICA[f"Recipe_hummus{sanitize_for_uri(row['recipe_id'])}"])
            if id != None:
                if row["title_normalized"] not in dizionario_hum:
                    dizionario_hum[row["title_normalized"]] = [id]
                else: 
                    dizionario_hum[row["title_normalized"]].append(id)
    cont_chunk += 1


Processing rows off from 0 to 100000
Processing rows off from 100000 to 200000
Processing rows off from 200000 to 300000
Processing rows off from 300000 to 400000
Processing rows off from 400000 to 500000
Processing rows off from 500000 to 600000
Processing rows off from 600000 to 700000
Processing rows off from 700000 to 800000
Processing rows off from 800000 to 900000
Processing rows off from 900000 to 1000000
Processing rows off from 1000000 to 1100000
Processing rows off from 1100000 to 1200000
Processing rows off from 1200000 to 1300000
Processing rows off from 1300000 to 1400000
Processing rows off from 1400000 to 1500000
Processing rows off from 1500000 to 1600000
Processing rows off from 1600000 to 1700000
Processing rows off from 1700000 to 1800000
Processing rows off from 1800000 to 1900000
Processing rows off from 1900000 to 2000000
Processing rows off from 2000000 to 2100000
Processing rows off from 2100000 to 2200000
Processing rows off from 2200000 to 2300000
Processing r

In [None]:
print(len(dizionario_hum))
print(len(dizionario_off))


max

313821
942068


In [8]:
i = 0
for ricetta in dizionario_off["bread"]:
    print(ricetta)
    i += 1

print(i)


i = 0
for ricetta in dizionario_hum["bread"]:
    print(ricetta)
    i += 1

print(i)

https://github.com/tail-unica/kgeats/Recipe_off_145
https://github.com/tail-unica/kgeats/Recipe_off_147
https://github.com/tail-unica/kgeats/Recipe_off_105753078
https://github.com/tail-unica/kgeats/Recipe_off_520002997
https://github.com/tail-unica/kgeats/Recipe_off_6231
https://github.com/tail-unica/kgeats/Recipe_off_6626
https://github.com/tail-unica/kgeats/Recipe_off_682009841
https://github.com/tail-unica/kgeats/Recipe_off_719008861
https://github.com/tail-unica/kgeats/Recipe_off_794500002
https://github.com/tail-unica/kgeats/Recipe_off_894242920
https://github.com/tail-unica/kgeats/Recipe_off_946803116
https://github.com/tail-unica/kgeats/Recipe_off_978623508
https://github.com/tail-unica/kgeats/Recipe_off_979332830
https://github.com/tail-unica/kgeats/Recipe_off_979798194
https://github.com/tail-unica/kgeats/Recipe_off_979798206
https://github.com/tail-unica/kgeats/Recipe_off_982614051
https://github.com/tail-unica/kgeats/Recipe_off_1001244147
https://github.com/tail-unica/kgeat

In [None]:
numchunk = 0
chunksize = 1000
contatore = 0
hum_off_file = "../csv_file/file_off_hummus_filtered_99.csv"

total_lines = sum(1 for _ in open(hum_off_file, encoding="utf-8")) - 1
total_chunks = (total_lines // chunksize) + 1
start_total = time.time()


for df_merge_chunk in pd.read_csv(hum_off_file, sep=",", on_bad_lines="skip", chunksize=chunksize, low_memory=False, usecols=["title_normalized", "product_name_normalized"]):
    chunk_start = time.time()
    print(f"\nProcessing chunk {numchunk+1}/{total_chunks}")

    for row in df_merge_chunk.itertuples(index=False):
        title = row.title_normalized
        product = row.product_name_normalized

        if title in dizionario_hum and product in dizionario_off:
            for hum_ricetta in dizionario_hum[title]:
                #massimo_ricette = 1000
                for off_ricetta in dizionario_off[product]: 
                    contatore += 1
                    #massimo_ricette -= 1
                    #if massimo_ricette == 0:
                    #    break

    del df_merge_chunk
    gc.collect() 

    chunk_time = time.time() - chunk_start
    avg_time_per_chunk = (time.time() - start_total) / (numchunk + 1)
    remaining_chunks = total_chunks - (numchunk + 1)
    est_remaining = avg_time_per_chunk * remaining_chunks
    print(f"Chunk time: {chunk_time:.2f}s — Estimated remaining: {est_remaining/60:.1f} min")
    print(contatore)
    numchunk += 1

total_time = time.time() - start_total
print(f"\nTotal processing time: {total_time/60:.2f} minutes")


Processing chunk 1/2200
Chunk time: 85.08s — Estimated remaining: 3118.4 min
588685069

Processing chunk 2/2200
Chunk time: 2.13s — Estimated remaining: 1597.6 min
603449496

Processing chunk 3/2200
Chunk time: 2.31s — Estimated remaining: 1092.8 min
620131197

Processing chunk 4/2200
Chunk time: 1.58s — Estimated remaining: 833.7 min
627163183

Processing chunk 5/2200
Chunk time: 1.57s — Estimated remaining: 678.1 min
631169208

Processing chunk 6/2200
Chunk time: 1.51s — Estimated remaining: 574.1 min
635660116

Processing chunk 7/2200
Chunk time: 1.61s — Estimated remaining: 500.3 min
637933686

Processing chunk 8/2200
Chunk time: 1.47s — Estimated remaining: 444.2 min
639679369

Processing chunk 9/2200
Chunk time: 1.43s — Estimated remaining: 400.5 min
641842081

Processing chunk 10/2200
Chunk time: 1.34s — Estimated remaining: 365.2 min
643231966

Processing chunk 11/2200
Chunk time: 1.36s — Estimated remaining: 336.3 min
644441870

Processing chunk 12/2200
Chunk time: 1.42s — Es

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f6a8be3b230>>
Traceback (most recent call last):
  File "/home/gzedda/miniconda3/envs/ambientez/lib/python3.13/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Chunk time: 1.40s — Estimated remaining: 44.9 min
692685423

Processing chunk 454/2200
Chunk time: 1.37s — Estimated remaining: 44.8 min
692695104

Processing chunk 455/2200
Chunk time: 1.31s — Estimated remaining: 44.8 min
692702405

Processing chunk 456/2200
Chunk time: 1.36s — Estimated remaining: 44.8 min
692711388

Processing chunk 457/2200
Chunk time: 1.29s — Estimated remaining: 44.7 min
692724114

Processing chunk 458/2200
Chunk time: 1.35s — Estimated remaining: 44.7 min
692737991

Processing chunk 459/2200
Chunk time: 1.46s — Estimated remaining: 44.7 min
692754995

Processing chunk 460/2200


In [None]:
numchunk = 0
chunksize = 10

total_lines = sum(1 for _ in open(hum_off_file, encoding="utf-8")) - 1
total_chunks = (total_lines // chunksize) + 1
start_total = time.time()

with open(file_output_nt, "w", encoding="utf-8") as f_out:

    for df_merge_chunk in pd.read_csv(hum_off_file, sep=",", on_bad_lines="skip", chunksize=chunksize, low_memory=False, usecols=["title_normalized", "product_name_normalized"]):
        chunk_start = time.time()
        if numchunk % 100 == 0:
            print(f"\nProcessing chunk {numchunk+1}/{total_chunks}")

        for row in df_merge_chunk.itertuples(index=False):
            title = row.title_normalized
            product = row.product_name_normalized

            if title in dizionario_hum and product in dizionario_off:
                for hum_ricetta in dizionario_hum[title]:
                    for off_ricetta in dizionario_off[product]: 
                        triple_str = f"<{off_ricetta}> <https://schema.org/sameAs> <{hum_ricetta}> .\n"
                        f_out.write(triple_str)

        del df_merge_chunk
        gc.collect() 

        chunk_time = time.time() - chunk_start
        avg_time_per_chunk = (time.time() - start_total) / (numchunk + 1)
        remaining_chunks = total_chunks - (numchunk + 1)
        est_remaining = avg_time_per_chunk * remaining_chunks
        if numchunk % 100 == 0:
            print(f"Chunk time: {chunk_time:.2f}s — Estimated remaining: {est_remaining/60:.1f} min")
        numchunk += 1

    total_time = time.time() - start_total
    print(f"\nTotal processing time: {total_time/60:.2f} minutes")



Processing chunk 1/12374
Chunk time: 360.23s — Estimated remaining: 74288.0 min

Processing chunk 101/12374
Chunk time: 1.15s — Estimated remaining: 1006.0 min

Processing chunk 201/12374
Chunk time: 1.04s — Estimated remaining: 612.0 min

Processing chunk 301/12374
Chunk time: 1.13s — Estimated remaining: 480.2 min

Processing chunk 401/12374
Chunk time: 1.12s — Estimated remaining: 413.2 min

Processing chunk 501/12374
Chunk time: 1.03s — Estimated remaining: 369.5 min


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fc491033230>>
Traceback (most recent call last):
  File "/home/gzedda/miniconda3/envs/ambientez/lib/python3.13/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
