# Reading dataset

In [1]:
import json
import gzip

In [7]:
# Names made available to eval() when a line is evaluated, so that JSON-style
# literals resolve to their Python equivalents ('true' -> True, and so on).
# NOTE(review): despite its name, this mapping does not by itself make eval()
# safe to run on untrusted input.
safe_context = dict(
    true=True,
    false=False,
    null=None,
    nan=float('nan'),  # a bare `nan` token sometimes shows up as well
)

In [9]:
def parse(file_path):
    """Yield every record of a gzip-compressed, line-delimited file as a JSON string.

    Each non-blank line is decoded with ``json.loads`` first. Only when that
    fails is the line evaluated as a Python expression, with the names from
    ``safe_context`` in scope, so that dict-literal style lines are still
    accepted. The resulting object is re-serialised with ``json.dumps``.

    Lines that cannot be decoded or serialised are skipped rather than
    aborting the run; if any were skipped, a single summary message is
    printed once the file has been fully consumed.

    Args:
        file_path: Path of the gzip-compressed text file (read as UTF-8).

    Yields:
        str: One JSON document per successfully parsed input line.
    """
    eval_globals = None  # built lazily, only if some line is not strict JSON
    skipped = 0

    with gzip.open(file_path, 'rt', encoding='utf-8') as gzip_file:
        for line in gzip_file:
            if not line.strip():
                # Blank lines carry no record; ignore them without counting.
                continue
            try:
                try:
                    # Fast, safe path: the line is a regular JSON document.
                    record = json.loads(line)
                except ValueError:
                    if eval_globals is None:
                        # Private globals: keeps eval() from inserting
                        # '__builtins__' into the shared safe_context dict and
                        # withholds the builtins from the evaluated text.
                        eval_globals = {"__builtins__": {}, **safe_context}
                    # SECURITY: eval() executes the line as Python code. Emptying
                    # the builtins is NOT a sandbox; only run this on trusted data.
                    record = eval(line, eval_globals)
                serialized = json.dumps(record)
            except Exception:
                # Best effort: keep converting, but remember that data was dropped.
                skipped += 1
                continue
            yield serialized

    if skipped:
        print(f"parse: skipped {skipped} unparseable line(s) in {file_path}")

In [10]:
def convert_dataset(input_filename, output_filename, progress_every=1_000_000):
    """Convert a gzip-compressed record file into a JSON Lines text file.

    Every string yielded by ``parse(input_filename)`` is written to
    ``output_filename`` followed by a newline. The output file is opened in
    write mode, so any existing content is replaced.

    Args:
        input_filename: Path of the gzip-compressed input file.
        output_filename: Path of the UTF-8 text file to write.
        progress_every: Print a progress message each time this many records
            have been written (default: one million). Pass ``None`` or ``0``
            to disable progress messages.
    """
    print(f"Starting conversion: {input_filename} -> {output_filename}...")

    with open(output_filename, 'w', encoding='utf-8') as f_out:
        # Records are streamed one at a time; nothing is held in memory.
        for count, json_line in enumerate(parse(input_filename), start=1):
            f_out.write(json_line + '\n')

            # Periodic heartbeat so long conversions show they are still running.
            if progress_every and count % progress_every == 0:
                print(f"Processed {count} records...")

    print(f"Done! Saved to {output_filename}")

In [11]:
# Convert the gzipped Electronics file into a newline-delimited JSON file.
convert_dataset(
    input_filename="data/Electronics.json.gz",
    output_filename="data/reviews_electronics.json",
)

Starting conversion: data/Electronics.json.gz -> data/reviews_electronics.json...
Processed 1000000 records...
Processed 2000000 records...
Processed 3000000 records...
Processed 4000000 records...
Processed 5000000 records...
Processed 6000000 records...
Processed 7000000 records...
Processed 8000000 records...
Processed 9000000 records...
Processed 10000000 records...
Processed 11000000 records...
Processed 12000000 records...
Processed 13000000 records...
Processed 14000000 records...
Processed 15000000 records...
Processed 16000000 records...
Processed 17000000 records...
Processed 18000000 records...
Processed 19000000 records...
Processed 20000000 records...
Done! Saved to data/reviews_electronics.json


In [12]:
# Convert the gzipped Electronics metadata file into a newline-delimited JSON file.
convert_dataset(
    input_filename="data/meta_Electronics.json.gz",
    output_filename="data/metadata_electronics.json",
)

Starting conversion: data/meta_Electronics.json.gz -> data/metadata_electronics.json...
Done! Saved to data/metadata_electronics.json
