# Removing spin_id, 3dmodel_id, and node (keeping node_name for single-node entries)


In [1]:
import kagglehub
from collections import Counter
import os
import json
from glob import glob


In [2]:
import os
import json
from glob import glob
import kagglehub

# Download the dataset and point at the folder holding the raw listing files.
dataset_dir = kagglehub.dataset_download("shaikmdirfan/images")
metadata_dir = os.path.join(dataset_dir, 'metadata/metadata')

# Cleaned copies go to a separate directory; the raw files are only read.
output_dir = "/kaggle/working/cleaned_metadata"
os.makedirs(output_dir, exist_ok=True)

json_files = glob(os.path.join(metadata_dir, "listings_*.json"))

for file_path in json_files:
    cleaned_data = []
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            text = raw_line.strip()
            if text == "":
                continue

            # Each non-blank line is parsed as one JSON record.
            try:
                record = json.loads(text)
            except json.JSONDecodeError as e:
                print(f" Skipping malformed JSON in {file_path}: {e}")
                continue

            # Drop the identifier keys outright.
            for key in ("spin_id", "3dmodel_id"):
                if key in record:
                    del record[key]

            # A "node" list is always removed; its node_name is kept only
            # when the list holds exactly one dict that has that key.
            nodes = record.get("node")
            if isinstance(nodes, list):
                only_node = nodes[0] if len(nodes) == 1 else None
                if isinstance(only_node, dict) and "node_name" in only_node:
                    record["node_name"] = only_node["node_name"]
                record.pop("node", None)

            cleaned_data.append(record)

    # Write the cleaned records as JSON lines under the same file name.
    output_path = os.path.join(output_dir, os.path.basename(file_path))
    with open(output_path, "w", encoding="utf-8") as dst:
        for record in cleaned_data:
            dst.write(json.dumps(record, ensure_ascii=False))
            dst.write("\n")

print(f"✅ Cleaned {len(json_files)} files. Output saved to: {output_dir}")


✅ Cleaned 16 files. Output saved to: /kaggle/working/cleaned_metadata


In [3]:
# Report how many non-blank lines each raw listing file contains.
json_files = glob(os.path.join(metadata_dir, "listings_*.json"))
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        count = sum(1 for raw_line in src if raw_line.strip())
    file_name = os.path.basename(file_path)
    print(f"{file_name}: {count} entries")

listings_3.json: 9232 entries
listings_d.json: 9232 entries
listings_b.json: 9232 entries
listings_a.json: 9232 entries
listings_4.json: 9232 entries
listings_8.json: 9232 entries
listings_9.json: 9232 entries
listings_f.json: 9222 entries
listings_0.json: 9232 entries
listings_6.json: 9232 entries
listings_c.json: 9232 entries
listings_2.json: 9232 entries
listings_5.json: 9232 entries
listings_1.json: 9232 entries
listings_e.json: 9232 entries
listings_7.json: 9232 entries


In [4]:
cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

# Top-level keys that must not appear in a cleaned record. "node" is the
# container dropped by the cleaning cell; "node_id" is checked because the
# success message below explicitly claims it is absent.
UNWANTED_KEYS = ("spin_id", "3dmodel_id", "node_id", "node")


def find_unwanted_keys(entry, unwanted=UNWANTED_KEYS):
    """Return the keys from ``unwanted`` present at the top level of ``entry``.

    The result preserves the order of ``unwanted`` and is empty when the
    record is clean.
    """
    return [key for key in unwanted if key in entry]


errors_found = False

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, start=1):
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"JSON error in {file_path}, line {line_num}: {e}")
                errors_found = True
                continue

            # Report every offending key with its exact location.
            for key in find_unwanted_keys(entry):
                print(f"'{key}' found in {file_path}, line {line_num}")
                errors_found = True

if not errors_found:
    print("All cleaned files are verified — no 'spin_id', '3dmodel_id', or 'node_id' found.")
else:
    print("Some unwanted keys still exist. See messages above.")


All cleaned files are verified — no 'spin_id', '3dmodel_id', or 'node_id' found.


# Removing data with language tags other than en_*

In [5]:

cleaned_dir = "/kaggle/working/cleaned_metadata"

json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

# Fields scanned for items carrying a "language_tag" key.
fields_with_language_tag = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]
language_tags = set()

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON error in {file_path}: {e}")
                continue

            for field in fields_with_language_tag:
                # Only list-valued fields are inspected.
                if field not in record or not isinstance(record[field], list):
                    continue
                for item in record[field]:
                    tag = item.get("language_tag")
                    if tag:
                        language_tags.add(tag)

# Print the distinct tags in sorted order, one per line.
print("Unique language_tag values:")
for lang in sorted(language_tags):
    print(lang)



Unique language_tag values:
ar_AE
cs_CZ
de_DE
en_AE
en_AU
en_CA
en_GB
en_IN
en_SG
en_US
es_ES
es_MX
es_US
fr_CA
fr_FR
he_IL
hi_IN
it_IT
ja_JP
kn_IN
ko_KR
ml_IN
mr_IN
nl_NL
pl_PL
pt_BR
pt_PT
sv_SE
ta_IN
te_IN
tr_TR
zh_CN
zh_TW


In [6]:
# Report how many distinct values the `language_tags` set currently holds.
print(f"\nNumber of unique language_tag values: {len(language_tags)}")


Number of unique language_tag values: 33


In [7]:
import os
import json
from glob import glob

cleaned_dir = "/kaggle/working/cleaned_metadata"

fields_with_language_tag = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]


def is_english_or_missing(tag):
    """Return True when ``tag`` is None or a string starting with "en_" (case-insensitive)."""
    return tag is None or (isinstance(tag, str) and tag.lower().startswith("en_"))


def keep_english_items(items):
    """Return the items of ``items`` whose language tag is English or absent.

    Dict items are judged by their "language_tag" key. Items that are not
    dicts have no tag to inspect and are kept unchanged, so running this on
    a field that has already been flattened to plain values does not raise
    and does not drop anything.
    """
    kept = []
    for item in items:
        tag = item.get("language_tag", None) if isinstance(item, dict) else None
        if is_english_or_missing(tag):
            kept.append(item)
    return kept


json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

for file_path in json_files:
    filtered_entries = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            # Filter each specified field; non-list values are left untouched.
            for field in fields_with_language_tag:
                if field in entry and isinstance(entry[field], list):
                    entry[field] = keep_english_items(entry[field])

            filtered_entries.append(entry)

    # Overwrite the file in place. All records were read into memory above,
    # but the write itself is not atomic: an interruption here leaves a
    # truncated file that has to be regenerated from the raw metadata.
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in filtered_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

print("Files updated: retained items without 'language_tag' and removed non-English ones.")


Files updated: retained items without 'language_tag' and removed non-English ones.


In [8]:

cleaned_dir = "/kaggle/working/cleaned_metadata"

json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

# Same field list as the filtering step; every list-valued field is scanned.
fields_with_language_tag = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]
language_tags = set()

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON error in {file_path}: {e}")
                continue

            list_fields = (
                name for name in fields_with_language_tag
                if name in record and isinstance(record[name], list)
            )
            for name in list_fields:
                for item in record[name]:
                    tag = item.get("language_tag")
                    if tag:
                        language_tags.add(tag)

# Show whichever tags are present in the files, sorted.
print("Unique language_tag values:")
for lang in sorted(language_tags):
    print(lang)



Unique language_tag values:
en_AE
en_AU
en_CA
en_GB
en_IN
en_SG
en_US


In [9]:
import json
from glob import glob
import os

cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

fields_to_count = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]

# One counter per field: the number of records in which the key is present.
field_counts = dict.fromkeys(fields_to_count, 0)

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            for name in fields_to_count:
                if name in record:
                    field_counts[name] = field_counts[name] + 1

# Print the counts for each field
print("\nField Occurrence Counts:")
for field, count in field_counts.items():
    print(f"{field}: {count}")




Field Occurrence Counts:
brand: 147643
bullet_point: 131570
fabric_type: 8193
finish_type: 1536
item_keywords: 126776
item_name: 147702
item_shape: 5066
material: 53585
model_name: 81579
model_number: 124091
model_year: 7765
pattern: 4590
product_description: 4240
style: 43188
color: 116180


In [10]:
# Defined here as well so this cell does not rely on a variable left behind
# by an earlier cell.
cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

# Fields to process
fields_with_language_tag = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]


def extract_values(items):
    """Flatten a list of ``{"value": ...}`` dicts into a list of the values.

    Dict items contribute their "value" and are dropped when they have no
    such key. Items that are not dicts are treated as already-flattened
    values and passed through unchanged, so applying this twice gives the
    same result as applying it once. (Without that guard a second run would
    perform a substring test on strings and raise on integers.)
    """
    flattened = []
    for item in items:
        if isinstance(item, dict):
            if "value" in item:
                flattened.append(item["value"])
        else:
            flattened.append(item)
    return flattened


for file_path in json_files:
    updated_entries = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            for field in fields_with_language_tag:
                if field in entry and isinstance(entry[field], list):
                    # Keep only 'value'; every other key of the item is discarded.
                    entry[field] = extract_values(entry[field])

            updated_entries.append(entry)

    # Rewrite the file in place, one JSON record per line.
    with open(file_path, "w", encoding="utf-8") as f:
        for entry in updated_entries:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")

print("Successfully removed 'language_tag' and converted field values to lists of strings.")


Successfully removed 'language_tag' and converted field values to lists of strings.


In [11]:
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

fields_to_check = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]

invalid_entries = 0
valid_entries = 0

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            for field in fields_to_check:
                if field not in entry:
                    continue

                # model_year must be a list of ints; every other field a list of strs.
                expected_type = int if field == "model_year" else str
                values = entry[field]
                well_formed = isinstance(values, list) and all(
                    isinstance(x, expected_type) for x in values
                )

                if well_formed:
                    valid_entries += 1
                else:
                    invalid_entries += 1
                    print(f"Invalid format in field '{field}' for item_id: {entry.get('item_id')}")

print("\nVerification Complete")
print(f"Valid string list fields: {valid_entries}")
print(f"Invalid entries (non-list or non-string values): {invalid_entries}")



Verification Complete
Valid string list fields: 1003704
Invalid entries (non-list or non-string values): 0


In [12]:
import json
from glob import glob
import os

cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

fields_to_count = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]

# Start every field at zero; a field is counted once per record that has the key.
field_counts = dict.fromkeys(fields_to_count, 0)

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            for name in fields_to_count:
                if name in record:
                    field_counts[name] = field_counts[name] + 1

print("\nField Occurrence Counts:")
for field, count in field_counts.items():
    print(f"{field}: {count}")




Field Occurrence Counts:
brand: 147643
bullet_point: 131570
fabric_type: 8193
finish_type: 1536
item_keywords: 126776
item_name: 147702
item_shape: 5066
material: 53585
model_name: 81579
model_number: 124091
model_year: 7765
pattern: 4590
product_description: 4240
style: 43188
color: 116180


* Field Occurrence Counts:
* brand: 147643
* bullet_point: 131570
* fabric_type: 8193
* finish_type: 1536
* item_keywords: 126776
* item_name: 147702
* item_shape: 5066
* material: 53585
* model_name: 81579
* model_number: 124091
* model_year: 7765
* pattern: 4590
* product_description: 4240
* style: 43188
* color: 116180

In [13]:
# Count non-blank lines per file in json_files, plus the grand total.
d = 0  # running total across all files
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        # Blank lines are not counted.
        count = sum(1 for raw_line in src if raw_line.strip())
    d += count
    file_name = os.path.basename(file_path)
    print(f"{file_name}: {count} entries")
print(d)

listings_0.json: 9232 entries
listings_5.json: 9232 entries
listings_d.json: 9232 entries
listings_2.json: 9232 entries
listings_a.json: 9232 entries
listings_3.json: 9232 entries
listings_6.json: 9232 entries
listings_1.json: 9232 entries
listings_b.json: 9232 entries
listings_f.json: 9222 entries
listings_8.json: 9232 entries
listings_7.json: 9232 entries
listings_c.json: 9232 entries
listings_9.json: 9232 entries
listings_4.json: 9232 entries
listings_e.json: 9232 entries
147702


# Checking unique Domain names

In [14]:
# Gather every distinct, non-empty "domain_name" value found in the files.
domain_names = set()
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            domain_name = record.get("domain_name")
            if domain_name:
                domain_names.add(domain_name)

print("Unique domain_name values:")
for domain in sorted(domain_names):
    print(domain)

Unique domain_name values:
amazon.ae
amazon.ca
amazon.co.jp
amazon.co.uk
amazon.com
amazon.com.au
amazon.com.br
amazon.com.mx
amazon.com.tr
amazon.com/go
amazon.de
amazon.es
amazon.fr
amazon.in
amazon.it
amazon.nl
amazon.pl
amazon.sa
amazon.se
amazon.sg
amazondistribution.in
fresh.amazon.com
primenow.amazon.ca
primenow.amazon.co.jp
primenow.amazon.co.uk
primenow.amazon.com
primenow.amazon.de
primenow.amazon.es
primenow.amazon.fr
primenow.amazon.it
wholefoodsmarket.com
woot.com


# Checking unique item_ids

In [15]:
# c counts parsed records, d counts records with a truthy item_id,
# and item_ids holds the distinct ids seen.
item_ids = set()
c = 0
d = 0
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            c += 1
            item_id = record.get("item_id")
            if not item_id:
                continue
            d += 1
            item_ids.add(item_id)

print(f"Number of unique item_id values: {len(item_ids)}")
print(f"Total Entries:{c}")
print(f"Total Entries with item_id:{d}")

Number of unique item_id values: 145615
Total Entries:147702
Total Entries with item_id:147702


# Comparing raw and normalized values in item_dimensions and item_weight

In [16]:
dimensions_mismatch = 0
dimensions_checked = 0

weights_mismatch = 0
weights_checked = 0


def _differs_from_normalized(measure):
    """Compare a measurement's unit/value with its "normalized_value".

    Returns None when either side lacks "unit" or "value" (not comparable),
    otherwise True when the unit or the value differs and False when both match.
    """
    norm = measure.get("normalized_value", {})
    needed = ["unit", "value"]
    if not all(k in measure for k in needed) or not all(k in norm for k in needed):
        return None
    return measure["unit"] != norm["unit"] or measure["value"] != norm["value"]


for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Unparseable lines are skipped without a message.
                continue

            # item_dimensions: a dict whose dict-valued members are compared.
            dims = record.get("item_dimensions", {})
            if isinstance(dims, dict):
                for dim_val in dims.values():
                    if not isinstance(dim_val, dict):
                        continue
                    differs = _differs_from_normalized(dim_val)
                    if differs is None:
                        continue
                    dimensions_checked += 1
                    if differs:
                        dimensions_mismatch += 1

            # item_weight: a list whose members are each compared.
            weights = record.get("item_weight", [])
            if isinstance(weights, list):
                for w in weights:
                    differs = _differs_from_normalized(w)
                    if differs is None:
                        continue
                    weights_checked += 1
                    if differs:
                        weights_mismatch += 1

print("item_dimensions:")
print(f"  Total checked: {dimensions_checked}")
print(f"  Mismatches: {dimensions_mismatch}")

print("item_weight:")
print(f"  Total checked: {weights_checked}")
print(f"  Mismatches: {weights_mismatch}")


item_dimensions:
  Total checked: 130014
  Mismatches: 26040
item_weight:
  Total checked: 106194
  Mismatches: 78270


In [17]:


# Resolved here so the check does not depend on a file list left behind by
# an earlier cell.
cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

missing_keys_report = {
    "item_dimensions": [],
    "item_weight": []
}

required_keys = ["value", "unit", "normalized_value"]


def missing_required_keys(measurements, required=("value", "unit", "normalized_value")):
    """List the required keys missing from each measurement.

    ``measurements`` may be a list of measurement dicts or a dict mapping a
    name to a measurement dict; the other cells of this notebook read
    item_weight as a list and item_dimensions as a dict. Checking only
    lists skipped item_dimensions entirely, which made its "0 missing"
    result meaningless. Any other type yields an empty result.

    Returns a list of ``(label, missing_key)`` tuples, where ``label`` is the
    list index or the dict key of the measurement. A measurement that is not
    a dict is reported as missing every required key.
    """
    if isinstance(measurements, dict):
        labelled = measurements.items()
    elif isinstance(measurements, list):
        labelled = enumerate(measurements)
    else:
        return []

    missing = []
    for label, item in labelled:
        for key in required:
            if not isinstance(item, dict) or key not in item:
                missing.append((label, key))
    return missing


for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, 1):
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue

            for field in ["item_dimensions", "item_weight"]:
                if field not in entry:
                    continue
                for label, key in missing_required_keys(entry[field], required_keys):
                    missing_keys_report[field].append({
                        "file": file_path,
                        "line": line_number,
                        "missing_key": key,
                        "entry_index": label
                    })

for field, issues in missing_keys_report.items():
    print(f"\n {len(issues)} missing '{field}' keys:")
    for issue in issues[:10]:  # Show only first 10 for brevity
        print(issue)

print("\nCheck complete.")



 0 missing 'item_dimensions' keys:

 0 missing 'item_weight' keys:

Check complete.


In [18]:


total_files = 0  # incremented once per parsed record

total_weight_checked = 0
weight_missing_value_or_unit = 0
weight_found_normalized_value = 0

total_dimensions_checked = 0
dimensions_missing_value_or_unit = 0
dimensions_found_normalized_value = 0

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue
            total_files += 1

            # item_weight is inspected only when it is a list.
            if "item_weight" in record and isinstance(record["item_weight"], list):
                for weight in record["item_weight"]:
                    total_weight_checked += 1
                    if not ("value" in weight and "unit" in weight):
                        weight_missing_value_or_unit += 1
                    if "normalized_value" in weight:
                        weight_found_normalized_value += 1

            # item_dimensions is inspected only when it is a dict.
            if "item_dimensions" in record and isinstance(record["item_dimensions"], dict):
                for details in record["item_dimensions"].values():
                    total_dimensions_checked += 1
                    if not ("value" in details and "unit" in details):
                        dimensions_missing_value_or_unit += 1
                    if "normalized_value" in details:
                        dimensions_found_normalized_value += 1

print(" Validation Summary:")
print(f"Total entries processed: {total_files}\n")

print(" item_weight checks:")
print(f"  Total checked: {total_weight_checked}")
print(f"  Missing 'value' or 'unit': {weight_missing_value_or_unit}")
print(f"  Containing 'normalized_value': {weight_found_normalized_value}\n")

print(" item_dimensions checks:")
print(f"  Total checked: {total_dimensions_checked}")
print(f"  Missing 'value' or 'unit': {dimensions_missing_value_or_unit}")
print(f"  Containing 'normalized_value': {dimensions_found_normalized_value}")

# The all-clear line is printed only when all four problem counters are zero.
problem_counts = (
    weight_missing_value_or_unit,
    weight_found_normalized_value,
    dimensions_missing_value_or_unit,
    dimensions_found_normalized_value,
)
if not any(problem_counts):
    print("\n All entries are valid: have 'value' and 'unit', and no 'normalized_value'.")


 Validation Summary:
Total entries processed: 147702

 item_weight checks:
  Total checked: 106194
  Missing 'value' or 'unit': 0
  Containing 'normalized_value': 106194

 item_dimensions checks:
  Total checked: 130014
  Missing 'value' or 'unit': 0
  Containing 'normalized_value': 130014


In [19]:

json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

for file_path in json_files:
    updated_entries = []

    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            # Strip the key from every member of a list-valued item_weight.
            if "item_weight" in record and isinstance(record["item_weight"], list):
                for weight in record["item_weight"]:
                    weight.pop("normalized_value", None)

            # Strip the key from every dict-valued member of a dict-valued item_dimensions.
            if "item_dimensions" in record and isinstance(record["item_dimensions"], dict):
                for details in record["item_dimensions"].values():
                    if isinstance(details, dict):
                        details.pop("normalized_value", None)

            updated_entries.append(record)

    # Rewrite the same file, one JSON record per line.
    with open(file_path, "w", encoding="utf-8") as dst:
        for record in updated_entries:
            dst.write(json.dumps(record, ensure_ascii=False))
            dst.write("\n")

print(" Removed 'normalized_value' from item_weight and item_dimensions in all files.")


 Removed 'normalized_value' from item_weight and item_dimensions in all files.


In [20]:
total_files = 0  # incremented once per parsed record

total_weight_checked = 0
weight_has_value = 0
weight_has_unit = 0
weight_found_normalized_value = 0

total_dimensions_checked = 0
dimensions_has_value = 0
dimensions_has_unit = 0
dimensions_found_normalized_value = 0

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue
            total_files += 1

            # Tally key presence for each member of a list-valued item_weight.
            if "item_weight" in record and isinstance(record["item_weight"], list):
                for weight in record["item_weight"]:
                    total_weight_checked += 1
                    weight_has_value += int("value" in weight)
                    weight_has_unit += int("unit" in weight)
                    weight_found_normalized_value += int("normalized_value" in weight)

            # Same tally for each member of a dict-valued item_dimensions.
            if "item_dimensions" in record and isinstance(record["item_dimensions"], dict):
                for details in record["item_dimensions"].values():
                    total_dimensions_checked += 1
                    dimensions_has_value += int("value" in details)
                    dimensions_has_unit += int("unit" in details)
                    dimensions_found_normalized_value += int("normalized_value" in details)

print(" Validation Summary:")
print(f"Total entries processed: {total_files}\n")

print(" item_weight checks:")
print(f"  Total checked: {total_weight_checked}")
print(f"  Entries with 'value': {weight_has_value}")
print(f"  Entries with 'unit': {weight_has_unit}")
print(f"  Containing 'normalized_value': {weight_found_normalized_value}\n")

print(" item_dimensions checks:")
print(f"  Total checked: {total_dimensions_checked}")
print(f"  Entries with 'value': {dimensions_has_value}")
print(f"  Entries with 'unit': {dimensions_has_unit}")
print(f"  Containing 'normalized_value': {dimensions_found_normalized_value}")


 Validation Summary:
Total entries processed: 147702

 item_weight checks:
  Total checked: 106194
  Entries with 'value': 106194
  Entries with 'unit': 106194
  Containing 'normalized_value': 0

 item_dimensions checks:
  Total checked: 130014
  Entries with 'value': 130014
  Entries with 'unit': 130014
  Containing 'normalized_value': 0


# Checking mapping between images and json files

In [21]:
import os
import json
from glob import glob

cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

# Distinct ids seen as a main image, as an "other" image, and in either role.
main_image_ids = set()
other_image_ids = set()
all_unique_ids = set()

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                # Unparseable lines are skipped without a message.
                continue

            main_id = record.get("main_image_id")
            if main_id:
                main_image_ids.add(main_id)
                all_unique_ids.add(main_id)

            other_ids = record.get("other_image_id", [])
            if isinstance(other_ids, list):
                # Falsy ids are ignored.
                for oid in filter(None, other_ids):
                    other_image_ids.add(oid)
                    all_unique_ids.add(oid)

print(f"Unique main_image_id count: {len(main_image_ids)}")
print(f"Unique other_image_ids count: {len(other_image_ids)}")
print(f"Unique ids count: {len(all_unique_ids)}")

Unique main_image_id count: 123511
Unique other_image_ids count: 275505
Unique ids count: 398170


In [22]:
# Defined here as well so this cell does not rely on a variable left behind
# by an earlier cell.
cleaned_dir = "/kaggle/working/cleaned_metadata"
json_files = glob(os.path.join(cleaned_dir, "listings_*.json"))

fields_to_dedup = [
    "brand", "bullet_point", "fabric_type", "finish_type", "item_keywords",
    "item_name", "item_shape", "material", "model_name", "model_number",
    "model_year", "pattern", "product_description", "style", "color"
]


def deduplicate_list(lst):
    """Return ``lst`` without repeated items, keeping first-occurrence order.

    Hashable items are de-duplicated through a dict. If any item is
    unhashable (for example a dict or a list), the function falls back to an
    equality-based scan instead of raising TypeError.
    """
    try:
        return list(dict.fromkeys(lst))
    except TypeError:
        unique = []
        for item in lst:
            if item not in unique:
                unique.append(item)
        return unique


examples_shown = 0
MAX_EXAMPLES = 3

for file_path in json_files:
    updated_lines = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line in {file_path}: {e}")
                continue

            item_id = entry.get("item_id", "UNKNOWN_ID")

            for field in fields_to_dedup:
                if field in entry and isinstance(entry[field], list):
                    original = entry[field]
                    deduped = deduplicate_list(original)
                    if len(original) != len(deduped):
                        entry[field] = deduped
                        # Show a few before/after samples only.
                        if examples_shown < MAX_EXAMPLES:
                            print(f"\nitem_id: {item_id} | Field: '{field}'")
                            print("  ➤ Before:", original)
                            print("  ➤ After: ", deduped)
                            examples_shown += 1

            # ensure_ascii=False matches the other cells that write these
            # files, so non-ASCII text is written as-is rather than as
            # \uXXXX escapes.
            updated_lines.append(json.dumps(entry, ensure_ascii=False))

    with open(file_path, "w", encoding="utf-8") as f:
        for line in updated_lines:
            f.write(line + "\n")

print("\n✅ Deduplication complete and changes saved.")



item_id: B07CTPR73M | Field: 'item_keywords'
  ➤ Before: ['love', 'loveseat', 'queen', 'for', 'couch', 'chesterfield', 'rolled', 'couches', 'button', 'homelegance', 'red', 'daybed', 'and', 'trundle', 'savonburg', 'power', 'arm', 'reclining', 'farmhouse', 'a', 'sofa', 'loveseats', 'living', 'set', 'sets', 'room', 'leather', 'upholstered', 'seat', 'with', 'size', 'sofas', 'fabric', 'silver', 'tufted', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanity', 'outdoor fountain', 'wind spinners', 'windmill', 'vanit

In [23]:
# Re-count non-blank lines per file in json_files, plus the grand total.
d = 0  # running total across all files
for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as src:
        non_blank = [raw_line for raw_line in src if raw_line.strip()]
    count = len(non_blank)
    d += count
    file_name = os.path.basename(file_path)
    print(f"{file_name}: {count} entries")
print(d)

listings_0.json: 9232 entries
listings_5.json: 9232 entries
listings_d.json: 9232 entries
listings_2.json: 9232 entries
listings_a.json: 9232 entries
listings_3.json: 9232 entries
listings_6.json: 9232 entries
listings_1.json: 9232 entries
listings_b.json: 9232 entries
listings_f.json: 9222 entries
listings_8.json: 9232 entries
listings_7.json: 9232 entries
listings_c.json: 9232 entries
listings_9.json: 9232 entries
listings_4.json: 9232 entries
listings_e.json: 9232 entries
147702


42 images don't have JSON data.