# Exercise 2 (GPU) — JSON Schema and Data Validation — Solution

We validate a small GPU product dataset using JSON Schema.


## 1. Setup and Data Loading
Import libraries and load the GPU dataset.


In [1]:
import json
from pprint import pprint

import pandas as pd
from jsonschema import Draft7Validator, FormatChecker, ValidationError, validate

### 1.1 Load and Inspect GPU Sample Data


In [2]:
# Read the GPU dataset into memory, then report its size and show the first entry.
with open('../Task/input/gpu_products.json', 'r', encoding='utf-8') as dataset_file:
    products_data = json.loads(dataset_file.read())

print(f'Total records: {len(products_data)}')
first_product = products_data[0]
print('First record keys:', list(first_product.keys()))
pprint(first_product)


Total records: 10
First record keys: ['id', 'name', 'brand', 'model', 'chipset', 'memory_gb', 'memory_type', 'clock_speed_mhz', 'tdp_w', 'launch_date', 'price_usd', 'description']
{'brand': 'NVIDIA',
 'chipset': 'AD104',
 'clock_speed_mhz': 2310,
 'description': 'GeForce RTX 4070 Ti with 12GB GDDR6X, boost up to 2.6 GHz, '
                '285W TDP. Great 1440p/4K performance. MSRP $799.',
 'id': 'gpu-001',
 'launch_date': '2023-01-05',
 'memory_gb': 12,
 'memory_type': 'GDDR6X',
 'model': 'RTX 4070 Ti',
 'name': 'NVIDIA GeForce RTX 4070 Ti',
 'price_usd': 799.0,
 'tdp_w': 285}


## 2. Creating Basic JSON Schemas
We define a schema compatible with the GPU dataset.


### 2.1 Define a GPU Product Schema
Define the schema inline as a Python dict; it is written to disk at the end of the notebook.


In [3]:
def _prop(json_type, description, **constraints):
    """Build one property subschema.

    Keys are emitted in the order: "type", any extra constraint keywords
    (e.g. enum, minimum, format), then "description".
    """
    return {"type": json_type, **constraints, "description": description}


# JSON Schema for a single GPU product record. Only the identifying fields and
# the free-text description are required; unknown extra keys are permitted.
product_schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://example.org/schemas/gpu_product.schema.json",
    "title": "GPUProduct",
    "description": "Schema for a single discrete graphics card (GPU) product entry used in Exercise 02/03. Includes identification, specs, pricing and a free‑text description.",
    "type": "object",
    "properties": {
        # Identification
        "id": _prop("string", "Stable unique identifier for this product record (e.g., 'gpu-001')."),
        "name": _prop("string", "Human‑readable product title as seen on listings (e.g., 'NVIDIA GeForce RTX 4070 Ti')."),
        "brand": _prop("string", "GPU vendor/brand name.", enum=["NVIDIA", "AMD", "Intel", "Other"]),
        "model": _prop("string", "Product model designation used in marketing (e.g., 'RTX 4070 Ti', 'RX 7800 XT')."),
        "chipset": _prop("string", "GPU die/codename or family identifier (e.g., 'AD104', 'Navi 32')."),
        # Hardware specs
        "memory_gb": _prop("integer", "On‑board graphics memory capacity in gigabytes (GB).", minimum=1),
        "memory_type": _prop("string", "Type of graphics memory technology.", enum=["GDDR6", "GDDR6X", "HBM2", "HBM3"]),
        "clock_speed_mhz": _prop("integer", "Nominal boost or advertised GPU clock speed in megahertz (MHz).", minimum=1),
        "tdp_w": _prop("integer", "Typical board power (TDP/TBP) in watts (W).", minimum=1),
        # Launch, pricing and free text
        "launch_date": _prop("string", "Public launch or availability date in ISO format (YYYY‑MM‑DD).", format="date"),
        "price_usd": _prop("number", "MSRP or typical retail price in US dollars (USD).", minimum=0),
        "description": _prop("string", "Free‑text product description used for information extraction demos."),
    },
    "required": ["id", "name", "brand", "model", "description"],
    "additionalProperties": True,
}


pprint(product_schema)


{'$id': 'https://example.org/schemas/gpu_product.schema.json',
 '$schema': 'https://json-schema.org/draft/2020-12/schema',
 'additionalProperties': True,
 'description': 'Schema for a single discrete graphics card (GPU) product '
                'entry used in Exercise 02/03. Includes identification, specs, '
                'pricing and a free‑text description.',
 'properties': {'brand': {'description': 'GPU vendor/brand name.',
                          'enum': ['NVIDIA', 'AMD', 'Intel', 'Other'],
                          'type': 'string'},
                'chipset': {'description': 'GPU die/codename or family '
                                           "identifier (e.g., 'AD104', 'Navi "
                                           "32').",
                            'type': 'string'},
                'clock_speed_mhz': {'description': 'Nominal boost or '
                                                   'advertised GPU clock speed '
                                               

### 2.2 Validate Valid Data
Validate the first record and then all records.


In [11]:
# jsonschema treats "format" keywords (such as launch_date's "format": "date")
# as annotations only and does not check them unless a format checker is
# supplied. Pass one so malformed dates are reported instead of silently passing.
format_checker = FormatChecker()

# Validate first record
try:
    validate(instance=products_data[0], schema=product_schema, format_checker=format_checker)
    print('✓ First product is valid')
except ValidationError as e:
    print('✗ First product failed:', e.message)

# Validate all records, collecting (index, message) for every record that fails.
errors = []
for i, rec in enumerate(products_data):
    try:
        validate(instance=rec, schema=product_schema, format_checker=format_checker)
    except ValidationError as e:
        errors.append((i, e.message))

if errors:
    print(f'Found {len(errors)} errors in valid set (unexpected):')
    # Only the first five failures are printed to keep the output short.
    for i, msg in errors[:5]:
        print(f'  [{i}]', msg)
else:
    print('✓ All products passed validation')


✓ First product is valid
✓ All products passed validation


### 2.3 Test with Invalid Data
Verify that invalid examples are caught by the schema.


In [12]:
# Load the deliberately broken records; each one is expected to fail validation.
with open('../Task/input/gpu_products_invalid.json', 'r', encoding='utf-8') as f:
    invalid_products = json.load(f)

# jsonschema does not check "format" keywords (e.g. "format": "date") unless a
# format checker is supplied; without it a record whose only fault is a
# malformed launch_date would be reported as "Unexpectedly valid".
format_checker = FormatChecker()

failures = 0
for idx, rec in enumerate(invalid_products):
    try:
        validate(instance=rec, schema=product_schema, format_checker=format_checker)
        print(f'[{idx}] ✗ Unexpectedly valid')
    except ValidationError as e:
        failures += 1
        print(f'[{idx}] ✓ Correctly failed: {e.message}')

print(f'Total invalid caught: {failures}/{len(invalid_products)}')


[0] ✓ Correctly failed: 0 is less than the minimum of 1
[1] ✓ Correctly failed: -50 is less than the minimum of 0
[2] ✓ Correctly failed: 'brand' is a required property
[3] ✓ Correctly failed: 770 is not of type 'string'
[4] ✓ Correctly failed: 0 is less than the minimum of 1
Total invalid caught: 5/5


### 2.4 Save Schema
Write the defined schema to the shared Task input directory for reuse.


In [13]:
# Persist the schema as pretty-printed JSON in the Task input directory for reuse.
with open('../Task/input/gpu_product_schema.json','w', encoding='utf-8') as schema_file:
    schema_text = json.dumps(product_schema, indent=2)
    schema_file.write(schema_text)
print('Schema saved to ../Task/input/gpu_product_schema.json')


Schema saved to ../Task/input/gpu_product_schema.json
