## Homework #1: Data Model Denormalization
#### Big Data Structure
**DIA 2 - Claire CUCHE, INES Darde, Yasmine MAARBANI and Aya BOUANANE** 

***

![Entity-Relationship Model](model.png)

### Input Statistics

In [59]:
# Input statistics: number of instances assumed for each entity.
nb_client = 10**7
nb_product = 10**5
nb_orderline = 4 * 10**9
nb_warehouse = 200
# A stock entry pairs a product with a warehouse (the Product schema embeds
# a `stocks` array with one item per warehouse), so the number of stock
# entries is products x warehouses; clients play no part in it.
nb_stock = nb_product * nb_warehouse

In [60]:
# One stock entry per (warehouse, product) pair; the client count is unrelated.
nb_stock = nb_warehouse * nb_product

In [61]:
# Average number of elements assumed for the arrays embedded in a document.
avg_length_stock = nb_warehouse  # `stocks` array: as many items as warehouses
avg_length_cat = 2               # `categories` array

### Database size: storage size per value type
- Integer/Number: 8 B
- String: 80 B
- Date: 20 B (stored as a string with a specific format)
- LongString: 200 B
- Key/value pair or array: 12 B of overhead + the size of its values


In [62]:
# Storage cost, in bytes, of each kind of value used by the size estimate.
size_keyValue = 12      # overhead of one key/value pair or array
size_number = 8         # integer or number
size_date = 20          # date
size_string = 80        # regular string
size_longString = 200   # long string

In [63]:
import json

# JSON Schema of the Product document, kept as raw text and parsed later with
# json.loads. Categories, supplier and stocks are embedded in the product.
schema_str = """
{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "title": "Product",
  "type": "object",

  "properties": {
    "IDP": {
      "type": "integer"
    },
    "name": {
      "type": "string"
    },
    "price": {
      "type": "number"
    },
    "brand": {
      "type": "string"
    },
    "description": {
      "type": "string"
    },
    "image_url": {
      "type": "string"
    },

    "categories": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": {
            "type": "string"
          }
        },
        "required": ["title"]
      }
    },

    "supplier": {
      "type": "object",
      "properties": {
        "IDS": { "type": "integer" },
        "name": { "type": "string" },
        "SIRET": { "type": "integer" },
        "headOffice": { "type": "string" },
        "revenue": { "type": "number" }
      },
      "required": ["IDS", "name"]
    },

    "stocks": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "warehouse_id": { "type": "integer" },
          "warehouse_name": { "type": "string" },
          "quantity": { "type": "integer" }
        },
        "required": ["warehouse_id", "quantity"]
      }
    }
  },

  "required": ["IDP", "name", "price", "categories", "supplier", "stocks"]
}


"""

In [64]:
def count_scalars_with_arrays(schema):
    """Count the scalar fields of a JSON Schema, split by array membership.

    The schema is walked recursively. Every scalar field is classified as:

    - ``"int"``: type ``integer`` or ``number``;
    - ``"date"``: type ``string`` with format ``date`` or ``date-time``
      (the legacy non-standard type ``date`` is still accepted);
    - ``"long"``: type ``string`` whose field name is ``description`` or
      ``comment``;
    - ``"string"``: any other ``string``.

    Fields of other types (``boolean``, ``null``, missing type, ...) are
    ignored.

    Args:
        schema: The JSON Schema as a parsed ``dict``.

    Returns:
        A pair ``(counts_outside, counts_inside)``: ``counts_outside`` holds
        the counters of the scalars that are not inside any array, and
        ``counts_inside`` maps each array field name to the counters of the
        scalars found in its items. A scalar nested in several arrays is
        attributed to the innermost one.
    """
    long_string_fields = ("description", "comment")
    # JSON Schema has no "date" type: dates are strings carrying a `format`.
    date_formats = ("date", "date-time")

    def new_counter():
        return {"int": 0, "string": 0, "date": 0, "long": 0}

    counts_outside = new_counter()
    counts_inside = {}  # one counter per array: {array_name: counter}

    def register_array(name):
        # Arrays are registered even when they end up holding no scalar.
        if name not in counts_inside:
            counts_inside[name] = new_counter()

    def classify(node, field_name):
        """Return the counter key for a scalar node, or None to ignore it."""
        node_type = node.get("type")
        if node_type in ("integer", "number"):
            return "int"
        if node_type == "date":
            return "date"
        if node_type == "string":
            if node.get("format") in date_formats:
                return "date"
            if field_name in long_string_fields:
                return "long"
            return "string"
        return None

    def add_scalar(node, field_name, inside_array):
        # inside_array is None outside arrays, else the enclosing array name.
        if inside_array is None:
            target = counts_outside
        else:
            register_array(inside_array)
            target = counts_inside[inside_array]

        bucket = classify(node, field_name)
        if bucket is not None:
            target[bucket] += 1

    def explore(node, field_name=None, inside_array=None):
        if not isinstance(node, dict):
            return

        node_type = node.get("type")

        # Array: its own field name becomes the enclosing array of its items.
        if node_type == "array":
            items = node.get("items")
            array_name = field_name
            register_array(array_name)

            if isinstance(items, dict):
                explore(items, inside_array=array_name)
            elif isinstance(items, list):
                for item in items:
                    explore(item, inside_array=array_name)
            return

        # Object: recurse into each property, keeping the array context.
        if node_type == "object":
            for name, subnode in node.get("properties", {}).items():
                explore(subnode, field_name=name, inside_array=inside_array)
            return

        # Anything else is treated as a scalar.
        add_scalar(node, field_name, inside_array)

    explore(schema)
    return counts_outside, counts_inside


In [65]:
# Parse the schema text, then tally its scalar fields.
schema = json.loads(schema_str)
outside, inside = count_scalars_with_arrays(schema)

# Show each tally under its own heading, one item per line.
print("SCALAIRES HORS TABLEAUX :", outside, sep="\n")
print("\nSCALAIRES DANS TABLEAUX :", inside, sep="\n")


SCALAIRES HORS TABLEAUX :
{'int': 5, 'string': 5, 'date': 0, 'long': 1}

SCALAIRES DANS TABLEAUX :
{'categories': {'int': 0, 'string': 1, 'date': 0, 'long': 0}, 'stocks': {'int': 2, 'string': 1, 'date': 0, 'long': 0}}


In [66]:
def count_merges(schema):
    """Count the first-level properties that embed another structure.

    A merge is a direct property of the root object whose type is
    ``object`` or ``array``. Scalar properties do not count, and nested
    levels are not inspected.

    Args:
        schema: The JSON Schema as a parsed ``dict``.

    Returns:
        The number of merges, or 0 when ``schema`` is not a dict describing
        an ``object``.
    """
    if not isinstance(schema, dict) or schema.get("type") != "object":
        return 0

    properties = schema.get("properties", {})

    # JSON Schema allows boolean sub-schemas (true/false); those carry no
    # type and are skipped instead of crashing on `.get`.
    return sum(
        1
        for prop in properties.values()
        if isinstance(prop, dict) and prop.get("type") in ("object", "array")
    )

In [75]:
def compute_document_size(schema, avg_lengths=None):
    """Estimate the size in bytes of one document described by ``schema``.

    The estimate adds up:

    - the values of the scalars outside arrays;
    - the values of the scalars inside arrays, multiplied by the average
      length of the array they belong to;
    - ``size_keyValue`` bytes for every key: one per scalar (repeated for
      each array element) plus one per first-level merge.

    Args:
        schema: The JSON Schema as a parsed ``dict``.
        avg_lengths: Optional mapping from array field name to its average
            number of elements. Defaults to ``categories`` ->
            ``avg_length_cat`` and ``stocks`` -> ``avg_length_stock``.
            Arrays missing from the mapping are assumed to hold 1 element.

    Returns:
        A dict with the keys ``outside_scalars``, ``inside_scalars``,
        ``nb_keys``, ``keys`` and ``doc_size`` (sizes in bytes, ``nb_keys``
        as a count).
    """
    if avg_lengths is None:
        avg_lengths = {
            "categories": avg_length_cat,
            "stocks": avg_length_stock,
        }

    def scalar_bytes(counts):
        # Bytes taken by the values of one group of scalar counters.
        return (
            counts["int"] * size_number
            + counts["string"] * size_string
            + counts["date"] * size_date
            + counts["long"] * size_longString
        )

    # 1. Scalar counts.
    outside, inside = count_scalars_with_arrays(schema)

    # 2. Scalars outside arrays: value bytes and number of keys.
    size_outside = scalar_bytes(outside)
    keys_outside = sum(outside.values())

    # 3. Scalars inside arrays: every element repeats the values and keys.
    size_inside_total = 0
    keys_arrays = 0
    for array_name, counts in inside.items():
        avg = avg_lengths.get(array_name, 1)  # fallback: 1 element
        size_inside_total += scalar_bytes(counts) * avg
        keys_arrays += sum(counts.values()) * avg

    # 4. Keys: scalars plus the first-level merges.
    merges = count_merges(schema)
    keys_total = keys_outside + keys_arrays + merges
    size_keys_total = keys_total * size_keyValue

    # 5. Total.
    doc_size = size_outside + size_inside_total + size_keys_total

    return {
        "outside_scalars": size_outside,
        "inside_scalars": size_inside_total,
        "nb_keys": keys_total,
        "keys": size_keys_total,
        "doc_size": doc_size,
    }

In [76]:
# Estimate the size of one document of the schema and display the breakdown.
result = compute_document_size(schema)
print(result)

{'outside_scalars': 640, 'inside_scalars': 19360, 'nb_keys': 616, 'keys': 7392, 'doc_size': 27392}
