In [38]:
import json

# ============================================================
# CONSTANTS
# ============================================================

# Storage cost charged per value, one constant per scalar kind counted by
# count_scalars_with_arrays. The unit is presumably bytes (the reports below
# print the resulting sizes with a "B" / "GB" suffix).
size_number = 8         # "integer" and "number" fields
size_string = 80        # ordinary "string" fields
size_date = 20          # "date" fields
size_longString = 200   # string fields named "description" or "comment"
size_keyValue = 12      # cost charged per counted key and per merge

# ============================================================
# STATISTICS (number of documents per collection)
# ============================================================

# Document counts keyed by collection short name.
nb_docs = dict(
    Cl=10_000_000,
    Prod=100_000,
    OL=4_000_000_000,
    Wa=200,
)
# The stock count is derived as products x warehouses: 100k * 200 = 20M.
nb_docs.update(St=nb_docs["Prod"] * nb_docs["Wa"])

# ============================================================
# AVG_LENGTH BETWEEN COLLECTIONS
# ============================================================

# avg_length[parent][child] is the average number of `child` entries attached
# to one `parent` document. A None entry means no average is recorded for that
# pair; compute_document_size then falls back to a multiplier of 1.
avg_length = {
    "Cl": {
        "Prod": 20,
        "OL": nb_docs["OL"] / nb_docs["Cl"],      # 400 order lines per client
        **dict.fromkeys(("Cat", "St", "Wa", "Supp", "Cl")),
    },
    "Prod": {
        "Cat": 2,                                 # 2 categories per product
        "St": nb_docs["Wa"],                      # 200 stocks per product
        "Wa": nb_docs["Wa"],                      # 200 warehouses per product
        "Supp": 1,                                # 1 supplier per product
        "OL": nb_docs["OL"] / nb_docs["Prod"],    # 40,000 order lines per product
        "Cl": nb_docs["Cl"] / nb_docs["Prod"],    # 100 clients per product (on average)
        "Prod": None,
    },
    "Cat": dict.fromkeys(("Prod", "Cl", "Wa", "OL", "Cat", "St", "Supp")),
    "OL": {
        **dict.fromkeys(("Prod", "Cl", "Wa", "St"), 1),
        **dict.fromkeys(("Supp", "Cat", "OL")),
    },
    "Wa": {
        "St": nb_docs["Prod"],                    # 100k stocks per warehouse
        "Prod": nb_docs["Prod"] / nb_docs["Wa"],  # 500 products per warehouse
        "OL": nb_docs["OL"] / nb_docs["Wa"],      # 20M order lines per warehouse
        **dict.fromkeys(("Cat", "Cl", "Wa", "Supp")),
    },
    "St": {
        "Prod": 1,
        "Wa": 1,
        **dict.fromkeys(("Cl", "OL", "Cat", "St", "Supp")),
    },
    "Supp": dict.fromkeys(("Prod", "Cl", "Cat", "Wa", "OL", "St", "Supp")),
}

# ============================================================
# COLLECTION TYPE DETECTOR
# ============================================================

# Ordered (required property names, collection short name) rules.
# The order matters: the first rule whose names are all present wins.
_COLLECTION_SIGNATURES = (
    (frozenset({"IDP", "price"}), "Prod"),
    (frozenset({"title"}), "Cat"),
    (frozenset({"IDS", "SIRET"}), "Supp"),
    (frozenset({"location", "quantity"}), "St"),
    (frozenset({"IDC", "email"}), "Cl"),
    (frozenset({"date", "deliveryDate"}), "OL"),
    (frozenset({"IDW", "capacity"}), "Wa"),
)


def guess_collection_name(schema):
    """Infer the logical collection of an object schema from its property names.

    Args:
        schema: a dict; only its "properties" mapping (if any) is inspected.

    Returns:
        The short name of the first rule in _COLLECTION_SIGNATURES whose
        required property names are all declared, or "Unknown" if none match.
    """
    declared = set(schema.get("properties", {}).keys())
    for required, collection in _COLLECTION_SIGNATURES:
        if required <= declared:
            return collection
    return "Unknown"

# ============================================================
# SCALAR COUNTER (with the logical parent of each array)
# ============================================================

def count_scalars_with_arrays(schema):
    """Count the scalar fields of a schema, split by whether they sit in an array.

    Scalars are bucketed as "int" (types "integer"/"number"), "date" (type
    "date"), "long" (type "string" whose field name is "description" or
    "comment") or "string" (any other "string"). Other types are not counted.

    Returns:
        (counts_outside, inside) where
          - counts_outside: bucket -> count for scalars outside any array;
          - inside: array field name -> {"counts": bucket -> count,
            "parent": logical collection in scope where the array was first met}.
    """

    def blank_counts():
        return {"int": 0, "string": 0, "date": 0, "long": 0}

    counts_outside = blank_counts()
    inside = {}

    def bucket_for(node_type, field_name):
        # Map a scalar node to its counter bucket; None means "not counted".
        if node_type in ("integer", "number"):
            return "int"
        if node_type == "string":
            return "long" if field_name in ("description", "comment") else "string"
        if node_type == "date":
            return "date"
        return None

    def walk(node, collection, field_name=None, array_name=None):
        if not isinstance(node, dict):
            return

        node_type = node.get("type")

        # ----- ARRAY: register it once, then visit its item schema(s) -----
        if node_type == "array":
            if field_name not in inside:
                inside[field_name] = {"counts": blank_counts(), "parent": collection}
            items = node.get("items")
            if isinstance(items, dict):
                members = [items]
            elif isinstance(items, list):
                members = items
            else:
                members = []
            for member in members:
                walk(member, collection, field_name=None, array_name=field_name)
            return

        # ----- OBJECT: a recognised object switches the logical collection -----
        if node_type == "object":
            detected = guess_collection_name(node)
            scope = collection if detected == "Unknown" else detected
            for key, child in node.get("properties", {}).items():
                walk(child, scope, field_name=key, array_name=array_name)
            return

        # ----- SCALAR: credit the enclosing array if there is one -----
        tally = inside[array_name]["counts"] if array_name else counts_outside
        bucket = bucket_for(node_type, field_name)
        if bucket is not None:
            tally[bucket] += 1

    walk(schema, guess_collection_name(schema))
    return counts_outside, inside

# ============================================================
# MERGE COUNTER (collection logic)
# ============================================================

def count_merges(schema, parent_coll=None, is_root=True):
    """Count the logical merges (collection changes) found in a schema tree.

    A merge is counted each time a non-root node belongs to a recognised
    collection that differs from the collection currently in scope
    (Prod -> Cat, Prod -> Supp, OL -> Prod, ...). The collection of a node is
    obtained with guess_collection_name: on the node itself for an object, on
    its "items" schema for an array whose items is a dict.

    Args:
        schema: the schema node to inspect; non-dict values count as 0.
        parent_coll: the collection in scope above this node, or None.
        is_root: True for the top-level call; the root never counts as a merge.

    Returns:
        The number of merges in this node and all its descendants.
    """
    if not isinstance(schema, dict):
        return 0

    node_type = schema.get("type")
    items = schema.get("items") if node_type == "array" else None

    # Logical collection of this node (None when it cannot be determined).
    if node_type == "object":
        own = guess_collection_name(schema)
    elif isinstance(items, dict):
        own = guess_collection_name(items)
    else:
        own = None

    recognised = own not in (None, "Unknown")
    crosses = recognised and not is_root and own != parent_coll

    # Children inherit this node's collection when it opens a new scope:
    # either a merge happened, or no collection was in scope yet.
    if crosses or (recognised and parent_coll is None):
        scope = own
    else:
        scope = parent_coll

    # Gather the child schemas to descend into.
    if node_type == "object":
        children = list(schema.get("properties", {}).values())
    elif isinstance(items, dict):
        children = [items]
    elif isinstance(items, list):
        children = items
    else:
        children = []

    own_merge = 1 if crosses else 0
    return own_merge + sum(count_merges(child, scope, is_root=False) for child in children)

# ============================================================
# MAPPING ARRAY → CHILD COLLECTION
# ============================================================

# Maps the name of an embedded field to the short name of the collection it
# holds; compute_document_size uses it to pick the avg_length multiplier.
# Names missing from this table fall back to a multiplier of 1 there.
array_to_collection = {
    "categories": "Cat",
    "supplier": "Supp",
    "stock": "St",
    "orderline": "OL",
    "product": "Prod",
    "client": "Cl",
    "Warehouse": "Wa",  # original capitalised spelling, kept for compatibility
    "warehouse": "Wa",  # lower-case spelling, consistent with the other keys
}


# ============================================================
# SIZE COMPUTATION FOR ONE DOCUMENT
# ============================================================

def compute_document_size(schema, nb_docs_map):
    """Estimate the size of one document of `schema` and of its whole collection.

    The document size is the sum of:
      - the values outside arrays (count per scalar kind x its size constant);
      - the values inside each array, multiplied by that array's average length
        (avg_length[parent][child], 1 when unknown or None);
      - the keys: (scalars outside + scalars in arrays x average length
        + merges from count_merges) x size_keyValue.

    Args:
        schema: the root schema of the collection.
        nb_docs_map: collection short name -> number of documents; a missing
            collection counts as 1 document.

    Returns:
        A dict with keys "collection", "nb_docs", "avg_lengths", "size_outside",
        "size_inside", "size_keys", "merges", "doc_size", "collection_size".
    """

    def value_bytes(counts):
        # Size of the values described by one {"int", "string", "date", "long"} tally.
        return (
            counts["int"] * size_number +
            counts["string"] * size_string +
            counts["date"] * size_date +
            counts["long"] * size_longString
        )

    parent = guess_collection_name(schema)
    outside, inside = count_scalars_with_arrays(schema)

    # Scalars outside arrays.
    size_outside = value_bytes(outside)

    # Scalars inside arrays, each array weighted by its average length.
    avg_used = {}
    size_inside_total = 0
    keys_arrays = 0
    for array_name, info in inside.items():
        owner = info["parent"] or parent
        child = array_to_collection.get(array_name, "Unknown")

        multiplier = avg_length.get(owner, {}).get(child, 1)
        if multiplier is None:
            multiplier = 1
        avg_used[array_name] = multiplier

        size_inside_total += value_bytes(info["counts"]) * multiplier
        keys_arrays += sum(info["counts"].values()) * multiplier

    # Keys: one per counted scalar, plus one per merge.
    merges = count_merges(schema)
    keys_outside = sum(outside.values())
    size_keys_total = (keys_outside + keys_arrays + merges) * size_keyValue

    # Final sizes.
    doc_size = size_outside + size_inside_total + size_keys_total
    nb = nb_docs_map.get(parent, 1)

    return {
        "collection": parent,
        "nb_docs": nb,
        "avg_lengths": avg_used,
        "size_outside": size_outside,
        "size_inside": size_inside_total,
        "size_keys": size_keys_total,
        "merges": merges,
        "doc_size": doc_size,
        "collection_size": doc_size * nb,
    }

# ============================================================
# JSON SCHEMAS FOR DB1: Prod{[Cat],Supp}, St, Wa, OL, Cl
# ============================================================

# Product document with an embedded category array and an embedded supplier.
schema_prod = {
    "type": "object",
    "properties": {
        "IDP": {"type": "integer"},
        "name": {"type": "string"},
        "price": {"type": "number"},
        "brand": {"type": "string"},
        "description": {"type": "string"},
        "image_url": {"type": "string"},
        # Embedded array of category objects.
        "categories": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                },
            },
        },
        # Embedded supplier object.
        "supplier": {
            "type": "object",
            "properties": {
                "IDS": {"type": "integer"},
                "name": {"type": "string"},
                "SIRET": {"type": "integer"},
                "headOffice": {"type": "string"},
                "revenue": {"type": "number"},
            },
        },
    },
}

# Stock document: flat, carries both the warehouse and the product identifier.
schema_st = {
    "type": "object",
    "properties": {
        "IDW": {"type": "integer"},
        "IDP": {"type": "integer"},
        "quantity": {"type": "integer"},
        "location": {"type": "string"},
    },
}

# Warehouse document: flat.
schema_wa = {
    "type": "object",
    "properties": {
        "IDW": {"type": "integer"},
        "address": {"type": "string"},
        "capacity": {"type": "integer"},
    },
}

# Order-line document: flat, carries both the product and the client identifier.
schema_ol = {
    "type": "object",
    "properties": {
        "IDP": {"type": "integer"},
        "IDC": {"type": "integer"},
        "date": {"type": "date"},
        "deliveryDate": {"type": "date"},
        "quantity": {"type": "integer"},
        "comment": {"type": "string"},
        "grade": {"type": "integer"},
    },
}

# Client document: flat.
schema_cl = {
    "type": "object",
    "properties": {
        "IDC": {"type": "integer"},
        "ln": {"type": "string"},
        "fn": {"type": "string"},
        "address": {"type": "string"},
        "nationality": {"type": "string"},
        "birthDate": {"type": "date"},
        "email": {"type": "string"},
    },
}

# ============================================================
# DB1: SET OF COLLECTIONS
# ============================================================

# Collection short name -> root schema, for every collection of DB1.
db1_collections = dict(
    Prod=schema_prod,
    St=schema_st,
    Wa=schema_wa,
    OL=schema_ol,
    Cl=schema_cl,
)

# ============================================================
# SIZE COMPUTATION FOR THE WHOLE DATABASE
# ============================================================

def compute_database_size(collections, nb_docs_map):
    """Sum the estimated sizes of every collection of a database.

    Args:
        collections: mapping of a label to the root schema of a collection.
        nb_docs_map: collection short name -> number of documents, forwarded
            to compute_document_size.

    Returns:
        (total, details) where details maps each label to the
        "collection_size" reported by compute_document_size and total is the
        sum of those sizes.
    """
    details = {
        name: compute_document_size(schema, nb_docs_map)["collection_size"]
        for name, schema in collections.items()
    }
    return sum(details.values()), details

def pretty(b):
    """Format a size `b` as "<b> B | <b / 10**9> GB".

    The first figure is shown with 2 decimals, the second with 4, both with
    thousands separators.
    """
    gigabytes = b / 10**9
    return "{:,.2f} B | {:,.4f} GB".format(b, gigabytes)

# ============================================================
# EXECUTION DB1
# ============================================================

# Size every DB1 collection; keep the grand total and the per-collection sizes.
total_db1_size, details = compute_database_size(db1_collections, nb_docs)

# Report: one line per collection, then the total, each formatted by pretty().
print("\n=========== DB1 SIZE ===========")
for coll, size in details.items():
    print(f"{coll:6s}: {pretty(size)}")
print("\nTOTAL DB1 SIZE =")
print(pretty(total_db1_size))
print("================================\n")

# ============================================================
# SCHEMAS OF THE 4 COLLECTIONS TO TEST
# ============================================================

# 1) Prod{[Cat], Supp}
schema_prod_cat_supp = schema_prod  # same schema as above (same object, not a copy)

def _product_properties():
    """Return a fresh Prod{[Cat], Supp} property map (new dicts on every call)."""
    return {
        "IDP": {"type": "integer"},
        "name": {"type": "string"},
        "price": {"type": "number"},
        "brand": {"type": "string"},
        "description": {"type": "string"},
        "image_url": {"type": "string"},
        "categories": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {"title": {"type": "string"}},
            },
        },
        "supplier": {
            "type": "object",
            "properties": {
                "IDS": {"type": "integer"},
                "name": {"type": "string"},
                "SIRET": {"type": "integer"},
                "headOffice": {"type": "string"},
                "revenue": {"type": "number"},
            },
        },
    }


# 2) St{Prod{[Cat], Supp}}: a stock document embedding its full product.
schema_st_prod_cat_supp = {
    "type": "object",
    "properties": {
        "IDW": {"type": "integer"},
        "quantity": {"type": "integer"},
        "location": {"type": "string"},
        "product": {"type": "object", "properties": _product_properties()},
    },
}

# 3) OL{Prod{[Cat], Supp}}: an order line embedding its full product.
schema_ol_prod_cat_supp = {
    "type": "object",
    "properties": {
        "IDC": {"type": "integer"},
        "date": {"type": "date"},
        "deliveryDate": {"type": "date"},
        "quantity": {"type": "integer"},
        "comment": {"type": "string"},
        "grade": {"type": "integer"},
        "product": {"type": "object", "properties": _product_properties()},
    },
}

# 4) Prod{[Cat], Supp, [OL]}: a product embedding the array of its order lines.
schema_prod_cat_supp_ol = {
    "type": "object",
    "properties": {
        **_product_properties(),
        "orderline": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "IDC": {"type": "integer"},
                    "date": {"type": "date"},
                    "deliveryDate": {"type": "date"},
                    "quantity": {"type": "integer"},
                    "comment": {"type": "string"},
                    "grade": {"type": "integer"},
                },
            },
        },
    },
}

# ============================================================
# RUN OF THE 4 EMBEDDING VARIANTS
# ============================================================

# Display label -> root schema of each embedding variant to size.
collections_to_test = {
    "Prod{[Cat], Supp}": schema_prod_cat_supp,
    "St{Prod{[Cat],Supp}}": schema_st_prod_cat_supp,
    "OL{Prod{[Cat],Supp}}": schema_ol_prod_cat_supp,
    "Prod{[Cat], Supp, [OL]}": schema_prod_cat_supp_ol
}

print("\n=========== SIZE OF EMBEDDING VARIANTS ===========\n")

# For each variant, print the detected root collection, the size of one
# document and the size of the whole collection (scaled by 10**(-9) for GB).
for label, schema in collections_to_test.items():
    result = compute_document_size(schema, nb_docs)
    print(f"--- {label} ---")
    print(f"Collection   : {result['collection']}")
    print(f"Doc size     : {result['doc_size']:,} B")
    print(f"Total size   : {result['collection_size'] * 10**(-9):,.4f} GB")
    print()

print("\n==================================================\n")


Prod  : 98,000,000.00 B | 0.0980 GB
St    : 3,040,000,000.00 B | 3.0400 GB
Wa    : 26,400.00 B | 0.0000 GB
OL    : 1,424,000,000,000.00 B | 1,424.0000 GB
Cl    : 5,120,000,000.00 B | 5.1200 GB

TOTAL DB1 SIZE =
1,432,258,026,400.00 B | 1,432.2580 GB



--- Prod{[Cat], Supp} ---
Collection   : Prod
Doc size     : 980 B
Total size   : 0.0980 GB

--- St{Prod{[Cat],Supp}} ---
Collection   : St
Doc size     : 1,124 B
Total size   : 22.4800 GB

--- OL{Prod{[Cat],Supp}} ---
Collection   : OL
Doc size     : 1,328 B
Total size   : 5,312.0000 GB

--- Prod{[Cat], Supp, [OL]} ---
Collection   : Prod
Doc size     : 13,440,992.0 B
Total size   : 1,344.0992 GB



