In [1]:
# script_bdpm_nom_dci_atc.py
# dépendances : pandas, requests
# python >= 3.8 recommandé

import os
import requests
import pandas as pd

# ========== paramètres ==========
out_dir = "data/bdpm_files"
os.makedirs(out_dir, exist_ok=True)

# URLs officielles (page téléch. BDPM)
urls = {
    "CIS_bdpm.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_bdpm.txt",
    "CIS_COMPO_bdpm.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_COMPO_bdpm.txt",
    "CIS_MITM.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_MITM.txt",
    "BTM_CIP_bdpm.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_CIP_bdpm.txt"

}

In [2]:
# ========== téléch. si nécessaire ==========
for name, url in urls.items():
    path = os.path.join(out_dir, name)
    if not os.path.exists(path):
        print(f"Téléchargement de {name} ...")
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        with open(path, "wb") as f:
            f.write(r.content)
        print("   OK:", path)
    else:
        print("Déjà présent :", path)


Déjà présent : data/bdpm_files\CIS_bdpm.txt
Déjà présent : data/bdpm_files\CIS_COMPO_bdpm.txt
Déjà présent : data/bdpm_files\CIS_MITM.txt
Téléchargement de BTM_CIP_bdpm.txt ...
   OK: data/bdpm_files\BTM_CIP_bdpm.txt


In [8]:

# ========== lecture des fichiers ==========
# Rappel : fichiers tabulés, sans en-tête
cis_path = os.path.join(out_dir, "CIS_bdpm.txt")
compo_path = os.path.join(out_dir, "CIS_COMPO_bdpm.txt")
mitm_path = os.path.join(out_dir, "CIS_MITM.txt")


In [24]:
cip_path = os.path.join(out_dir, "BTM_CIP_bdpm.txt")
cip = pd.read_csv(cip_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)


In [25]:
cip

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,60002283,4949729,plaquette(s) PVC PVDC aluminium de 30 comprimÃ...,PrÃ©sentation active,DÃ©claration de commercialisation,16/03/2011,3400949497294,oui,100%,2434,2536,102,
1,60002283,4949770,plaquette(s) PVC PVDC aluminium de 90 comprimÃ...,PrÃ©sentation active,DÃ©claration de commercialisation,19/09/2011,3400949497706,oui,100%,6868,7144,276,
2,60003620,3696350,20 rÃ©cipient(s) unidose(s) polyÃ©thylÃ¨ne de ...,PrÃ©sentation active,DÃ©claration de commercialisation,30/11/2006,3400936963504,oui,65%,1281,1383,102,Ce mÃ©dicament peut Ãªtre pris en charge ou re...
3,60004277,3614582,plaquette(s) thermoformÃ©e(s) PVC aluminium de...,PrÃ©sentation active,DÃ©claration de commercialisation,11/06/2003,3400936145825,oui,65%,154,256,102,
4,60004487,3972519,plaquette(s) opaque(s) PVC-Aluminium de 30 com...,PrÃ©sentation active,DÃ©claration de commercialisation,12/12/2011,3400939725192,oui,65%,888,990,102,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20914,69998156,3450206,1 flacon(s) de 150 ml en verre brun avec ferme...,PrÃ©sentation active,DÃ©claration d'arrÃªt de commercialisation,30/11/2024,3400934502064,non,65%,8182,8284,102,
20915,69998640,3376562,1 tube(s) polyÃ©thylÃ¨ne aluminium avec canule...,PrÃ©sentation active,DÃ©claration d'arrÃªt de commercialisation,31/12/2024,3400933765620,non,65%,409,511,102,
20916,69998996,3008771,10 ampoule(s) polypropylÃ¨ne de 10 ml,PrÃ©sentation active,DÃ©claration de commercialisation,25/09/2017,3400930087718,oui,,,,,
20917,69999429,3602567,plaquette(s) PVC-Aluminium de 12 comprimÃ©(s),PrÃ©sentation active,DÃ©claration de commercialisation,16/01/2003,3400936025677,non,,,,,


In [27]:
# colonnes BDPM :
# 0 = CIS
# 1 = CIP7
# 6 = CIP13

cip_small = cip[[0, 1, 6]].copy()
cip_small.columns = ["CIS", "cip7", "cip13"]

# nettoyage
cip_small["CIS"] = cip_small["CIS"].str.strip()
cip_small["cip7"] = cip_small["cip7"].str.strip().fillna("")
cip_small["cip13"] = cip_small["cip13"].str.strip().fillna("")



In [29]:
cip_grouped = (
    cip_small.groupby("CIS")
    .agg({
        "cip7":  lambda s: "; ".join(sorted(set([x for x in s if x and pd.notna(x)]))),
        "cip13": lambda s: "; ".join(sorted(set([x for x in s if x and pd.notna(x)])))
    })
    .reset_index()
)


In [30]:
cip_grouped

Unnamed: 0,CIS,cip7,cip13
0,60002283,4949729; 4949770,3400949497294; 3400949497706
1,60003620,3696350,3400936963504
2,60004277,3614582,3400936145825
3,60004487,3972519,3400939725192
4,60004505,5507419,3400955074199
...,...,...,...
14564,69998003,3013811,3400930138113
14565,69998156,3450206,3400934502064
14566,69998640,3376562,3400933765620
14567,69998996,3008771,3400930087718


In [31]:
df_final = df_final.merge(cip_grouped, left_on="cis", right_on="CIS", how="left")
df_final.drop(columns=["CIS"], inplace=True)
df_final["cip"] = df_final["cip"].fillna("")


In [9]:
# Lecture brute (tout en str pour éviter problèmes de parsing)


cis = pd.read_csv(cis_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)
compo = pd.read_csv(compo_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)
mitm = pd.read_csv(mitm_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)



In [10]:
# ========== colonne indices (conformément au PDF officiel) ==========
# CIS_bdpm.txt : 0 = Code CIS, 1 = Dénomination du médicament (nom commercial)
# CIS_COMPO_bdpm.txt : 0 = Code CIS, 3 = Dénomination de la substance, 6 = Nature du composant (SA/ST)
# CIS_MITM.txt : 0 = Code CIS, 1 = Code ATC

# Renommer colonnes utiles
cis = cis[[0, 1]].copy()
cis.columns = ["CIS", "nom_commercial"]

# Certaines lignes peuvent contenir espaces; on nettoie
cis["CIS"] = cis["CIS"].str.strip()
cis["nom_commercial"] = cis["nom_commercial"].str.strip()

In [11]:

# Composition : on garde les colonnes 0 (CIS), 3 (dénomination substance), 6 (nature)
# ATTENTION : si le format change, adaptez les indices.
compo_small = compo[[0, 3, 6]].copy()
compo_small.columns = ["CIS", "dci", "nature"]
compo_small["CIS"] = compo_small["CIS"].str.strip()
compo_small["dci"] = compo_small["dci"].str.strip().fillna("")
compo_small["nature"] = compo_small["nature"].str.strip().fillna("")

In [12]:
# Filtrer sur nature == 'SA' (substance active)
compo_SA = compo_small[compo_small["nature"].str.upper() == "SA"].copy()


In [13]:
# Agréger les DCI par CIS (uniques et concaténés par ;)
compo_grouped = (compo_SA.groupby("CIS")["dci"]
                       .apply(lambda s: "; ".join(sorted(set([x for x in s if x and pd.notna(x)]))))
                       .reset_index()
                       .rename(columns={"dci": "dci_aggregated"}))

In [None]:
# vih = df_grouped[df_grouped["atc"].str.startswith("J05", na=False)]

In [14]:
# MITM : CIS -> ATC (col 0 CIS, col 1 ATC)
mitm_small = mitm[[0, 1]].copy()
mitm_small.columns = ["CIS", "atc"]
mitm_small["CIS"] = mitm_small["CIS"].str.strip()
mitm_small["atc"] = mitm_small["atc"].str.strip().fillna("")


In [15]:

# ========== fusion finale ==========
# fusionner nom commercial (cis) + dci + atc
df = cis.merge(compo_grouped, on="CIS", how="left")
df = df.merge(mitm_small, on="CIS", how="left")

In [16]:
# renommons colonnes finales et reordonnons
df_final = df[["CIS", "nom_commercial", "dci_aggregated", "atc"]].copy()
df_final.columns = ["cis", "nom_commercial", "dci", "atc"]

In [17]:
# Remplacer NaN par chaîne vide
df_final["dci"] = df_final["dci"].fillna("")
df_final["atc"] = df_final["atc"].fillna("")


In [42]:
# ========== sortie ==========
df_final=df_final.drop("cip13",axis=1)

In [43]:
df_final

Unnamed: 0,cis,nom_commercial,dci,atc,cip7
0,61266250,"A 313 200 000 UI POUR CENT, pommade","CONCENTRAT DE VITAMINE A SYNTHÉTIQUE, FORME HU...",,3000147
1,62869109,"A 313 50 000 U.I., capsule molle","CONCENTRAT DE VITAMINE A SYNTHÉTIQUE, FORME HU...",,3000064
2,69103878,"A.D.N. BOIRON, degré de dilution compris entre...",A.D.N. POUR PRÉPARATIONS HOMÉOPATHIQUES,,
3,61876780,"ABACAVIR ARROW 300 mg, comprimé pelliculé sécable",SULFATE D'ABACAVIR,J05AF06,3019221
4,63797011,"ABACAVIR SANDOZ 300 mg, comprimé pelliculé séc...",ABACAVIR,J05AF06,3008995
...,...,...,...,...,...
15818,64949486,"ZYRTECSET 10 mg, comprimé pelliculé sécable",CÉTIRIZINE (DICHLORHYDRATE DE),,3646168
15819,67337081,"ZYTIGA 500 mg, comprimé pelliculé",ACÉTATE D'ABIRATÉRONE,L02BX03,3007627
15820,63095061,"ZYVOXID 100 mg/5 ml, granulés pour suspension ...",LINÉZOLIDE,J01XX08,5651268
15821,63283095,"ZYVOXID 2 mg/ml, solution pour perfusion",LINÉZOLIDE,J01XX08,5811074


In [41]:
mot = "3595583"

mask = df_final.astype(str).apply(lambda col: col.str.contains(mot, case=False, na=False))
resultat = df_final[mask.any(axis=1)]

resultat

Unnamed: 0,cis,nom_commercial,dci,atc,cip7,cip13
4466,60234100,"DOLIPRANE 1000 mg, comprimé",PARACÉTAMOL,,3595583; 5636955,3400935955838; 3400956369553


In [46]:
df_final.to_excel("data/bdd_cip.xlsx",index=False)