In [2]:
# script_bdpm_nom_dci_atc.py
# dépendances : pandas, requests
# python >= 3.8 recommandé

import os
import requests
import pandas as pd

# ========== paramètres ==========
out_dir = "bdpm_files"
os.makedirs(out_dir, exist_ok=True)

# URLs officielles (page téléch. BDPM)
urls = {
    "CIS_bdpm.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_bdpm.txt",
    "CIS_COMPO_bdpm.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_COMPO_bdpm.txt",
    "CIS_MITM.txt": "https://base-donnees-publique.medicaments.gouv.fr/index.php/download/file/CIS_MITM.txt"
}

# ========== téléch. si nécessaire ==========
for name, url in urls.items():
    path = os.path.join(out_dir, name)
    if not os.path.exists(path):
        print(f"Téléchargement de {name} ...")
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        with open(path, "wb") as f:
            f.write(r.content)
        print("   OK:", path)
    else:
        print("Déjà présent :", path)

# ========== lecture des fichiers ==========
# Rappel : fichiers tabulés, sans en-tête
cis_path = os.path.join(out_dir, "CIS_bdpm.txt")
compo_path = os.path.join(out_dir, "CIS_COMPO_bdpm.txt")
mitm_path = os.path.join(out_dir, "CIS_MITM.txt")

# Lecture brute (tout en str pour éviter problèmes de parsing)


cis = pd.read_csv(cis_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)
compo = pd.read_csv(compo_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)
mitm = pd.read_csv(mitm_path, sep="\t", header=None, dtype=str, encoding="latin-1", low_memory=False)


# ========== colonne indices (conformément au PDF officiel) ==========
# CIS_bdpm.txt : 0 = Code CIS, 1 = Dénomination du médicament (nom commercial)
# CIS_COMPO_bdpm.txt : 0 = Code CIS, 3 = Dénomination de la substance, 6 = Nature du composant (SA/ST)
# CIS_MITM.txt : 0 = Code CIS, 1 = Code ATC

# Renommer colonnes utiles
cis = cis[[0, 1]].copy()
cis.columns = ["CIS", "nom_commercial"]

# Certaines lignes peuvent contenir espaces; on nettoie
cis["CIS"] = cis["CIS"].str.strip()
cis["nom_commercial"] = cis["nom_commercial"].str.strip()

# Composition : on garde les colonnes 0 (CIS), 3 (dénomination substance), 6 (nature)
# ATTENTION : si le format change, adaptez les indices.
compo_small = compo[[0, 3, 6]].copy()
compo_small.columns = ["CIS", "dci", "nature"]
compo_small["CIS"] = compo_small["CIS"].str.strip()
compo_small["dci"] = compo_small["dci"].str.strip().fillna("")
compo_small["nature"] = compo_small["nature"].str.strip().fillna("")

# Filtrer sur nature == 'SA' (substance active)
compo_SA = compo_small[compo_small["nature"].str.upper() == "SA"].copy()

# Agréger les DCI par CIS (uniques et concaténés par ;)
compo_grouped = (compo_SA.groupby("CIS")["dci"]
                       .apply(lambda s: "; ".join(sorted(set([x for x in s if x and pd.notna(x)]))))
                       .reset_index()
                       .rename(columns={"dci": "dci_aggregated"}))

# MITM : CIS -> ATC (col 0 CIS, col 1 ATC)
mitm_small = mitm[[0, 1]].copy()
mitm_small.columns = ["CIS", "atc"]
mitm_small["CIS"] = mitm_small["CIS"].str.strip()
mitm_small["atc"] = mitm_small["atc"].str.strip().fillna("")

# ========== fusion finale ==========
# fusionner nom commercial (cis) + dci + atc
df = cis.merge(compo_grouped, on="CIS", how="left")
df = df.merge(mitm_small, on="CIS", how="left")

# renommons colonnes finales et reordonnons
df_final = df[["CIS", "nom_commercial", "dci_aggregated", "atc"]].copy()
df_final.columns = ["cis", "nom_commercial", "dci", "atc"]

# Remplacer NaN par chaîne vide
df_final["dci"] = df_final["dci"].fillna("")
df_final["atc"] = df_final["atc"].fillna("")

# ========== sortie ==========
out_csv = "bdpm_medicaments_nom_dci_atc.csv"
df_final.to_csv(out_csv, index=False, encoding="utf-8")
print("Fichier généré :", out_csv)
print("Aperçu :")
print(df_final.head(30))


Déjà présent : bdpm_files\CIS_bdpm.txt
Déjà présent : bdpm_files\CIS_COMPO_bdpm.txt
Déjà présent : bdpm_files\CIS_MITM.txt
Fichier généré : bdpm_medicaments_nom_dci_atc.csv
Aperçu :
         cis                                     nom_commercial  \
0   61266250                A 313 200 000 UI POUR CENT, pommade   
1   62869109                   A 313 50 000 U.I., capsule molle   
2   69103878  A.D.N. BOIRON, degré de dilution compris entre...   
3   61876780  ABACAVIR ARROW 300 mg, comprimé pelliculé sécable   
4   63797011  ABACAVIR SANDOZ 300 mg, comprimé pelliculé séc...   
5   62401060  ABACAVIR VIATRIS 300 mg, comprimé pelliculé sé...   
6   68257528  ABACAVIR/LAMIVUDINE ACCORD 600 mg/300 mg, comp...   
7   62828870  ABACAVIR/LAMIVUDINE ARROW 600 mg/300 mg, compr...   
8   63431640  ABACAVIR/LAMIVUDINE BIOGARAN 600 mg/300 mg, co...   
9   65196479  ABACAVIR/LAMIVUDINE EG 600 mg/300 mg, comprimé...   
10  62170486  ABACAVIR/LAMIVUDINE MYLAN 600 mg/300 mg, compr...   
11  67720261  

In [8]:
df_final.to_excel("data/bdd.xlsx",index=False)

In [6]:
mot = "VIDEX"

mask = df_final.astype(str).apply(lambda col: col.str.contains(mot, case=False, na=False))
resultat = df_final[mask.any(axis=1)]

resultat

Unnamed: 0,cis,nom_commercial,dci,atc


In [1]:
import pandas as pd

# Charger les fichiers
cis = pd.read_csv("data/CIS_bdpm.txt", sep="\t", header=None, dtype=str, encoding="latin-1")
compo = pd.read_csv("data/CIS_COMPO_bdpm.txt", sep="\t", header=None, dtype=str, encoding="latin-1")
mitm = pd.read_csv("data/CIS_MITM.txt", sep="\t", header=None, dtype=str, encoding="latin-1")

# Colonnes
cis.columns = [
    "cis", "nom_commercial", "forme", "voie",
    "statut_amm", "procedure_amm", "etat_commercialisation",
    "date_amm", "statut_bdm", "num_amm_eu",
    "titulaire", "surveillance"
]

compo.columns = [
    "cis", "element", "code_substance", "dci",
    "dosage", "reference_dosage", "nature", "num_liaison"
]

mitm.columns = [
    "cis", "atc", "denomination", "lien_bdpm"
]

# Garder uniquement les substances actives
compo_sa = compo[compo["nature"] == "SA"]

# Merger les infos
df = cis.merge(compo_sa[["cis", "dci"]], on="cis", how="left")
df = df.merge(mitm[["cis", "atc"]], on="cis", how="left")

# Regrouper par médicament
df_grouped = df.groupby(["cis", "nom_commercial", "atc"])["dci"].apply(
    lambda x: " + ".join(sorted(set(filter(pd.notna, x))))
).reset_index()

# Nettoyage final
tous_medicaments = df_grouped[["nom_commercial", "dci", "atc"]].drop_duplicates()
tous_medicaments = tous_medicaments.sort_values("nom_commercial")

# Export
# tous_medicaments.to_excel("data/tous_medicaments.xlsx", index=False)


In [2]:
tous_medicaments

Unnamed: 0,nom_commercial,dci,atc
1585,"ABACAVIR ARROW 300 mg, comprimé pelliculé sécable",SULFATE D'ABACAVIR,J05AF06
3100,"ABACAVIR SANDOZ 300 mg, comprimé pelliculé séc...",ABACAVIR,J05AF06
1995,"ABACAVIR VIATRIS 300 mg, comprimé pelliculé sé...",ABACAVIR,J05AF06
2353,"ABACAVIR/LAMIVUDINE ARROW 600 mg/300 mg, compr...",LAMIVUDINE + SULFATE D'ABACAVIR,J05AR02
2820,"ABACAVIR/LAMIVUDINE BIOGARAN 600 mg/300 mg, co...",ABACAVIR + LAMIVUDINE,J05AR02
...,...,...,...
7054,"ZYPREXA VELOTAB 5 mg, comprimé orodispersible",OLANZAPINE,N05AH03
5989,"ZYTIGA 500 mg, comprimé pelliculé",ACÉTATE D'ABIRATÉRONE,L02BX03
2580,"ZYVOXID 100 mg/5 ml, granulés pour suspension ...",LINÉZOLIDE,J01XX08
2715,"ZYVOXID 2 mg/ml, solution pour perfusion",LINÉZOLIDE,J01XX08


In [5]:
mot = "DOLIPRANE"

mask = tous_medicaments.astype(str).apply(lambda col: col.str.contains(mot, case=False, na=False))
resultat = tous_medicaments[mask.any(axis=1)]

resultat

Unnamed: 0,nom_commercial,dci,atc


In [7]:
tous_medicaments["atc"].str[:3].unique()


array(['J05', 'A10', 'L01', 'N05', 'L02', 'N02', 'N07', 'L03', 'C07',
       'M01', 'G03', 'H01', 'S01', 'D06', 'B01', 'M05', 'A16', 'B03',
       'V08', 'B02', 'A05', 'J07', 'C09', 'G04', 'C02', 'C01', 'M04',
       'J01', 'L04', 'N01', 'N06', 'R03', 'A04', 'B05', 'C03', 'A11',
       'J02', 'C08', 'G02', 'J04', 'N04', 'P03', 'C10', 'P01', 'M03',
       'A03', 'V03', 'R01', 'H04', 'H03', 'B06', 'G01', 'D08', 'H02',
       'D07', 'J06', 'R06', 'P02', 'V04', 'N03', 'R07', 'A12', 'V09',
       'V07', 'D11', 'A02', 'H05', 'A07', 'A09', 'D10', 'M09', 'V10',
       'A06', 'D01', 'S02', 'V01', 'R05', 'D05'], dtype=object)

In [12]:
import numpy as np

In [2]:
import pandas as pd

df = pd.read_excel("../../arv introuvable.xlsx")
df

Unnamed: 0,Nom_introuvable,DCI,ATC,\n,Unnamed: 4
0,VIDEX,Didanosine,J05AF02,\n,
1,INVIRASE,Saquinavir,J05AE01,\n,
2,ZERIT,Stavudine,J05AF04,\n,
3,VIRACEPT,Nelfinavir,J05AE04,\n,
4,HIVID,Zalcitabine,J05AF03,\n,
...,...,...,...,...,...
93,HEPSERA,Adéfovir dipivoxil,J05AF08,\n,
94,EMIRIVINE,Emivirine,\n,EMIRIVINE faute de frappe molécule non commer...,
95,PACLITAXEL,Paclitaxel,L01CD01,\n,
96,AMDOXOVIR,Amdoxovir,\n,molécule non commercialisée,


In [5]:
df=df.iloc[:,:3] 

In [9]:
df["ATC"].unique()

array(['J05AF02', 'J05AE01', 'J05AF04', 'J05AE04', 'J05AF03', 'J05AG03',
       'J05AE02', 'J05AF13', 'J05AR04', 'J05AR06', 'J05AJ03', 'P01CX01',
       'J05AE05', 'J05AE06', 'L01XX05', 'J01FA10', 'J01EE01', 'J04AK02.',
       'J04AB02', 'J04AC01', 'J01FA09', 'J05AF07', 'J05AJ01', 'J05AG05',
       'J05AR27', 'J05AE10', 'J05AG04', 'J05AG02',
       'Pas de code ATC (médicament en développement)',
       'Pas de code ATC (pas d’AMM)', 'J05AR18',
       'Pas de code ATC (molécule en développement, programme interrompu)',
       'J05AR17', 'J05AX23', 'J05AX31', 'J05AX29', 'développement arrêté',
       'J05AJ04', 'J05AR22',
       'molécule expérimentale, jamais commercialisée', 'J05AR20',
       'J05AE07', 'J05AF08', 'J05AR09', 'molécule expérimentale',
       'J05AR14', 'J02AC01', 'J04AB04',
       'forme de ténofovir non spécifiée', 'J04BA02', '\n', 'L03AB10',
       'J05AX67', 'J05AJ03\xa0', 'J05AX14', 'J05AX07', 'J05AR10',
       'J05AG06', 'J05AR15', 'J05AB12', 'J05AB04\xa0', 'L03AB

In [13]:
df['ATC'] = (
    df['ATC']
        .str.replace(r'\s+', '', regex=True)   # enlever espaces, \n, \xa0
        .str.replace('.', '', regex=False)     # enlever les points
)

df['ATC'] = df['ATC'].where(df['ATC'].str.len() == 7, np.nan)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ATC'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ATC'] = df['ATC'].where(df['ATC'].str.len() == 7, np.nan)


In [16]:
df['ATC'].unique()

array(['J05AF02', 'J05AE01', 'J05AF04', 'J05AE04', 'J05AF03', 'J05AG03',
       'J05AE02', 'J05AF13', 'J05AR04', 'J05AR06', 'J05AJ03', 'P01CX01',
       'J05AE05', 'J05AE06', 'L01XX05', 'J01FA10', 'J01EE01', 'J04AK02',
       'J04AB02', 'J04AC01', 'J01FA09', 'J05AF07', 'J05AJ01', 'J05AG05',
       'J05AR27', 'J05AE10', 'J05AG04', 'J05AG02', nan, 'J05AR18',
       'J05AR17', 'J05AX23', 'J05AX31', 'J05AX29', 'J05AJ04', 'J05AR22',
       'J05AR20', 'J05AE07', 'J05AF08', 'J05AR09', 'J05AR14', 'J02AC01',
       'J04AB04', 'J04BA02', 'L03AB10', 'J05AX67', 'J05AX14', 'J05AX07',
       'J05AR10', 'J05AG06', 'J05AR15', 'J05AB12', 'J05AB04', 'J05AR19',
       'J01FF01', 'J05AP01', 'L03AB11', 'L01CD01', 'J05AR16'],
      dtype=object)

In [27]:
df

Unnamed: 0,Nom_introuvable,DCI,ATC
0,VIDEX,Didanosine,J05AF02
1,INVIRASE,Saquinavir,J05AE01
2,ZERIT,Stavudine,J05AF04
3,VIRACEPT,Nelfinavir,J05AE04
4,HIVID,Zalcitabine,J05AF03
...,...,...,...
93,HEPSERA,Adéfovir dipivoxil,J05AF08
94,EMIRIVINE,Emivirine,
95,PACLITAXEL,Paclitaxel,L01CD01
96,AMDOXOVIR,Amdoxovir,


In [17]:
df1=pd.read_excel("../../hep introuvable (1).xlsx")
df1

Unnamed: 0,Nom_introuvable,DCI,ATC,\n,Unnamed: 4
0,COPEGUS,Ribavirine,J05AP01,\n,
1,RÉBÉTOL,Ribavirine,J05AP01,\n,
2,VIRAFERON PEG,Péginterféron alfa-2b,L03AB10,\n,
3,VIRAFÉRON,Interféron alfa-2b,L03AB05,\n,
4,INTERFERON,\n,type non précisé,\n,
5,COPEGUS / RÉBÉTOL,Ribavirine,J05AP01,\n,
6,HEPSERA,Adéfovir dipivoxil,J05AF08,\n,
7,VIEKIRAX,Ombitasvir / Paritaprevir / Ritonavir,J05AX67,\n,
8,INCIVO,Télaprévir,J05AE11,\n,
9,SOFOSBUVIR/LEDIPASVIR,Sofosbuvir / Ledipasvir,J05AP51,\n,


In [19]:
df1=df1.iloc[:,:3]

In [20]:
df1['ATC'] = (
    df1['ATC']
        .str.replace(r'\s+', '', regex=True)   # enlever espaces, \n, \xa0
        .str.replace('.', '', regex=False)     # enlever les points
)

df1['ATC'] = df1['ATC'].where(df1['ATC'].str.len() <= 7, np.nan)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ATC'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ATC'] = df1['ATC'].where(df1['ATC'].str.len() <= 7, np.nan)


In [40]:
df_final = pd.concat([df, df1], axis=0, ignore_index=True)


In [41]:
df_final

Unnamed: 0,Nom_introuvable,DCI,ATC
0,VIDEX,Didanosine,J05AF02
1,INVIRASE,Saquinavir,J05AE01
2,ZERIT,Stavudine,J05AF04
3,VIRACEPT,Nelfinavir,J05AE04
4,HIVID,Zalcitabine,J05AF03
...,...,...,...
138,MYRCLUDEX,Bulevirtide,J05AX28
139,MK5172A,Grazoprevir,J05AP11
140,ETHAMBUTOL,éthambutol,J04AK02
141,TMC207,diarylquinolines,


In [43]:
vih = pd.read_excel("data/vih.xlsx")
vih.columns

Index(['nom_commercial', 'dci', 'atc'], dtype='object')

In [44]:
df_final.columns = vih.columns

In [45]:
df_final

Unnamed: 0,nom_commercial,dci,atc
0,VIDEX,Didanosine,J05AF02
1,INVIRASE,Saquinavir,J05AE01
2,ZERIT,Stavudine,J05AF04
3,VIRACEPT,Nelfinavir,J05AE04
4,HIVID,Zalcitabine,J05AF03
...,...,...,...
138,MYRCLUDEX,Bulevirtide,J05AX28
139,MK5172A,Grazoprevir,J05AP11
140,ETHAMBUTOL,éthambutol,J04AK02
141,TMC207,diarylquinolines,


In [46]:
vih = pd.concat([vih, df_final], axis=0, ignore_index=True)

In [47]:
vih

Unnamed: 0,nom_commercial,dci,atc
0,"ABACAVIR ARROW 300 mg, comprimé pelliculé sécable",SULFATE D'ABACAVIR,J05AF06
1,"ABACAVIR SANDOZ 300 mg, comprimé pelliculé séc...",ABACAVIR,J05AF06
2,"ABACAVIR VIATRIS 300 mg, comprimé pelliculé sé...",ABACAVIR,J05AF06
3,"ABACAVIR/LAMIVUDINE ARROW 600 mg/300 mg, compr...",LAMIVUDINE + SULFATE D'ABACAVIR,J05AR02
4,"ABACAVIR/LAMIVUDINE BIOGARAN 600 mg/300 mg, co...",ABACAVIR + LAMIVUDINE,J05AR02
...,...,...,...
8244,MYRCLUDEX,Bulevirtide,J05AX28
8245,MK5172A,Grazoprevir,J05AP11
8246,ETHAMBUTOL,éthambutol,J04AK02
8247,TMC207,diarylquinolines,


In [48]:
vih.to_excel("vih.xlsx")