In [26]:
import pandas as pd

df = pd.read_excel("data/out/Prescription.xlsx")

# Nettoyage DCI 

## Étape 1 — Normalisation de base

In [95]:
import numpy as np

df["DCI"] = (
    df["DCI"]
      .astype(str)                # on force en string pour capter "NA", "nan", etc.
      .str.strip()
      .str.upper()
      .replace(
          {
              "NAN": np.nan,
              "NA": np.nan,
              "NULL": np.nan,
              "NONE": np.nan,
              "": np.nan
          }
      )
      .str.replace("’", "'", regex=False)
      .str.replace("-", " ", regex=False)
)


In [27]:
def normalize_text(s):
    return (
        s.upper()
         .strip()
         .replace("’", "'")
         .replace("-", " ")
    )


In [70]:
df["DCI_NORM"] = df["DCI"].astype(str).map(normalize_text)


## Étape 2 — Liste officielle des variations chimiques

In [None]:
# Cette liste vient des bases ANSM / EMA / WHO

In [96]:
SALTS = [
    "SULFATE", "SULFATE DE", "CHLORHYDRATE", "CHLORHYDRATE DE",
    "PHOSPHATE", "PHOSPHATE DE",
    "ACETATE", "ACETATE DE",
    "BROMURE", "BROMURE DE",
    "IODURE", "IODURE DE",
    "NITRATE", "NITRATE DE",
    "TARTRATE", "TARTRATE DE",
    "MESILATE", "MESYLATE",
    "FUMARATE", "FUMARATE DE",
    "CITRATE", "CITRATE DE",
    "SODIQUE", "POTASSIQUE", "CALCIQUE",
    "MONOHYDRATE", "DIHYDRATE", "TRIHYDRATE",
]


## Étape 3 — Supprimer les formes chimiques

In [97]:
import re

pattern = r"\b(" + "|".join(SALTS) + r")\b\s*(DE|D')?\s*"

df["DCI_MERE"] = (
    df["DCI_NORM"]
      .str.replace(pattern, "", regex=True)
      .str.replace(r"\s{2,}", " ", regex=True)
      .str.strip()
)


## Étape 4 — Cas inversés

In [98]:
df["DCI_MERE"] = (
    df["DCI_MERE"]
      .str.replace(r"\b(" + "|".join(SALTS) + r")$", "", regex=True)
      .str.strip()
)


In [75]:
df.head()

Unnamed: 0,ID,Date,Prelibt,CIP,Prepost,DCI,ATC,DOSE,Freq,Durée,dosage,dosage_nature,cip7,Med,DCI_NORM,DCI_MERE,PRELIBT_NORM,PRELIBT_BASE
0,1,2004-04-14,BACTRIM cp Ad,3001069,"1 Comprimé(s), 1 fois / jour pendant 30 jour(s)",SULFAMETHOXAZOLE; TRIMETHOPRIME,,1 Comprimé,1 fois / jour,30 jour(s),400 mg; 80 mg,un comprimé,3001069.0,BACTRIM,SULFAMETHOXAZOLE; TRIMETHOPRIME,SULFAMETHOXAZOLE; TRIMETHOPRIME,BACTRIM CP AD,BACTRIM CP AD
1,1,2004-04-14,KALETRA caps,3566794,"3 Capsule(s), Toutes les 12 heures pendant 30 ...",LOPINAVIR; RITONAVIR,J05AR10,3 Capsule,Toutes les 12 heures,30 jour(s),,,,KALETRA,LOPINAVIR; RITONAVIR,LOPINAVIR; RITONAVIR,KALETRA CAPS,KALETRA CAPS
2,1,2004-04-14,COMBIVIR cp enrobé,3466271,"1 Comprimé(s), Toutes les 12 heures pendant 30...",LAMIVUDINE; ZIDOVUDINE,J05AR01,1 Comprimé,Toutes les 12 heures,30 jour(s),150 mg; 300 mg,un comprimé,3466271.0,COMBIVIR,LAMIVUDINE; ZIDOVUDINE,LAMIVUDINE; ZIDOVUDINE,COMBIVIR CP ENROBÉ,COMBIVIR CP ENROBÉ
3,2,2013-10-01,TRUVADA 200MG/245MG CPR 30,3656563,1 comprimé par jour pendant 1 mois,EMTRICITABINE; TENOFOVIR DISOPROXIL,J05AR03,1 comprimé,par jour,1 mois,200 mg; 300 mg,un comprimé,3656563.0,TRUVADA,EMTRICITABINE; TENOFOVIR DISOPROXIL,EMTRICITABINE; TENOFOVIR DISOPROXIL,TRUVADA 200MG/245MG CPR 30,TRUVADA / CPR 30
4,2,2013-10-01,ISENTRESS 400MG CPR 60,3830848,2 comprimés par jour pendant 1 mois,RALTEGRAVIR POTASSIQUE,J05AJ01,2 comprimés,par jour,1 mois,"434,4 mg",un comprimé,3830848.0,ISENTRESS,RALTEGRAVIR POTASSIQUE,RALTEGRAVIR,ISENTRESS 400MG CPR 60,ISENTRESS CPR 60


In [131]:
df.to_excel("df.xlsx",index=False)

# Grouper dci 2 

## Étape 1 — Nettoyage du libellé

In [99]:
def clean_prelibt(s):
    return (
        str(s).upper()
              .replace("’", "'")
              .replace("-", " ")
              .strip()
    )

df["PRELIBT_NORM"] = df["Prelibt"].map(clean_prelibt)


## Étape 2 — Grouper les médicaments par DCI

In [100]:
grouped2 = (
    df.groupby("PRELIBT_NORM")["DCI_MERE"]
      .agg(
          nb_medicaments="nunique",
          liste_DCI=lambda x: sorted(set(x))
      )
      .reset_index()
)

In [102]:
# grouped2.to_excel("grouped2.xlsx")

In [78]:
grouped2

Unnamed: 0,PRELIBT_NORM,nb_medicaments,liste_DCI
0,(CARBOSYLANE),1,[CHARBON ACTIVE; SIMETICONE]
1,(CYTEAL),1,[CHLOROCRESOL;HEXAMIDINE; SOLUTION DE DIGLUCON...
2,(DEXERYL CR DERM 500G),1,[<NA>]
3,(DEXERYL),1,[<NA>]
4,(ELEVIT VITAMINE B9),1,[ACIDE FOLIQUE;ALPHA TOCOPHEROL; ASCORBATE DE ...
...,...,...,...
15973,ZYRTECSET 10MG CPR SECABLE 7 PR12,1,[CETIRIZINE]
15974,ZYTIGA 250MG CPR 120,1,[ABIRATERONE]
15975,ZYTIGA 500MG CPR 60,1,[ABIRATERONE]
15976,ZYVOXID 2MG/ML SOL PERF 300ML 10,1,[LINEZOLIDE]


## Étape 3 — Détection automatique des divergences

In [112]:
grouped2["divergence"] = grouped2["nb_medicaments"] > 1


In [113]:
suspects2 = grouped2[grouped2["divergence"]]


In [114]:
suspects2

Unnamed: 0,PRELIBT_NORM,nb_medicaments,liste_DCI,divergence,dci_similarity,categorie
20,ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ P...,3,"[ABACAVIR ; LAMIVUDINE, ABACAVIR; LAMIVUDINE, ...",True,0.975610,typo_normalisation
21,ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ P...,2,"[ABACAVIR ; LAMIVUDINE, ABACAVIR; LAMIVUDINE]",True,0.975610,typo_normalisation
210,"ACIDE FOLIQUE 0,4 MG COMPRIMÉ",2,"[ACIDE ACETYLSALICYLIQUE, ACIDE FOLIQUE]",True,0.611111,combinaison_possible
533,ALGINATE DE SODIUM + BICARBONATE DE SODIUM 500...,2,"[ALGINATE DE SODIUM; BICARBONATE DE SODIUM, SO...",True,0.509091,combinaison_possible
534,ALGINATE DE SODIUM + BICARBONATE DE SODIUM 500...,2,"[ALGINATE DE SODIUM; BICARBONATE DE SODIUM, SO...",True,0.509091,combinaison_possible
...,...,...,...,...,...,...
15071,"VALSARTAN/HYD 160/12,5MG ARW CPR30",2,"[<NA>, HYDROCHLOROTHIAZIDE; VALSARTAN]",True,0.058824,erreur_grave
15074,VALSARTAN/HYD 160/25MG ARW CPR 30,2,"[<NA>, HYDROCHLOROTHIAZIDE; VALSARTAN]",True,0.058824,erreur_grave
15456,VITAMINE C 1 000 MG COMPRIMÉ EFFERVESCENT,2,"[ACIDE ASCORBIQUE, ASCORBIQUE]",True,0.769231,sel_forme
15457,VITAMINE C 1 G COMPRIMÉ EFFERVESCENT,2,"[ACIDE ASCORBIQUE, ASCORBIQUE]",True,0.769231,sel_forme


## Étape 4 — Mesurer la similarité entre dci

In [115]:
from difflib import SequenceMatcher
from itertools import combinations

def dci_similarity(dcis):
    if len(dcis) < 2:
        return 1.0
    sims = [
        SequenceMatcher(None, a, b).ratio()
        for a, b in combinations(dcis, 2)
    ]
    return max(sims)  # max, pas moyenne


In [116]:
grouped2["dci_similarity"] = grouped2["liste_DCI"].apply(dci_similarity)


In [117]:
def categorie(sim):
    if sim >= 0.9:
        return "typo_normalisation"
    elif sim >= 0.75:
        return "sel_forme"
    elif sim >= 0.5:
        return "combinaison_possible"
    else:
        return "erreur_grave"


In [118]:
grouped2["categorie"] = grouped2["dci_similarity"].apply(categorie)


In [119]:
grouped2[grouped2["categorie"]=="typo_normalisation"]

Unnamed: 0,PRELIBT_NORM,nb_medicaments,liste_DCI,divergence,dci_similarity,categorie
0,(CARBOSYLANE),1,[CHARBON ACTIVE; SIMETICONE],False,1.0,typo_normalisation
1,(CYTEAL),1,[CHLOROCRESOL;HEXAMIDINE; SOLUTION DE DIGLUCON...,False,1.0,typo_normalisation
2,(DEXERYL CR DERM 500G),1,[<NA>],False,1.0,typo_normalisation
3,(DEXERYL),1,[<NA>],False,1.0,typo_normalisation
4,(ELEVIT VITAMINE B9),1,[ACIDE FOLIQUE;ALPHA TOCOPHEROL; ASCORBATE DE ...,False,1.0,typo_normalisation
...,...,...,...,...,...,...
15973,ZYRTECSET 10MG CPR SECABLE 7 PR12,1,[CETIRIZINE],False,1.0,typo_normalisation
15974,ZYTIGA 250MG CPR 120,1,[ABIRATERONE],False,1.0,typo_normalisation
15975,ZYTIGA 500MG CPR 60,1,[ABIRATERONE],False,1.0,typo_normalisation
15976,ZYVOXID 2MG/ML SOL PERF 300ML 10,1,[LINEZOLIDE],False,1.0,typo_normalisation


In [120]:
from collections import Counter

def dci_majoritaire(dcis):
    return Counter(dcis).most_common(1)[0][0]

grouped2["DCI_CORRIGEE"] = grouped2["liste_DCI"].apply(dci_majoritaire)


In [129]:
grouped2[grouped2["categorie"] == "erreur_grave"].to_excel(
    "audit_erreurs_dci.xlsx", index=False
)


In [126]:
g=grouped2.drop_duplicates(subset="liste_DCI")
g.to_excel("grouped2_unique.xlsx",index=False)

In [128]:
grouped2.to_excel("grouped2.xlsx",index=False)


# appliquer correctionn

## Étape 1 — Construire la table de correction

In [None]:
corrections = grouped2[
    grouped2["categorie"].isin(["typo_normalisation", "sel_forme"])
][["PRELIBT_NORM", "DCI_CORRIGEE"]]


## Étape 2 — Joindre les corrections au DataFrame initial

In [None]:
df = df.merge(
    corrections,
    on="PRELIBT_NORM",
    how="left"
)


## Étape 3 — Appliquer la correction (sans perdre l’original)

In [None]:
df["DCI_FINALE"] = df["DCI_CORRIGEE"].combine_first(df["DCI_MERE"])


## Étape 4 — Traçabilité (très important)

In [None]:
df = df.merge(
    grouped2[["PRELIBT_NORM", "categorie"]],
    on="PRELIBT_NORM",
    how="left"
)


## Étape 5 — Vérification rapide

In [None]:
df[df["DCI_CORRIGEE"].notna()][
    ["PRELIBT_NORM", "DCI_MERE", "DCI_CORRIGEE", "DCI_FINALE", "categorie"]
].head(20)


## Cas à NE PAS corriger automatiquement

In [None]:
df.loc[
    df["categorie"].isin(["combinaison_possible", "erreur_grave"]),
    "DCI_FINALE"
] = df["DCI_MERE"]


# Grouper dci 

## Étape 1 — Nettoyage du libellé

In [35]:
def clean_prelibt(s):
    return (
        str(s).upper()
              .replace("’", "'")
              .replace("-", " ")
              .strip()
    )

df["PRELIBT_NORM"] = df["Prelibt"].map(clean_prelibt)


## Étape 2 — Grouper les médicaments par DCI

In [36]:
grouped = (
    df.groupby("DCI_MERE")["PRELIBT_NORM"]
      .agg(
          nb_medicaments="nunique",
          liste_prelibt=lambda x: sorted(set(x))
      )
      .reset_index()
)


In [55]:
grouped

Unnamed: 0,DCI_MERE,nb_medicaments,liste_prelibt,divergence
0,ABACAVIR,14,"[ABACAVIR 20 MG/ML SOLUTION BUVABLE, ABACAVIR ...",True
1,ABACAVIR ; LAMIVUDINE,8,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True
2,ABACAVIR; LAMIVUDINE,14,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True
3,ABIRATERONE,3,"[ABIRATÉRONE 500 MG COMPRIMÉ, ZYTIGA 250MG CPR...",True
4,ACAMPROSATE,6,[ACAMPROSATE 333 MG COMPRIMÉ ENROBÉ GASTRORÉSI...,True
...,...,...,...,...
1494,ZOLMITRIPTAN,14,"[ZOLMITRIPTAN 2,5 MG COMPRIMÉ ORODISPERSIBLE, ...",True
1495,ZOLPIDEM,34,[STILNOX 10 MG COMPRIMÉ PELLICULÉ SÉCABLE BOÎT...,True
1496,ZONISAMIDE,3,"[ZONEGRAN 100MG GELULE 56, ZONEGRAN 25MG GELUL...",True
1497,ZOPICLONE,50,"[IMOVANE 3,75 MG COMPRIMÉ PELLICULÉ BOÎTE DE 1...",True


## Étape 3 — Détection automatique des divergences

In [38]:
grouped["divergence"] = grouped["nb_medicaments"] > 1


In [39]:
suspects = grouped[grouped["divergence"]]


In [40]:
suspects

Unnamed: 0,DCI_MERE,nb_medicaments,liste_prelibt,divergence
0,ABACAVIR,14,"[ABACAVIR 20 MG/ML SOLUTION BUVABLE, ABACAVIR ...",True
1,ABACAVIR ; LAMIVUDINE,8,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True
2,ABACAVIR; LAMIVUDINE,14,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True
3,ABIRATERONE,3,"[ABIRATÉRONE 500 MG COMPRIMÉ, ZYTIGA 250MG CPR...",True
4,ACAMPROSATE,6,[ACAMPROSATE 333 MG COMPRIMÉ ENROBÉ GASTRORÉSI...,True
...,...,...,...,...
1494,ZOLMITRIPTAN,14,"[ZOLMITRIPTAN 2,5 MG COMPRIMÉ ORODISPERSIBLE, ...",True
1495,ZOLPIDEM,34,[STILNOX 10 MG COMPRIMÉ PELLICULÉ SÉCABLE BOÎT...,True
1496,ZONISAMIDE,3,"[ZONEGRAN 100MG GELULE 56, ZONEGRAN 25MG GELUL...",True
1497,ZOPICLONE,50,"[IMOVANE 3,75 MG COMPRIMÉ PELLICULÉ BOÎTE DE 1...",True


## Étape 4 — Mesurer la similarité entre libellés 

In [41]:
import re

def strip_dosage(s):
    s = re.sub(r"\b\d+(\.\d+)?\s*(MG|G|ML|MCG|UI)\b", "", s)
    s = re.sub(r"\b(COMPRIME|GELULE|SOLUTION|SIROP)\b", "", s)
    return re.sub(r"\s{2,}", " ", s).strip()


In [42]:
df["PRELIBT_BASE"] = df["PRELIBT_NORM"].map(strip_dosage)


In [44]:
from difflib import SequenceMatcher
from itertools import combinations

def avg_similarity(values):
    if len(values) < 2:
        return 1.0
    sims = [
        SequenceMatcher(None, a, b).ratio()
        for a, b in combinations(values, 2)
    ]
    return sum(sims) / len(sims)


In [45]:
sim = (
    df.groupby("DCI_MERE")["PRELIBT_BASE"]
      .apply(lambda x: avg_similarity(set(x)))
      .reset_index(name="similarite_moyenne")
)


In [47]:
sim

Unnamed: 0,DCI_MERE,similarite_moyenne
0,ABACAVIR,0.433995
1,ABACAVIR ; LAMIVUDINE,0.645076
2,ABACAVIR; LAMIVUDINE,0.453669
3,ABIRATERONE,0.495346
4,ACAMPROSATE,0.482199
...,...,...
1494,ZOLMITRIPTAN,0.558415
1495,ZOLPIDEM,0.598043
1496,ZONISAMIDE,0.818182
1497,ZOPICLONE,0.593918


## Étape 5 — Identifier les incohérences

In [48]:
analysis = grouped.merge(sim, on="DCI_MERE")


In [49]:
analysis["incoherent"] = analysis["similarite_moyenne"] < 0.75


In [51]:
analysis["score_coherence"] = (
    analysis["similarite_moyenne"]
    * (1 / analysis["nb_medicaments"])
)


In [52]:
analysis

Unnamed: 0,DCI_MERE,nb_medicaments,liste_prelibt,divergence,similarite_moyenne,incoherent,score_coherence
0,ABACAVIR,14,"[ABACAVIR 20 MG/ML SOLUTION BUVABLE, ABACAVIR ...",True,0.433995,True,0.031000
1,ABACAVIR ; LAMIVUDINE,8,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True,0.645076,True,0.080635
2,ABACAVIR; LAMIVUDINE,14,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True,0.453669,True,0.032405
3,ABIRATERONE,3,"[ABIRATÉRONE 500 MG COMPRIMÉ, ZYTIGA 250MG CPR...",True,0.495346,True,0.165115
4,ACAMPROSATE,6,[ACAMPROSATE 333 MG COMPRIMÉ ENROBÉ GASTRORÉSI...,True,0.482199,True,0.080367
...,...,...,...,...,...,...,...
1494,ZOLMITRIPTAN,14,"[ZOLMITRIPTAN 2,5 MG COMPRIMÉ ORODISPERSIBLE, ...",True,0.558415,True,0.039887
1495,ZOLPIDEM,34,[STILNOX 10 MG COMPRIMÉ PELLICULÉ SÉCABLE BOÎT...,True,0.598043,True,0.017589
1496,ZONISAMIDE,3,"[ZONEGRAN 100MG GELULE 56, ZONEGRAN 25MG GELUL...",True,0.818182,False,0.272727
1497,ZOPICLONE,50,"[IMOVANE 3,75 MG COMPRIMÉ PELLICULÉ BOÎTE DE 1...",True,0.593918,True,0.011878


In [53]:
analysis.sort_values("similarite_moyenne").head(20)


Unnamed: 0,DCI_MERE,nb_medicaments,liste_prelibt,divergence,similarite_moyenne,incoherent,score_coherence
52,ACIDE SALICYLIQUE,3,"[CIELLA 0,1% SOL LAV OPHT DOSE 20, CORICIDE LE...",True,0.147894,True,0.049298
328,CALCITRIOL,2,"[ROCALTROL 0,25MCG CAPS 30, SILKIS 3MCG/G POM ...",True,0.162162,True,0.081081
883,IRBESARTAN; HYDROCHLOROTHIAZIDE,2,"[IFIRMACOMBI GE 300/25MG CPR 90, IRBÉSARTAN + ...",True,0.191489,True,0.095745
1159,PEGINTERFERON BETA 1A,2,[PEGINTERFÉRON BÊTA 1A 125 MICROGRAMMES SOLUTI...,True,0.195122,True,0.097561
589,DOXORUBICINE,2,"[CAELYX SOL À DILUER P PERF IV 2 MG/ML, DOXORU...",True,0.210526,True,0.105263
1203,POLYOSIDE CAPSULAIRE VI DE SALMONELLA TYPHI; V...,2,"[TYAVAX SER 1ML 1, VACCIN FIÈVRE TYPHOÏDE + HÉ...",True,0.212766,True,0.106383
1061,MYCOPHENOLATE,3,[ACIDE MYCOPHÉNOLIQUE 360 MG COMPRIMÉ GASTRORÉ...,True,0.222222,True,0.074074
1350,SONIDEGIB,2,"[ODOMZO 200MG GELULE 30X1, SONIDEGIB 200 MG GÉ...",True,0.222222,True,0.111111
544,DICHLORHYDRATE DE ZUCLOPENTHIXOL,2,"[CLOPIXOL 2% SOL BUV 20ML, ZUCLOPENTHIXOL 2 % ...",True,0.238806,True,0.119403
115,ALPROSTADIL,5,[ALPROSTADIL 20 ΜG/1 ML POUDRE ET SOLVANT POUR...,True,0.242353,True,0.048471


In [57]:
analysis[analysis["incoherent"]==True]

Unnamed: 0,DCI_MERE,nb_medicaments,liste_prelibt,divergence,similarite_moyenne,incoherent,score_coherence
0,ABACAVIR,14,"[ABACAVIR 20 MG/ML SOLUTION BUVABLE, ABACAVIR ...",True,0.433995,True,0.031000
1,ABACAVIR ; LAMIVUDINE,8,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True,0.645076,True,0.080635
2,ABACAVIR; LAMIVUDINE,14,[ABACAVIR + LAMIVUDINE 600 MG/300 MG COMPRIMÉ ...,True,0.453669,True,0.032405
3,ABIRATERONE,3,"[ABIRATÉRONE 500 MG COMPRIMÉ, ZYTIGA 250MG CPR...",True,0.495346,True,0.165115
4,ACAMPROSATE,6,[ACAMPROSATE 333 MG COMPRIMÉ ENROBÉ GASTRORÉSI...,True,0.482199,True,0.080367
...,...,...,...,...,...,...,...
1491,ZIDOVUDINE,13,"[RETROVIR 100MG GELULE 100, RETROVIR 100MG GEL...",True,0.564254,True,0.043404
1494,ZOLMITRIPTAN,14,"[ZOLMITRIPTAN 2,5 MG COMPRIMÉ ORODISPERSIBLE, ...",True,0.558415,True,0.039887
1495,ZOLPIDEM,34,[STILNOX 10 MG COMPRIMÉ PELLICULÉ SÉCABLE BOÎT...,True,0.598043,True,0.017589
1497,ZOPICLONE,50,"[IMOVANE 3,75 MG COMPRIMÉ PELLICULÉ BOÎTE DE 1...",True,0.593918,True,0.011878


In [58]:
analysis.to_excel("analysis.xlsx")

# 111 

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,DCI
0,0,CHLORHYDRATE DE VALACICLOVIR
1,1,FUMARATE DE TENOFOVIR ALAFENAMIDE
2,2,FUMARATE DE TENOFOVIR ALAFENAMIDE
3,3,CHLORHYDRATE DE RILPIVIRINE
4,4,CLAVULANATE DE POTASSIUM


In [23]:
import unidecode
df["DCI"] = (
df["DCI"]
    .astype(str)
    .apply(lambda x: unidecode.unidecode(x))  # enlever les accents
    .str.upper()                              # majuscules
    .str.strip()                              # trim
)


In [24]:
df.shape

(703, 2)

In [25]:
df["DCI"].unique().shape

(458,)

# en code

In [6]:
df.shape

(520, 14)

In [3]:
df_d  = df[["DCI"]]

In [9]:
df_d = (
    df_d
    .assign(DCI=df_d["DCI"].str.split(";"))
    .explode("DCI")
)

In [10]:
df_d.shape

(1073, 2)

In [13]:
df_d=df_d.drop("cip7",axis=1)

In [14]:
df_d

Unnamed: 0,DCI
0,CHLORHYDRATE DE VALACICLOVIR
1,COBICISTAT
1,ELVITEGRAVIR
1,EMTRICITABINE
1,FUMARATE DE TENOFOVIR ALAFENAMIDE
...,...
518,AUBEPINE
518,BROMURE DE POTASSIUM
518,BROMURE DE SODIUM
518,PASSIFLORE


In [15]:
df_d=df_d[df_d["DCI"].str.contains(r"\bDE\b", na=False)]

In [18]:
df_d.drop_duplicates(subset=["DCI"])

Unnamed: 0,DCI
0,CHLORHYDRATE DE VALACICLOVIR
1,FUMARATE DE TENOFOVIR ALAFENAMIDE
3,CHLORHYDRATE DE RILPIVIRINE
4,CLAVULANATE DE POTASSIUM
5,FUMARATE DE TENOFOVIR DISOPROXIL
...,...
516,IODURE DE POTASSIUM
516,MOLYBDATE DE SODIUM DIHYDRATE
518,BROMURE DE POTASSIUM
518,BROMURE DE SODIUM


In [19]:
df_d.to_excel("data/out/df_de_dci.xlsx")