In [29]:
import pandas as pd
import re
from difflib import get_close_matches

NORMALISATION DES TRAITEMENTS VIH

In [30]:
# ------------------------------------------------------------------
# 1. Load the input files
# ------------------------------------------------------------------
# Reference workbook; its "Med" and "DCI" columns are used to build the drug dictionary.
df_ref = pd.read_excel("resul_123456789.xlsx")
# Workbook with the treatment lines to normalise; its "ARV" column is processed later.
df_arv = pd.read_excel("data/arv.xlsx")

# Report how many rows each workbook contains.
print(f"‚Üí {len(df_ref)} m√©dicaments dans le dictionnaire VIH")
print(f"‚Üí {len(df_arv)} lignes ARV √† traiter")


‚Üí 207 m√©dicaments dans le dictionnaire VIH
‚Üí 19220 lignes ARV √† traiter


In [31]:
# ------------------------------------------------------------------
# 2. Prepare the HIV drug dictionary
# ------------------------------------------------------------------
# Derive string-typed working columns from the raw ones; missing cells become "".
for colonne_cible, colonne_source in (("nom_commercial", "Med"), ("dci", "DCI")):
    df_ref[colonne_cible] = df_ref[colonne_source].fillna("").astype(str)


In [32]:
# Drug list: one record (full name, base name, lower-cased base name, DCI)
# for every dictionary row that has both a trade name and a DCI.
medicaments = []

for nom_brut, dci_brut in zip(df_ref["nom_commercial"], df_ref["dci"]):
    nom, dci = nom_brut.strip(), dci_brut.strip()
    if not (nom and dci):
        continue

    # Base name: text before the first whitespace+digit, comma, or whitespace+"(",
    # stripped and upper-cased.
    nom_base = re.split(r'\s+\d|,|\s+\(', nom)[0].strip().upper()
    fiche = {
        "nom_complet": nom,
        "nom_base": nom_base,
        "nom_base_lower": nom_base.lower(),
        "dci": dci
    }
    medicaments.append(fiche)

In [33]:
# Set of lower-cased DCI strings for fast membership tests: each full DCI,
# plus each non-empty component obtained by splitting it on "+" or "/".
dci_set = set()
for dci_brute in df_ref["dci"]:
    if not dci_brute:
        continue
    dci_set.add(dci_brute.lower())
    dci_set.update(
        morceau.strip().lower()
        for morceau in re.split(r"[\+/]", dci_brute)
        if morceau.strip()
    )

print(f"Dictionnaire charg√© : {len(medicaments)} m√©dicaments")

Dictionnaire charg√© : 206 m√©dicaments


In [34]:
# ------------------------------------------------------------------
# 3. Fonctions utilitaires simples
# ------------------------------------------------------------------

def extraire_nom_commercial(txt):
    """Return the stripped text inside the first non-empty "(...)" group of ``txt``.

    Returns None when ``txt`` contains no parenthesised group with at least
    one character between the parentheses.
    """
    trouve = re.search(r"\(([^)]+)\)", txt)
    if trouve is None:
        return None
    return trouve.group(1).strip()

def enlever_parentheses(txt):
    """Return ``txt`` with every "(...)" group (and the whitespace before it) removed, then stripped."""
    sans_groupes = re.sub(r"\s*\([^)]*\)", "", txt)
    return sans_groupes.strip()

def _dci_du_dictionnaire(cible):
    """Return the dictionary DCI matching the lower-cased DCI text ``cible``, or None.

    An entry whose DCI is exactly ``cible`` wins, so a single substance is not
    resolved to a combination that merely contains it. Only when no entry is
    equal does the first entry whose DCI contains ``cible`` get returned.
    """
    cible_nette = cible.strip()
    for med in medicaments:
        if med["dci"].lower() == cible_nette:
            return med["dci"]
    for med in medicaments:
        if cible in med["dci"].lower():
            return med["dci"]
    return None


def chercher_dci(med_str):
    """Resolve one free-text drug label to a DCI from the reference dictionary.

    The lookup key is the text inside the first parenthesised group when there
    is one (treated as the trade name), otherwise the label with parenthesised
    groups removed. The key is then tried, in order, against the module-level
    ``medicaments`` list and ``dci_set``:

    1. exact match on the base name;
    2. prefix match in either direction on the base name;
    3. substring of the full name;
    4. approximate match on the base name (cutoff 0.8);
    5. key is itself a known DCI;
    6. approximate match on a known DCI (cutoff 0.9).

    Args:
        med_str: a single drug label (str); may be empty or None.

    Returns:
        "VIDE" for an empty label, the matched DCI string, or
        "INTROUVABLE (<key>)" when nothing matches.
    """
    if not med_str or not med_str.strip():
        return "VIDE"

    med_str = med_str.strip()

    # Prefer the parenthesised text as the trade name; otherwise use the
    # label without its parenthesised groups.
    nom_com = extraire_nom_commercial(med_str)
    if nom_com:
        cle = nom_com.strip().upper()
    else:
        cle = enlever_parentheses(med_str).upper()

    cle_lower = cle.lower()

    # A label such as "()" leaves an empty key. The empty string is a prefix
    # and a substring of every name, so the matching steps below would return
    # the first dictionary entry; report it as not found instead.
    if not cle:
        return f"INTROUVABLE ({cle})"

    # Exact match on the base name.
    for med in medicaments:
        if med["nom_base"] == cle:
            return med["dci"]

    # Prefix match in either direction. Entries with an empty base name are
    # skipped because every key starts with the empty string.
    for med in medicaments:
        base = med["nom_base"]
        if base and (base.startswith(cle) or cle.startswith(base)):
            return med["dci"]

    # Key contained in the full commercial name.
    for med in medicaments:
        if cle in med["nom_complet"].upper():
            return med["dci"]

    # Approximate match on the base name.
    bases = [m["nom_base_lower"] for m in medicaments]
    match = get_close_matches(cle_lower, bases, n=1, cutoff=0.8)
    if match:
        for m in medicaments:
            if m["nom_base_lower"] == match[0]:
                return m["dci"]

    # The key is already a DCI (or a component of one).
    if cle_lower in dci_set:
        dci = _dci_du_dictionnaire(cle_lower)
        if dci is not None:
            return dci

    # Approximate match against the known DCI strings.
    match = get_close_matches(cle_lower, list(dci_set), n=1, cutoff=0.9)
    if match:
        dci = _dci_du_dictionnaire(match[0])
        if dci is not None:
            return dci

    return f"INTROUVABLE ({cle})"


In [35]:
# ------------------------------------------------------------------
# 4. Normalisation du traitement ARV
# ------------------------------------------------------------------

def normaliser_traitement(traitement):
    """Convert a "+"-separated treatment string into a "+"-separated DCI string.

    Each non-empty item is resolved with ``chercher_dci``; any " + " inside a
    resolved DCI is rewritten as " / " so that " + " only ever separates the
    drugs of the treatment in the result.

    Args:
        traitement: the raw treatment cell (str, NaN or None).

    Returns:
        "VIDE" when the cell is missing, blank, or contains nothing but "+"
        separators; otherwise the resolved DCIs joined with " + ".
    """
    if pd.isna(traitement) or not str(traitement).strip():
        return "VIDE"

    items = [t.strip() for t in str(traitement).split("+")]

    dcis = []
    for item in items:
        if item:
            d = chercher_dci(item)
            d = d.replace(" + ", " / ")
            dcis.append(d)

    # A cell such as "+" or " + " holds no drug at all: report it as empty
    # rather than returning "" (which would be counted as a successful match).
    if not dcis:
        return "VIDE"

    return " + ".join(dcis)

In [36]:
# ------------------------------------------------------------------
# 5. Apply to the whole DataFrame
# ------------------------------------------------------------------

print("\nTraitement en cours...")
# Resolve every treatment cell of the "ARV" column into a new "DCI" column.
df_arv["DCI"] = df_arv["ARV"].apply(normaliser_traitement)

# A row counts as not found as soon as its DCI text contains "INTROUVABLE";
# a row counts as empty only when its DCI text is exactly "VIDE".
nb_introuv = df_arv["DCI"].str.contains("INTROUVABLE", na=False).sum()
nb_vides = (df_arv["DCI"] == "VIDE").sum()

# Summary: found = total rows minus not-found rows minus empty rows.
print(f"‚úì Normalisation termin√©e")
print(f"  ‚Üí Trouv√©s : {len(df_arv) - nb_introuv - nb_vides}")
print(f"  ‚Üí Introuvables : {nb_introuv}")
print(f"  ‚Üí Vides : {nb_vides}")



Traitement en cours...
‚úì Normalisation termin√©e
  ‚Üí Trouv√©s : 19119
  ‚Üí Introuvables : 101
  ‚Üí Vides : 0


In [37]:
# ------------------------------------------------------------------
# 6. Save
# ------------------------------------------------------------------
# Keep the output path in one place so the confirmation message always names
# the file that was actually written (the message previously named
# "data/out/arv_dci.xlsx" while the file written was "data/out/arv_dci1.xlsx").
chemin_sortie_dci = "data/out/arv_dci1.xlsx"
df_arv.to_excel(chemin_sortie_dci, index=False)
print(f"\nFichier cr√©√© : {chemin_sortie_dci}")
print("="*60)



Fichier cr√©√© : data/out/arv_dci.xlsx


# ATC

In [47]:
print("Lecture des fichiers...")
# Treatment lines with the "DCI" column, as written by the save step above.
df_arv = pd.read_excel("data/out/arv_dci1.xlsx")
# Reference workbook; its "DCI" and "ATC" columns are used to build the mapping below.
df_vih = pd.read_excel("resul_123456789.xlsx")

# Report the number of rows read from each workbook.
print(f"   ‚Üí {len(df_arv)} lignes ARV")
print(f"   ‚Üí {len(df_vih)} lignes VIH")

Lecture des fichiers...
   ‚Üí 19220 lignes ARV
   ‚Üí 207 lignes VIH


In [48]:
# ------------------------------------------------------------
# 1) NORMALISER LES DCI
# ------------------------------------------------------------
def norm(x):
    """Return ``str(x)`` lower-cased, with surrounding whitespace removed and inner runs collapsed to one space."""
    mots = str(x).lower().split()
    return " ".join(mots)


In [49]:
# ------------------------------------------------------------
# 2) CR√âATION DU MAPPING DCI ‚Üí ATC
# ------------------------------------------------------------

# Normalised DCI text -> ATC code, built from the reference workbook.
mapping = {}

for dci, atc in zip(df_vih["DCI"], df_vih["ATC"]):
    # A blank cell would otherwise be stringified to the literal "nan" and end
    # up either as a bogus key or as a bogus ATC code that can overwrite a
    # valid one for the same DCI. Skip such rows.
    if pd.isna(dci) or pd.isna(atc):
        continue
    dci = str(dci)
    atc = str(atc)
    if not dci.strip() or not atc.strip():
        continue

    # Full DCI text (later rows overwrite earlier ones with the same key)
    dci_norm = norm(dci)
    mapping[dci_norm] = atc

    # Combination written with "+" (e.g. lamivudine + zidovudine): register each
    # component too, without overriding a key that already exists. Empty
    # fragments (e.g. from a trailing "+") are ignored.
    if "+" in dci:
        parts = [norm(p) for p in dci.split("+")]
        for p in parts:
            if p and p not in mapping:
                mapping[p] = atc

    # Combination written with "/": also register the "+" spelling
    if "/" in dci:
        dci_slash = norm(dci.replace("/", "+"))
        mapping[dci_slash] = atc

print(f"üîó Mapping cr√©√© : {len(mapping)} cl√©s DCI")


üîó Mapping cr√©√© : 109 cl√©s DCI


In [50]:
# ------------------------------------------------------------
# 3) G√âN√âRATION DE LA COLONNE ATC
# ------------------------------------------------------------

def _atc_pour_dci(med):
    """Return the ATC code registered in ``mapping`` for one DCI fragment, or "NON_TROUVE"."""
    med_norm = norm(med)

    # 1) direct lookup on the normalised text
    if med_norm in mapping:
        return mapping[med_norm]

    # 2) same text with "/" rewritten as "+"
    if "/" in med:
        med_plus = med_norm.replace("/", "+")
        if med_plus in mapping:
            return mapping[med_plus]

    # 3) combination: components re-joined with " + ", first in the given
    #    order, then in reversed order
    if "/" in med or "+" in med:
        parts = [norm(p) for p in re.split(r"[+/]", med)]
        for candidat in (" + ".join(parts), " + ".join(parts[::-1])):
            if candidat in mapping:
                return mapping[candidat]

    # 4) nothing matched
    return "NON_TROUVE"


# One output string per treatment row: the row's DCI text is split on "+",
# each fragment is looked up, and the codes are joined back with " + ".
ATC_output = [
    " + ".join(_atc_pour_dci(m.strip()) for m in str(row["DCI"]).split("+"))
    for _, row in df_arv.iterrows()
]



In [51]:

# ------------------------------------------------------------
# 4) ADD THE COLUMN AND SAVE
# ------------------------------------------------------------

# ATC_output holds one entry per row of df_arv, in row order.
df_arv["ATC"] = ATC_output

out = "data/out/arv_dci_atc001.xlsx"
df_arv.to_excel(out, index=False)

print("\n Fichier g√©n√©r√© :", out)
print("\n APER√áU :")
print(df_arv[["ARV", "DCI", "ATC"]].head(10).to_string())

# Statistics: number of rows in which at least one drug has no ATC code
nb_missing = df_arv["ATC"].str.contains("NON_TROUVE").sum()
print("\n ATC NON TROUV√âS :", nb_missing)



 Fichier g√©n√©r√© : data/out/arv_dci_atc001.xlsx

 APER√áU :
                                   ARV                                                             DCI                          ATC
0                   Combivir + Kal√©tra                   LAMIVUDINE; ZIDOVUDINE + LOPINAVIR; RITONAVIR            J05AR01 + J05AR10
1                  Isentress + Truvada               RALTEGRAVIR + EMTRICITABINE; TENOFOVIR DISOPROXIL            J05AJ01 + J05AR03
2                   Isentress + Kivexa                             RALTEGRAVIR + ABACAVIR ; LAMIVUDINE            J05AJ01 + J05AR02
3                              Triumeq                             DOLUTEGRAVIR ; LAMIVUDINE; ABACAVIR                      J05AR13
4                              Genvoya  COBICISTAT; ELVITEGRAVIR; EMTRICITABINE; TENOFOVIR ALAFENAMIDE                      J05AR18
5                             Biktarvy              BICTEGRAVIR ; EMTRICITABINE; TENOFOVIR ALAFENAMIDE                      J05AR20
6   Lamivudi

In [53]:
import pandas as pd
import numpy as np

# Load the workbook written by the ATC step
df = pd.read_excel("data/out/arv_dci_atc001.xlsx")

# Replace NaN with empty strings so the "+" splits below always operate on text
df[["ARV", "DCI", "ATC"]] = df[["ARV", "DCI", "ATC"]].fillna("")

# Number of drugs per treatment, and the maximum over the whole file.
# Empty fragments are not counted, matching how process_row drops them:
# a blank ARV cell counts 0 drugs (not 1) and "A + " counts 1 (not 2).
df["n_meds"] = df["ARV"].apply(lambda x: len([m for m in str(x).split("+") if m.strip()]))
max_meds = df["n_meds"].max()

# Fonction de transformation
def process_row(row):
    """Spread one treatment row into per-drug columns.

    The "ARV", "DCI" and "ATC" cells are each split on "+", with blank
    fragments dropped. For every position from 1 to the module-level
    ``max_meds`` the result holds ``Med_<k>``, ``DCI_Med<k>`` and
    ``ATC_Med<k>``, filled with the fragment at that position or NaN when the
    corresponding list is shorter.

    Returns:
        pd.Series indexed by the generated column names.
    """
    def _fragments(colonne):
        # Non-empty, stripped pieces of the "+"-separated cell.
        return [piece.strip() for piece in str(row[colonne]).split("+") if piece.strip() != ""]

    def _element(liste, position):
        # Fragment at ``position`` or NaN past the end of the list.
        if position < len(liste):
            return liste[position]
        return np.nan

    arv_list = _fragments("ARV")
    dci_list = _fragments("DCI")
    atc_list = _fragments("ATC")

    data = {}
    for i in range(max_meds):
        data[f"Med_{i+1}"] = _element(arv_list, i)
        data[f"DCI_Med{i+1}"] = _element(dci_list, i)
        data[f"ATC_Med{i+1}"] = _element(atc_list, i)

    return pd.Series(data)

# Apply process_row to every row; the result has one column per generated name
new_cols = df.apply(process_row, axis=1)

# Append the per-drug columns to the right of the original columns
df_final = pd.concat([df, new_cols], axis=1)

df_final.head()


Unnamed: 0,ARV,DCI,ATC,n_meds,Med_1,DCI_Med1,ATC_Med1,Med_2,DCI_Med2,ATC_Med2,...,ATC_Med8,Med_9,DCI_Med9,ATC_Med9,Med_10,DCI_Med10,ATC_Med10,Med_11,DCI_Med11,ATC_Med11
0,Combivir + Kal√©tra,LAMIVUDINE; ZIDOVUDINE + LOPINAVIR; RITONAVIR,J05AR01 + J05AR10,2,Combivir,LAMIVUDINE; ZIDOVUDINE,J05AR01,Kal√©tra,LOPINAVIR; RITONAVIR,J05AR10,...,,,,,,,,,,
1,Isentress + Truvada,RALTEGRAVIR + EMTRICITABINE; TENOFOVIR DISOPROXIL,J05AJ01 + J05AR03,2,Isentress,RALTEGRAVIR,J05AJ01,Truvada,EMTRICITABINE; TENOFOVIR DISOPROXIL,J05AR03,...,,,,,,,,,,
2,Isentress + Kivexa,RALTEGRAVIR + ABACAVIR ; LAMIVUDINE,J05AJ01 + J05AR02,2,Isentress,RALTEGRAVIR,J05AJ01,Kivexa,ABACAVIR ; LAMIVUDINE,J05AR02,...,,,,,,,,,,
3,Triumeq,DOLUTEGRAVIR ; LAMIVUDINE; ABACAVIR,J05AR13,1,Triumeq,DOLUTEGRAVIR ; LAMIVUDINE; ABACAVIR,J05AR13,,,,...,,,,,,,,,,
4,Genvoya,COBICISTAT; ELVITEGRAVIR; EMTRICITABINE; TENOF...,J05AR18,1,Genvoya,COBICISTAT; ELVITEGRAVIR; EMTRICITABINE; TENOF...,J05AR18,,,,...,,,,,,,,,,


In [56]:
# Save the cleaned table.
# NOTE(review): df_clean is assigned by `df_clean = clean_dataframe(df_final)` in a cell that
# appears further down in this file. On a fresh top-to-bottom run this line raises NameError
# unless that cell has been executed first; consider moving this cell below it.
df_clean.to_excel("data/out/arv.xlsx", index=False)

In [54]:
import pandas as pd
import unicodedata

def remove_accents(text):
    """Return ``text`` without diacritics.

    The string is decomposed with Unicode NFKD normalisation and every
    combining character is dropped; all other characters are kept (the result
    is not restricted to ASCII). Missing values and non-string inputs are
    returned unchanged.
    """
    if pd.isna(text) or not isinstance(text, str):
        return text
    decompose = unicodedata.normalize('NFKD', text)
    return ''.join(car for car in decompose if not unicodedata.combining(car))

def clean_text(text):
    """Normalise a text cell: strip accents, collapse whitespace, upper-case.

    Missing values and non-string inputs are returned unchanged. For strings,
    accents are removed with ``remove_accents``, leading/trailing whitespace is
    dropped, inner whitespace runs become a single space, and the result is
    converted to upper case.
    """
    if pd.isna(text) or not isinstance(text, str):
        return text

    sans_accents = remove_accents(text)
    mots = sans_accents.split()
    return ' '.join(mots).upper()

# Apply the text cleaning to every column except the first
def clean_dataframe(df):
    """Return a copy of ``df`` with ``clean_text`` applied to every column but the first.

    The input frame is not modified. The first column is left untouched; a
    frame with zero or one column is simply copied (the previous version read
    ``df.columns[0]`` into an unused variable, which raised IndexError on a
    frame without columns).

    Args:
        df: the DataFrame to clean.

    Returns:
        A new DataFrame with the same columns and index.
    """
    df_clean = df.copy()

    # Skip position 0: only the remaining columns are cleaned.
    for col in df.columns[1:]:
        df_clean[col] = df_clean[col].apply(clean_text)

    return df_clean

# Exemple d'utilisation:
# df_clean = clean_dataframe(df)
# print(df_clean.head())

# Si vous voulez voir les diff√©rences avant/apr√®s:
# print("Avant:")
# print(df.iloc[0, 1:5])
# print("\nApr√®s:")
# print(df_clean.iloc[0, 1:5])

In [55]:
# Clean the wide table (every column except the first) and preview the result
df_clean = clean_dataframe(df_final)
df_clean.head()

Unnamed: 0,ARV,DCI,ATC,n_meds,Med_1,DCI_Med1,ATC_Med1,Med_2,DCI_Med2,ATC_Med2,...,ATC_Med8,Med_9,DCI_Med9,ATC_Med9,Med_10,DCI_Med10,ATC_Med10,Med_11,DCI_Med11,ATC_Med11
0,Combivir + Kal√©tra,LAMIVUDINE; ZIDOVUDINE + LOPINAVIR; RITONAVIR,J05AR01 + J05AR10,2,COMBIVIR,LAMIVUDINE; ZIDOVUDINE,J05AR01,KALETRA,LOPINAVIR; RITONAVIR,J05AR10,...,,,,,,,,,,
1,Isentress + Truvada,RALTEGRAVIR + EMTRICITABINE; TENOFOVIR DISOPROXIL,J05AJ01 + J05AR03,2,ISENTRESS,RALTEGRAVIR,J05AJ01,TRUVADA,EMTRICITABINE; TENOFOVIR DISOPROXIL,J05AR03,...,,,,,,,,,,
2,Isentress + Kivexa,RALTEGRAVIR + ABACAVIR ; LAMIVUDINE,J05AJ01 + J05AR02,2,ISENTRESS,RALTEGRAVIR,J05AJ01,KIVEXA,ABACAVIR ; LAMIVUDINE,J05AR02,...,,,,,,,,,,
3,Triumeq,DOLUTEGRAVIR ; LAMIVUDINE; ABACAVIR,J05AR13,1,TRIUMEQ,DOLUTEGRAVIR ; LAMIVUDINE; ABACAVIR,J05AR13,,,,...,,,,,,,,,,
4,Genvoya,COBICISTAT; ELVITEGRAVIR; EMTRICITABINE; TENOF...,J05AR18,1,GENVOYA,COBICISTAT; ELVITEGRAVIR; EMTRICITABINE; TENOF...,J05AR18,,,,...,,,,,,,,,,
