In [6]:
import numpy as np 
import pandas as pd

# ____________________

In [8]:
data =pd.read_excel('data/ARV.xlsx')

In [10]:
data

Unnamed: 0,ARV
0,Combivir + Kalétra
1,Isentress + Truvada
2,Isentress + Kivexa
3,Triumeq
4,Genvoya
...,...
19215,Fuzeon + Trizivir + Viread + TMC 114
19216,Norvir + Truvada + Ziagen + TMC 114 (Darunavir...
19217,Norvir + Prezista + Truvada + Ziagen + TMC 125...
19218,Norvir + Prezista + Truvada + Ziagen + Maravir...


In [12]:
df=data.copy() 

In [26]:
# ---- 2. Nettoyage de base : enlever espaces inutiles ----
df["ARV"] = df["ARV"].str.strip()

In [28]:
# ---- 3. Séparer les médicaments par "+" ----
df["drugs_list"] = df["ARV"].str.split("\+")

In [30]:
# ---- 4. Nettoyer chaque nom (trim, minuscules, enlever espaces doubles) ----
df["drugs_list"] = df["drugs_list"].apply(lambda lst: [d.strip() for d in lst])

In [32]:
# ---- 5. Exploser les lignes pour avoir une liste à plat ----
all_drugs = df["drugs_list"].explode()

In [34]:
# ---- 6. Nombre total de médicaments (après séparation) ----
total_meds = len(all_drugs)

In [36]:
total_meds

76156

In [38]:
# ---- 7. Nombre de médicaments uniques ----
unique_meds = all_drugs.nunique()

In [40]:
unique_meds

248

In [42]:
# ---- 8. Fréquence des médicaments ----
drug_freq = all_drugs.value_counts()
drug_pct = all_drugs.value_counts(normalize=True) * 100

drug_freq_df = pd.DataFrame({
    "frequence": drug_freq,
    "pourcentage": drug_pct.round(2)
})

In [46]:
drug_freq_df[:20]

Unnamed: 0_level_0,frequence,pourcentage
drugs_list,Unnamed: 1_level_1,Unnamed: 2_level_1
Norvir,7783,10.22
Videx,4512,5.92
Epivir,4339,5.7
Viread,3938,5.17
Ziagen,3537,4.64
Kalétra,3334,4.38
Zerit,2511,3.3
Sustiva,2206,2.9
Invirase,2116,2.78
Fuzeon,2015,2.65


In [50]:
# ---- 9. Fréquence des combinaisons EXACTES ----
combination_freq = df["ARV"].value_counts().reset_index()
combination_freq.columns = ["ARV", "frequence"]

In [52]:
combination_freq

Unnamed: 0,ARV,frequence
0,Combivir + Kalétra,1
1,Intelence + Isentress + Norvir + Telzir + Téno...,1
2,Epivir + Fuzeon + Kalétra + Viracept + Ziagen ...,1
3,Epivir + Norvir + Zerit + Ziagen + Tipranavir,1
4,Agenerase + Epivir + Kalétra + Viracept + Zerit,1
...,...,...
19215,Celsentri + Isentress + Norvir + Telzir,1
19216,Norvir + Reyataz + Telzir + Ténofovir (Viread),1
19217,Agenerase + Emtricitabine / Ténofovir (Truvada...,1
19218,Lamivudine (Epivir) + Raltégravir (Isentress),1


In [14]:
mot = "Zerit"

mask = df.astype(str).apply(lambda col: col.str.contains(mot, case=False, na=False))
resultat = df[mask.any(axis=1)]

resultat


Unnamed: 0,ARV
10,Zerit + Epivir + Viracept
12,Epivir + Viracept + Zerit
20,Epivir + Invirase + Zerit
45,Crixivan + Norvir + Videx + Zerit
63,ZERIT + VIDEX
...,...
19187,Crixivan + Epivir + Norvir + Sustiva + Videx +...
19188,Epivir + Invirase + Norvir + Sustiva + Videx +...
19192,Epivir + Hivid + Videx + Viracept + Zerit
19200,Epivir + Fuzeon + Viramune + Zerit


In [119]:
meds_with_parenthesis = [med for med in all_meds if '(' in df]
meds_with_parenthesis

['etravirine (intelence)',
 'tld (tdf/3td/dtg)',
 'lamivudine / tenofovir disoproxil / doravirine (delstrigo)',
 'tmc 278 (rilpivirine)',
 'vocabria (cabotegravir)',
 'emtricitabine / tenofovir disoproxil (truvada)',
 'tenofovir (viread)',
 'abacavir / lamivudine / dolutegravir (triumeq)',
 'enfurvirtide (fuzeon)',
 'lopinavir / ritonavir (kaletra)',
 'lamivudine / dolutegravir (dovato)',
 'lenacapavir (sunlenca)',
 'gsk1265744 la (cabotegravir)',
 'tenofovir disoproxil (viread)',
 'zidovudine (retrovir)',
 'tenofovir disoproxil / lamivudine / dolutegravir (tld)',
 'nelfinavir (viracept)',
 'raltegravir (isentress)',
 'ibalizumab (trogarzo)',
 'emtricitabine / tenofovir disoproxil / elvitegravir / cobicistat (stribild)',
 'rilpivirine (rekambys)',
 'abacavir / lamivudine (kivexa)',
 'abacavir (ziagen)',
 'quad taf ( gs 9350)',
 'atazanavir (reyataz)',
 'ritonavir (norvir)',
 'tmc278 la (rilpivirine)',
 'emtricitabine / tenofovir (truvada)',
 'saquinavir (fortovase)',
 'emtricitabine / 

In [121]:
len(meds_with_parenthesis)

81