In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import zipfile

In [2]:
# Load the Open Medic 2019 extract: a zipped, semicolon-separated CSV
# encoded in Latin-1 whose first row holds the column names.
data = pd.read_csv("../data/OPEN_MEDIC_2019.zip", header=0, sep=';', encoding="latin_1")

# Remove the drug labels (only the codes are kept).
# errors="ignore" skips label columns that are absent, so one missing column
# no longer prevents the others from being removed, and unrelated errors are
# no longer silently swallowed by a bare `except`.
label_columns = ['l_ATC1', 'L_ATC2', 'L_ATC3', 'L_ATC4', 'L_ATC5', 'l_cip13']
data = data.drop(columns=label_columns, errors="ignore")

In [3]:
# Preview the first rows of the loaded table
data.head()

Unnamed: 0,ATC1,ATC2,ATC3,ATC4,ATC5,CIP13,TOP_GEN,GEN_NUM,age,sexe,BEN_REG,PSP_SPE,BOITES,REM,BSE
0,A,A01,A01A,A01AA,A01AA01,3400931911999,0,0,0,1,5,1,113,5331,19153
1,A,A01,A01A,A01AA,A01AA01,3400931911999,0,0,0,1,5,12,152,7760,25868
2,A,A01,A01A,A01AA,A01AA01,3400931911999,0,0,0,1,5,90,74,3350,12672
3,A,A01,A01A,A01AA,A01AA01,3400931911999,0,0,0,1,5,98,13,656,2187
4,A,A01,A01A,A01AA,A01AA01,3400931911999,0,0,0,1,11,1,106,3947,13674


In [4]:
# Drop the columns that are not needed for the aggregation below.
# errors="ignore" makes the cell safe to re-run and removes every column that
# is present, instead of stopping at the first missing one and hiding the
# error behind a bare `except`.
unused_columns = ['ATC1', 'ATC2', 'ATC3', 'ATC4', 'TOP_GEN', 'GEN_NUM', 'REM', 'BSE']
data = data.drop(columns=unused_columns, errors="ignore")


## The following lines keep only the meaningful dimensions:
## the number of boxes (BOITES) is summed per age, sex, region (RR),
## ATC5 code and CIP13 code; every other column is aggregated away.
drugs = (
    data.rename(columns={"sexe": "sex", "BEN_REG": "RR"})
    .groupby(["age", "sex", "RR", "ATC5", "CIP13"])["BOITES"]
    .sum()
    .reset_index()  # turn the grouping keys back into regular columns
)

In [5]:
## drugcounts holds the total number of deliveries per age, region and sex (whatever the drug)

# Keep only strictly positive box counts (negative and zero entries are dropped)
positive_boxes = drugs['BOITES'] > 0
drugs = drugs.loc[positive_boxes]

# Total deliveries for each (age, region, sex) group, flattened back into a
# plain dataframe whose total column is named 'count'
group_totals = drugs.groupby(["age", "RR", "sex"])["BOITES"].sum()
drugcounts = group_totals.reset_index().rename(columns={'BOITES': 'count'})

## Relative drug delivery frequencies

In [6]:
# Attach each group's total to its drug rows, then express every drug's
# deliveries as a share `p` of the total for its (age, region, sex) group
drug_freq = drugs.merge(drugcounts, how='left', on=['age', 'RR', 'sex'])
drug_freq = drug_freq.assign(p=drug_freq['BOITES'] / drug_freq['count'])
drug_freq

Unnamed: 0,age,sex,RR,ATC5,CIP13,BOITES,count,p
0,0,1,0,A02BC05,3400938673081,32,5823,0.005495
1,0,1,0,A02X,3400931923077,31,5823,0.005324
2,0,1,0,A03AA05,3400934104831,37,5823,0.006354
3,0,1,0,A03AX12,3400930986080,13,5823,0.002233
4,0,1,0,A03AX12,3400931863014,51,5823,0.008758
...,...,...,...,...,...,...,...,...
560603,99,9,99,V08CA04,3400935082886,14,193624,0.000072
560604,99,9,99,V08CA04,3400935082947,11,193624,0.000057
560605,99,9,99,V08CA08,3400938879728,29,193624,0.000150
560606,99,9,99,V08CA09,3400936080218,16,193624,0.000083


In [7]:
# Save the frequency table for later reuse
drug_freq[['age','sex','ATC5','RR','CIP13','p']].to_csv("drugs_freq.csv")

## Mean number of deliveries per patient

In [8]:
# Region codes (RR) present in the drug deliveries table:
#0 and 99: unknown
#5: overseas regions and departments
#93: PACA and Corsica
#99: ??
drugs['RR'].unique()

array([ 0,  5, 11, 24, 27, 28, 32, 44, 52, 53, 75, 76, 84, 93, 99])

In [9]:
# Load the population table and list the region codes (RR) it contains:
#41, 43, 91, 21 and 22: (fictitious regions) empty population
#93: PACA
#94: Corsica
pop_saq = pd.read_csv("pop.csv")
pop_saq['RR'].unique()


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


array([84, 32, 22, 93, 44, 21, 76, 28, 75, 24, 94, 27, 53, 43, 91, 52, 41,
       11,  1,  2,  3,  4])

In [10]:
# Region-code lookup: 94 is folded into 93 and 1-4 are folded into 0; every
# other listed code maps to itself.
# NOTE(review): presumably keys are population-table codes (pop_saq['RR']) and
# values are drug-table codes (drugs['RR']) -- confirm. This dict is not
# referenced by the cells visible here.
mapping = {84:84, 32:32, 93:93, 44:44, 76:76, 28:28, 75:75, 24:24, 94:93, 27:27, 53:53, 52:52, 11:11,  1:0,  2:0,  3:0,  4:0}

In [11]:
# Mean number of boxes delivered per inhabitant for every
# (age bracket, population region, sex) combination.
#
# Result: `mean_deliveries`, one row per combination with the columns
#   age  - lower bound of the age bracket
#   RR   - region code as found in the population table
#   sex  - sex code
#   mean - total boxes / population (0.0 when the population is empty)
regions = [84, 32, 93, 44, 76, 28, 75, 24, 94, 27, 53, 52, 11,  1,  2,  3,  4]
sexes = [1, 2]
ages = [(0, 20), (20, 60), (60, 95), (95, 150)]  # [lower, upper) bounds

# Population regions that share one region code in the drug table:
# 94 is counted under 93, and 1-4 are counted under 0.
delivery_region = {94: 93, 1: 0, 2: 0, 3: 0, 4: 0}
# Population regions whose populations must be summed for a drug-table region.
pooled_population_regions = {93: [93, 94], 0: [1, 2, 3, 4]}

# Total boxes per (age code, sex, region) computed once, instead of
# re-filtering the whole delivery table for every combination.
boxes_by_group = drug_freq.groupby(["age", "sex", "RR"])["BOITES"].sum().to_dict()

# The rows are collected as records and the dataframe is built once at the end.
# The previous chained assignment `mean_deliveries.loc[...]['mean'] = value`
# wrote into an intermediate row object, which leaves every mean at 0 when
# pandas returns a copy (always the case under Copy-on-Write).
records = []
for age in ages:
    lower, upper = age
    # NOTE(review): the 95+ bracket is matched against age code 99 of the drug
    # table, and the 60-94 population against age code 60 -- confirm against
    # the data dictionary that 99 is not an "unknown age" code.
    age_code = 99 if lower == 95 else lower
    in_bracket = (pop_saq["age"] >= lower) & (pop_saq["age"] < upper)
    for r in regions:
        rd = delivery_region.get(r, r)
        population_regions = pooled_population_regions.get(rd, [r])
        in_region = pop_saq["RR"].isin(population_regions)
        for s in sexes:
            cpop = pop_saq.loc[in_region & in_bracket & (pop_saq["sex"] == s), "pop"].sum()
            if cpop != 0:
                # Groups without any delivery count as 0 boxes.
                mean = boxes_by_group.get((age_code, s, rd), 0) / cpop
            else:
                # No population: keep the default of 0 rather than dividing by zero.
                mean = 0.0
            records.append({"age": lower, "RR": r, "sex": s, "mean": mean})

mean_deliveries = pd.DataFrame(records, columns=["age", "RR", "sex", "mean"])

In [12]:
# Save the mean deliveries table for later reuse
mean_deliveries.to_csv("mean_deliveries.csv")