In [1]:
import pandas as pd
import numpy as np

## Load population information

In [5]:
pop_saq=pd.read_csv("pop.csv", dtype={"dpt":'str'})

pop_tot = np.sum(pop_saq["pop"])
print("Population totale: ", pop_tot)

print("population per Age, sex and dpt")
pop_ASD=pop_saq.groupby(["age","sex",'dpt']).agg({"pop":"sum"})
pop_ASD= pop_ASD.reset_index() #transform the group object into a dataframe
pop_ASD.columns = pop_ASD.columns.get_level_values(0)
pop_ASD['pop']=pop_ASD['pop']/pop_tot
pop_ASD['sex']=pop_ASD['sex'].astype("int64")
pop_ASD.head()

Population totale:  66359521.99452544
population per Age, sex and dpt


Unnamed: 0,age,sex,dpt,pop
0,0,1,1,0.000309
1,0,1,2,0.000241
2,0,1,3,0.000117
3,0,1,4,6.1e-05
4,0,1,5,5.6e-05


## Load ALD data

In [6]:
#counts of ALD per departement
ald_per_dpt=pd.read_excel('../data/count_ALD.xls', sheet_name='dpt')
ald_per_dpt=pd.melt(ald_per_dpt,id_vars=['dpt'])
ald_per_dpt.rename(columns={'variable':'ALD'},inplace=True)

In [8]:
#estimate the incidence of each ALD in the general population
ald=ald_per_dpt.groupby(["ALD"]).agg({"value":["sum"]})
ald= ald.reset_index()
ald.columns = ald.columns.get_level_values(0)
ald.set_index("ALD", inplace=True)
ald.head()

ald['p']=ald['value']/pop_tot
ald.reset_index(inplace=True)
ald.head()

Unnamed: 0,ALD,value,p
0,ALD1,428490,0.006457
1,ALD10,18645,0.000281
2,ALD11,37415,0.000564
3,ALD12,565260,0.008518
4,ALD13,1170940,0.017645


In [10]:
#incidence of each ALD in each dpt
pop_D=pop_ASD.groupby(['dpt']).agg({"pop":"sum"})
pop_D= pop_D.reset_index()
pop_D.columns = pop_D.columns.get_level_values(0)
pALD_knowing_dpt = pd.merge(ald_per_dpt,pop_D,how="inner",on="dpt")
pALD_knowing_dpt['p'] = pALD_knowing_dpt['value']/pALD_knowing_dpt['pop']
pALD_knowing_dpt = pALD_knowing_dpt[['dpt','ALD','p']]
pALD_knowing_dpt.head()

Unnamed: 0,dpt,ALD,p
0,1,ALD1,359621.272119
1,1,ALD2,19747.988931
2,1,ALD3,409510.928367
3,1,ALD4,519.683919
4,1,ALD5,854360.363242


In [12]:
#counts of ALD per sex/age
ald_per_sexage=pd.read_excel('../data/count_ALD.xls', sheet_name='sexe-age')
ald_per_sexage=pd.melt(ald_per_sexage,id_vars=['Ald','Sexe'])
ald_per_sexage.rename(columns={'Ald':'ALD', 'variable':"age", 'Sexe':'sex'},inplace=True)
ald_per_sexage['age']=ald_per_sexage['age'].astype("int64")

In [13]:
#incidence of each ALD for each sex/age
pop_AS=pop_ASD.groupby(['sex','age']).agg({"pop":"sum"})
pop_AS= pop_AS.reset_index()
pop_AS.columns = pop_AS.columns.get_level_values(0)
#pop_AS['sex']=pop_AS['sex'].astype("int64")
pALD_knowing_sexage = pd.merge(ald_per_sexage, pop_AS, how="inner", on=["sex","age"])
pALD_knowing_sexage['p'] = pALD_knowing_sexage['value']/pALD_knowing_sexage['pop']
pALD_knowing_sexage = pALD_knowing_sexage[['sex','age','ALD','p']]
pALD_knowing_sexage.head()

Unnamed: 0,sex,age,ALD,p
0,1,0,ALD1,24252.294626
1,1,0,ALD2,3368.374254
2,1,0,ALD3,1684.187127
3,1,0,ALD4,168.418713
4,1,0,ALD5,149555.81686


## Estimation de $p(ALD|DS)$

We want to estimate probability of $ALD$ knowning the age, sex and departement.

we have that 
$p(ALD|D,S,A) = \frac{p(D,S,A|ALD)\times p(ALD)}{p(D,S,A)}$

then, to estimate $p(D,S,A|ALD)$, we have to rebuild this joint distribution from the marginal distribution $p(D|ALD)$ and $p(S,A|ALD)$.

### Estimation of $p(D|ALD) = \frac{p(D,ALD)}{p(ALD)}$

In [14]:
p_d_ald=pd.merge(ald_per_dpt,ald, how="inner", on='ALD')
p_d_ald['p']=p_d_ald['value_x']/p_d_ald['value_y']
p_d_ald=p_d_ald[['dpt','ALD','p']]

In [15]:
p_d_ald.head()

Unnamed: 0,dpt,ALD,p
0,1,ALD1,0.008075
1,2,ALD1,0.007538
2,3,ALD1,0.007188
3,4,ALD1,0.002987
4,5,ALD1,0.002357


### Estimation of $p(S,A|ALD) = \frac{p(S,A,ALD)}{p(ALD)}$


In [16]:
p_sa_ald=pd.merge(ald_per_sexage,ald, how="inner", on='ALD')
p_sa_ald['p']=p_sa_ald['value_x']/p_sa_ald['value_y']
p_sa_ald=p_sa_ald[['sex','age','ALD','p']]

In [17]:
p_sa_ald

Unnamed: 0,sex,age,ALD,p
0,1,0,ALD1,0.001680
1,2,0,ALD1,0.001144
2,1,5,ALD1,0.001984
3,2,5,ALD1,0.001377
4,1,10,ALD1,0.001867
...,...,...,...,...
1195,2,85,ALD30,0.036427
1196,1,90,ALD30,0.011939
1197,2,90,ALD30,0.017589
1198,1,95,ALD30,0.002374


## Estimation of the join distribution

Here ... I suppose idependency between $D$ and $S, A$. It make the estimation of the join probability simple. More complex solutions can be found in "Fast and Flexible Inference of Join Distributions from their Marginals", Frogner & Poggio, ICML, 2019

In [18]:
p_dsa_ald=pd.merge(p_sa_ald,p_d_ald,how="inner",on="ALD")
p_dsa_ald['p']=p_dsa_ald['p_x']*p_dsa_ald['p_y']
p_dsa_ald=p_dsa_ald[['sex','age','ALD','dpt', 'p']]

In [19]:
p_dsa_ald

Unnamed: 0,sex,age,ALD,dpt,p
0,1,0,ALD1,01,1.356836e-05
1,1,0,ALD1,02,1.266641e-05
2,1,0,ALD1,03,1.207819e-05
3,1,0,ALD1,04,5.019507e-06
4,1,0,ALD1,05,3.960705e-06
...,...,...,...,...,...
121195,2,95,ALD30,971,2.490709e-05
121196,2,95,ALD30,972,2.235128e-05
121197,2,95,ALD30,973,5.087507e-06
121198,2,95,ALD30,974,2.924714e-05


In [20]:
#verification
np.sum(p_dsa_ald[p_dsa_ald['ALD']=="ALD10"]['p'])

0.9986591579511934


Now we can compute the following distribution
$p(ALD|D,S,A) = \frac{p(D,S,A|ALD)\times p(ALD)}{p(D,S,A)}$



In [23]:
P=pd.merge(p_dsa_ald,ald[['ALD','p']],how="inner",on="ALD")
P.rename(columns={'p_x':'p_dsa_ald','p_y':'p_ald'},inplace=True)
P=pd.merge(P,pop_ASD,how="inner",on=["sex","age","dpt"])
P.rename(columns={'pop':'p_dsa'},inplace=True)
P['p'] = P['p_dsa_ald']*P['p_ald']/P['p_dsa']
P=P[['sex','age','dpt','ALD','p']]
P

Unnamed: 0,sex,age,dpt,ALD,p
0,1,0,01,ALD1,0.000284
1,1,0,01,ALD2,0.000037
2,1,0,01,ALD3,0.000018
3,1,0,01,ALD4,0.000002
4,1,0,01,ALD5,0.001667
...,...,...,...,...,...
119995,2,95,974,ALD26,0.000477
119996,2,95,974,ALD27,0.000485
119997,2,95,974,ALD28,0.000169
119998,2,95,974,ALD29,0.000420


In [29]:
P.to_csv("ALD_p.csv")

In [24]:
#Vérification des marginales (sur les sex/age)
sex=1
age=40
ALD='ALD6'
print( np.sum(p_dsa_ald[(p_dsa_ald['sex']==sex) & (p_dsa_ald['age']==age) & (p_dsa_ald['ALD']==ALD)]['p']) )
print( float(p_sa_ald[(p_sa_ald['sex']==sex) & (p_sa_ald['age']==age) & (p_sa_ald['ALD']==ALD)]['p']) )

0.034604039028817785
0.03460403902881779


In [25]:
#Vérification des marginales (sur les départements)
dpt='90'
ALD="ALD7"
print( np.sum(p_dsa_ald[(p_dsa_ald['dpt']==dpt) & (p_dsa_ald['ALD']==ALD)]['p']) )
print( float(p_d_ald[(p_d_ald['dpt']==dpt) & (p_d_ald['ALD']==ALD)]['p']) )

0.000947989786630205
0.0009481941211964486


In [26]:
#Generation d'une liste d'ALD pour un profil patient donné
dpt='35'
age=75
sex=1
pALD=P[(P['dpt']==dpt) & (P['age']==age)& (P['sex']==sex)][["ALD",'p']]
list(pALD[pALD['p']>=np.random.rand(30)]['ALD'])

[]

In [27]:
pALD[pALD['p']>=np.random.rand(1)[0]]['ALD']


Series([], Name: ALD, dtype: object)

In [28]:
dpt='974'
age=95
sex=2
P[(P['dpt']==dpt) & (P['age']==age)& (P['sex']==sex)]

Unnamed: 0,sex,age,dpt,ALD,p
119970,2,95,974,ALD1,0.129731
119971,2,95,974,ALD2,0.003697
119972,2,95,974,ALD3,0.066372
119973,2,95,974,ALD4,6.5e-05
119974,2,95,974,ALD5,0.219907
119975,2,95,974,ALD6,0.001189
119976,2,95,974,ALD7,0.000105
119977,2,95,974,ALD8,0.281285
119978,2,95,974,ALD9,0.018312
119979,2,95,974,ALD10,0.000377
