In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's default styling to every matplotlib figure drawn in this notebook.
sns.set()

In [2]:
# Study population: how many people, and how many of them drink coffee.
POP_SIZE = 300
DRINKERS = 50
NON_DRINKERS = POP_SIZE - DRINKERS

# Number of cancer cases observed inside each coffee group.
CANCER_DRINKERS = 10
CANCER_NON_DRINKERS = 10

# 2x2 contingency table: rows are the outcome, columns are the exposure.
# Every column sums to the size of its group.
coffee_cancer = pd.DataFrame(
    [
        [CANCER_DRINKERS, CANCER_NON_DRINKERS],
        [DRINKERS - CANCER_DRINKERS, NON_DRINKERS - CANCER_NON_DRINKERS],
    ],
    index=['cancer', 'no_cancer'],
    columns=['coffee', 'no_coffee'],
)

coffee_cancer

Unnamed: 0,coffee,no_coffee
cancer,10,10
no_cancer,40,240


In [3]:
# Column totals are the group sizes (coffee drinkers / non-drinkers).
group_totals = coffee_cancer.sum()
# Dividing each column by its own total turns counts into within-group proportions.
incidence = coffee_cancer.div(group_totals, axis='columns')
# Transposed for display: one row per coffee group, one column per outcome.
incidence.T

Unnamed: 0,cancer,no_cancer
coffee,0.2,0.8
no_coffee,0.04,0.96


In [4]:
# Relative risk of coffee drinkers versus non-drinkers.
# shift(-1) moves the 'no_coffee' row up by one, so the 'coffee' row is divided
# by the 'no_coffee' row; the last row has nothing below it, becomes NaN and is
# removed by dropna().
#
# Reading the result:
#   - 'cancer' column: coffee drinkers carry 5 times the cancer risk of non-drinkers.
#   - 'no_cancer' column: coffee drinkers are only about 83% as likely as
#     non-drinkers to be cancer-free.
by_group = incidence.T
rr = by_group.div(by_group.shift(-1)).dropna()
rr

Unnamed: 0,cancer,no_cancer
coffee,5.0,0.833333


In [5]:
# Simulate one record per individual (coffee / smoker / cancer flags) whose
# coffee-by-cancer totals match the contingency table built from the constants
# POP_SIZE, DRINKERS, CANCER_DRINKERS and CANCER_NON_DRINKERS.
#
# The legacy global NumPy RNG is seeded here, and the draws below must stay in
# this exact order: every np.random.choice call advances the shared state, so
# reordering them would change which individuals are selected.
np.random.seed(888)

prop_smoke_drinkers = 0.25
prop_smoke_non_drinkers = 0.05


def _flag_random_members(frame, coffee_value, column, size):
    """Set `column` to 1 for `size` random rows of one coffee group, in place.

    Parameters
    ----------
    frame : pd.DataFrame
        Individual-level table with a 'coffee' column; modified in place.
    coffee_value : int
        Value of 'coffee' (1 = drinker, 0 = non-drinker) selecting the group.
    column : str
        Name of the flag column to set to 1.
    size : int
        Number of distinct rows to flag (drawn without replacement).
    """
    candidates = frame.loc[frame['coffee'] == coffee_value].index
    chosen = np.random.choice(candidates, replace=False, size=size)
    frame.loc[chosen, column] = 1


# Pick which DRINKERS individuals out of POP_SIZE drink coffee.
coffee = np.zeros(POP_SIZE).astype(int)
coffee[np.random.choice(np.arange(len(coffee)), replace=False, size=DRINKERS)] = 1

coffee_df = pd.DataFrame(coffee, columns=['coffee'])

coffee_df['smoker'] = 0
coffee_df['cancer'] = 0

# Smoking is more common among coffee drinkers (25%) than non-drinkers (5%).
# int() truncates a fractional head-count (e.g. 12.5 becomes 12).
_flag_random_members(coffee_df, 1, 'smoker', int(prop_smoke_drinkers * DRINKERS))
_flag_random_members(coffee_df, 0, 'smoker', int(prop_smoke_non_drinkers * NON_DRINKERS))

# Cancer cases are drawn within each coffee group without looking at smoking
# status. The sizes come from the shared constants rather than a literal 10, so
# the simulation cannot drift away from the contingency table.
_flag_random_members(coffee_df, 1, 'cancer', CANCER_DRINKERS)
_flag_random_members(coffee_df, 0, 'cancer', CANCER_NON_DRINKERS)

# Sanity checks: the simulated totals must reproduce the configured counts.
assert len(coffee_df) == POP_SIZE
assert coffee_df['coffee'].sum() == DRINKERS
assert coffee_df.loc[coffee_df['coffee'] == 1, 'cancer'].sum() == CANCER_DRINKERS
assert coffee_df.loc[coffee_df['coffee'] == 0, 'cancer'].sum() == CANCER_NON_DRINKERS

coffee_df

Unnamed: 0,coffee,smoker,cancer
0,0,0,0
1,0,0,0
2,1,0,0
3,0,0,0
4,0,0,0
...,...,...,...
295,0,0,0
296,0,0,0
297,1,0,0
298,0,0,0


In [6]:
# Coffee drinkers who were assigned cancer.
coffee_df[coffee_df['coffee'].eq(1) & coffee_df['cancer'].eq(1)]

Unnamed: 0,coffee,smoker,cancer
43,1,0,1
73,1,0,1
114,1,0,1
166,1,1,1
167,1,1,1
168,1,0,1
176,1,1,1
185,1,0,1
202,1,0,1
207,1,0,1


In [7]:
# Non-coffee drinkers who were assigned cancer.
coffee_df[coffee_df['coffee'].eq(0) & coffee_df['cancer'].eq(1)]

Unnamed: 0,coffee,smoker,cancer
10,0,0,1
11,0,1,1
29,0,0,1
97,0,0,1
113,0,0,1
195,0,0,1
216,0,0,1
243,0,0,1
287,0,0,1
293,0,0,1


In [8]:
# Shorthand for building MultiIndex slices; reused by later cells.
idx = pd.IndexSlice

# For every (coffee, smoker) combination: 'sum' is the number of cancer cases
# (the flag is 0/1) and 'count' is the number of individuals in the group.
smoker_desc = coffee_df.groupby(['coffee', 'smoker']).agg(['sum', 'count'])

# Incidence = cases / group size, stored as an extra top-level column.
cases_per_group = smoker_desc[('cancer', 'sum')]
members_per_group = smoker_desc[('cancer', 'count')]
smoker_desc['incidence'] = cases_per_group / members_per_group
smoker_desc

Unnamed: 0_level_0,Unnamed: 1_level_0,cancer,cancer,incidence
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
coffee,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,9,238,0.037815
0,1,1,12,0.083333
1,0,7,38,0.184211
1,1,3,12,0.25


In [9]:
# Group sizes of the non-smokers (smoker == 0) in both coffee groups; used as
# weights when averaging their incidences.
non_smoker_rows = idx[:, 0]
non_smoker_weights = smoker_desc.loc[non_smoker_rows, ('cancer', 'count')]
non_smoker_weights

coffee  smoker
0       0         238
1       0          38
Name: (cancer, count), dtype: int64

In [10]:
# Cancer incidence of the non-smokers (smoker == 0), one value per coffee group.
non_smoker_rows = idx[:, 0]
non_smoker_incidence = smoker_desc.loc[non_smoker_rows, 'incidence']
non_smoker_incidence

coffee  smoker
0       0         0.037815
1       0         0.184211
Name: incidence, dtype: float64

In [11]:
# Size-weighted mean of the two non-smoker incidences, i.e. the overall cancer
# incidence among non-smokers.
non_smoker_weighted_avg = np.average(
    a=non_smoker_incidence,
    weights=non_smoker_weights,
)
non_smoker_weighted_avg

0.057971014492753624

In [12]:
# Group sizes of the smokers (smoker == 1) in both coffee groups; used as
# weights when averaging their incidences.
smoker_rows = idx[:, 1]
smoker_weights = smoker_desc.loc[smoker_rows, ('cancer', 'count')]
smoker_weights

coffee  smoker
0       1         12
1       1         12
Name: (cancer, count), dtype: int64

In [13]:
# Cancer incidence of the smokers (smoker == 1), one value per coffee group.
smoker_rows = idx[:, 1]
smoker_incidence = smoker_desc.loc[smoker_rows, 'incidence']
smoker_incidence

coffee  smoker
0       1         0.083333
1       1         0.250000
Name: incidence, dtype: float64

In [14]:
# Size-weighted mean of the two smoker incidences, i.e. the overall cancer
# incidence among smokers.
smoker_weighted_avg = np.average(
    a=smoker_incidence,
    weights=smoker_weights,
)
smoker_weighted_avg

0.16666666666666666

In [15]:
# Aggregate by smoking status. Because every flag is 0/1, 'sum' is the number
# of positives (coffee drinkers / cancer cases) and 'count' is the number of
# individuals in the group.
by_smoking_status = coffee_df.groupby('smoker')
coffee_desc = by_smoking_status.agg(['sum', 'count'])
coffee_desc

Unnamed: 0_level_0,coffee,coffee,cancer,cancer
Unnamed: 0_level_1,sum,count,sum,count
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,38,276,16,276
1,12,24,4,24


In [16]:
# Add an 'incidence' sub-column (positives / group size) under both top-level
# columns. 'coffee' is handled before 'cancer' so the new columns are appended
# in that order.
for trait in ('coffee', 'cancer'):
    positives = coffee_desc.loc[:, idx[trait, 'sum']]
    totals = coffee_desc.loc[:, idx[trait, 'count']]
    coffee_desc.loc[:, idx[trait, 'incidence']] = positives / totals

# Display the cancer incidence per smoking status.
coffee_desc.loc[:, idx['cancer', 'sum']] / coffee_desc.loc[:, idx['cancer', 'count']]

smoker
0    0.057971
1    0.166667
dtype: float64

In [17]:
# Show the per-smoking-status table again, now that the previous cell has added
# the ('coffee', 'incidence') and ('cancer', 'incidence') columns.
coffee_desc

Unnamed: 0_level_0,coffee,coffee,cancer,cancer,coffee,cancer
Unnamed: 0_level_1,sum,count,sum,count,incidence,incidence
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,38,276,16,276,0.137681,0.057971
1,12,24,4,24,0.5,0.166667


In [18]:
# Relative risk of cancer for smokers (index 1) versus non-smokers (index 0).
cancer_incidence_by_smoking = coffee_desc[('cancer', 'incidence')]
smoker_cancer_rr = cancer_incidence_by_smoking.loc[1] / cancer_incidence_by_smoking.loc[0]
smoker_cancer_rr

2.875

In [19]:
# Pairwise Pearson correlation (the pandas default) between the three 0/1 flags.
coffee_df.corr(method='pearson')

Unnamed: 0,coffee,smoker,cancer
coffee,1.0,0.263752,0.239046
smoker,0.263752,1.0,0.118217
cancer,0.239046,0.118217,1.0


In [20]:
# Head-counts by smoking status, taken straight from the individual records.
# The flag is 0/1, so its sum is the number of smokers.
smokers = coffee_df['smoker'].sum()
print(smokers)

is_non_smoker = coffee_df['smoker'].eq(0)
non_smokers = coffee_df.loc[is_non_smoker, 'smoker'].count()
non_smokers

24


276

In [21]:
# Number of individuals who both smoke and have cancer.
smokes_and_has_cancer = coffee_df['smoker'].eq(1) & coffee_df['cancer'].eq(1)
smoker_w_cancer = int(smokes_and_has_cancer.sum())
smoker_w_cancer

4

In [22]:
# Number of individuals who do not smoke but have cancer.
abstains_and_has_cancer = coffee_df['smoker'].eq(0) & coffee_df['cancer'].eq(1)
non_smoker_w_cancer = int(abstains_and_has_cancer.sum())
non_smoker_w_cancer

16

In [23]:
# Cancer incidence among smokers from raw counts: smokers with cancer / all smokers.
# NOTE: this rebinds `smoker_incidence`, which an earlier cell assigned as a
# per-coffee-group Series sliced from smoker_desc; from here on it is a scalar.
smoker_incidence = smoker_w_cancer / smokers
smoker_incidence

0.16666666666666666

In [24]:
# Cancer incidence among non-smokers from raw counts: non-smokers with cancer / all non-smokers.
# NOTE: this rebinds `non_smoker_incidence`, which an earlier cell assigned as a
# per-coffee-group Series sliced from smoker_desc; from here on it is a scalar.
non_smoker_incidence = non_smoker_w_cancer / non_smokers
non_smoker_incidence

0.057971014492753624

In [25]:
# Relative risk of cancer for smokers versus non-smokers, from the two scalar incidences.
# NOTE: this rebinds `rr`, which an earlier cell assigned as the coffee-vs-no-coffee
# relative-risk DataFrame; from here on it is a scalar.
rr = smoker_incidence / non_smoker_incidence
rr

2.875

In [26]:
# Among the individuals with cancer: how many drink coffee, how many smoke, and
# how many there are in total (the 'cancer' entry).
cancer_rows = coffee_df[coffee_df['cancer'].eq(1)]
cancer_rows.sum()

coffee    10
smoker     4
cancer    20
dtype: int64

In [27]:
# NOTE(review): this import sits here rather than in the notebook's first import cell.
from graphviz import Digraph

# Causal diagram of the naive reading: Coffee -> Cancer, drawn in red and labelled 'x'.
dot = Digraph(format='jpg')
dot.edge('Coffee', 'Cancer', label='x', color='red')

# Writes the rendered image to disk; the returned file path is the cell's output.
dot.render(filename='coffee_cancer', renderer='cairo')


'coffee_cancer.cairo.jpg'

In [28]:
# Extend the diagram from the previous cell with the smoking pathway:
# Smoking -> Cancer and Coffee -> Smoking.
# NOTE(review): this mutates the existing `dot` object, so re-running the cell
# presumably adds the same edges a second time -- confirm before re-executing.
for tail, head in (('Smoking', 'Cancer'), ('Coffee', 'Smoking')):
    dot.edge(tail, head)

dot.render('coffee_smoking_cancer', renderer='cairo')
dot.view()

'coffee_smoking_cancer.jpg'