In [1]:
import math
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.stats import norm

# Chapter 1: Introduction to Data

## 1.1 Case Study: using stents to prevent strokes

In [2]:
# retrieve datasets
stent30 = pd.read_csv(
    'OI_Resources/stent30.csv',
    header=0,
    names=['group', '0_to_30_days']
)
stent365 = pd.read_csv(
    'OI_Resources/stent365.csv',
    header=0,
    names= ['group', '0_to_365_days']
)

# create stents dataset
stents = pd.concat(
    [stent30, stent365['0_to_365_days']],
    axis='columns'
)

# dataset rows and columns
print(stent30.shape, stent365.shape, stents.shape)
# sample 8 records
display(stents.sample(8))
# patient outcomes for treatment groups
# display(stents[['group', '0_to_30_days']].value_counts())
# display(stents[['group', '0_to_365_days']].value_counts())

(451, 2) (451, 2) (451, 3)


Unnamed: 0,group,0_to_30_days,0_to_365_days
257,control,no event,no event
418,control,no event,no event
171,treatment,no event,no event
328,control,no event,no event
121,treatment,no event,no event
410,control,no event,no event
143,treatment,no event,no event
335,control,no event,no event


In [3]:
# patient outcomes for treatment groups
month_outcomes = pd.crosstab(stents['group'], stents['0_to_30_days'])
year_outcomes = pd.crosstab(stents['group'], stents['0_to_365_days'])
display(month_outcomes)
display(year_outcomes)

0_to_30_days,no event,stroke
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,214,13
treatment,191,33


0_to_365_days,no event,stroke
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,199,28
treatment,179,45


**Summary statistic**: A single number summarizing a large amount of data.

In [4]:
# proportion of patients in control group who had stroke within 1 year
year_control_stroke_prop = round((year_outcomes.loc['control', 'stroke']\
/ year_outcomes.loc['control'].sum())*100, 2)
# proportion of patients in treatment group who had stroke within 1 year
year_treatment_stroke_prop = round((year_outcomes.loc['treatment', 'stroke']\
/ year_outcomes.loc['treatment'].sum())*100, 2)

print('Proportion of strokes in 1-year control group:', f'{year_control_stroke_prop}%')
print('Proportion of strokes in 1-year treatment group:', f'{year_treatment_stroke_prop}%')

# difference in proportions (treatment - control)
year_stroke_prop_difference = year_treatment_stroke_prop - year_control_stroke_prop

print('Difference in stroke proportions:',\
    f'{abs(year_stroke_prop_difference)}%',\
    'more strokes in',\
    'treatment' if year_stroke_prop_difference >= 0 else 'control',
    'group'
)

Proportion of strokes in 1-year control group: 12.33%
Proportion of strokes in 1-year treatment group: 20.09%
Difference in stroke proportions: 7.76% more strokes in treatment group


## 1.1 Exercises

# Chapter 2: Summarizing Data

# Chapter 3: Probability

# Chapter 4: Distributions of Random Variables

In [5]:
# probability of finding first success in 
# Nth trial if
# P is probability of success and
# 1-P is probability of failure
def geometric_distribution(p:float, n:int):
    from math import sqrt

    probability = p * (1-p)**(n-1)
    mean = 1 / p
    variance = (1-p)/(p**2)
    std = math.sqrt(variance)

    return probability, mean, variance, std

In [6]:
# GP 4.23
print(round(1/0.7, 2))

1.43


In [7]:
prob = 0.7
p_list = []
for i in range(1, 4):
    p, mu, sigma, std = geometric_distribution(prob, i)
    p_list.append(p)
print(sum(p_list))
p_complement = (1-prob)**3
print(1-p_complement)

0.973
0.973


In [8]:
# 4.11a
prob = 0.25
mu = 1 / prob
std = math.sqrt((1-prob)/(prob**2))
print(prob, mu, std)

0.25 4.0 3.4641016151377544


In [9]:
#4.11b
prob = 1/6
mu = 1 / prob
std = math.sqrt((1-prob)/(prob**2))
print(prob, mu, std)

0.16666666666666666 6.0 5.477225575051661


In [10]:
#4.12a
with_replacement = (5/10)**2
without_replacement = (5/10) * (4/9)

print(with_replacement)
print(without_replacement)

0.25
0.2222222222222222


In [11]:
#4.12b
with_replacement = (5000/10000)**2
without_replacement = (5000/10000) * (4999/9999)

print(with_replacement)
print(without_replacement)

0.25
0.24997499749974997


In [12]:
#4.13a
blue_eyes = 0.125
brown_eyes = 0.75
green_eyes = 0.125

n_children = 3

prob = (1-blue_eyes)**(n_children-1) * blue_eyes
print(prob)

0.095703125


In [13]:
#4.13b
mu = 1 / blue_eyes
std = math.sqrt((1-blue_eyes)/(blue_eyes**2))
print(mu, std)

8.0 7.483314773547883


In [14]:
#4.14a
defect = 0.02
n_failure = 10

prob = (1-defect)**(n_failure-1) * defect
print(prob)

0.016674955242602995


In [15]:
#4.14b
prob = (1-defect)**100
print(prob)

0.13261955589475294


In [16]:
#4.14c
mu = 1 / defect
std = math.sqrt((1-defect)/(defect**2))
print(mu, std)

50.0 49.49747468305833


In [17]:
# 4.14d
defect = 0.05
mu = 1 / defect
std = math.sqrt((1-defect)/(defect**2))
print(mu, std)

20.0 19.493588689617926


In [18]:
# probability of finding k successes in n trials
def binomial_distribution(p:float, n:int, k:int):
    prob = math.comb(n, k) * (p**k) * ((1-p)**(n-k))
    mu = n * p
    var = n * p * (1-p)
    std = math.sqrt(var)

    return prob, mu, var, std

In [19]:
p = 0.7
n = 8
k = 5

results = [binomial_distribution(p, n, k)]

print(results[0])

(0.25412184000000004, 5.6, 1.6800000000000002, 1.2961481396815722)
