In [14]:
import numpy as np
import pandas as pd
from scipy.stats import f
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Test of proportions 

* 'sex' and 'smoker' are two categorical variables
* We want to see if the proportion of smokers in the female population is significantly less than it is in the male population

#### Ho = The proportions are equal
#### Ha = The two proportions are not equal

In [47]:
female_smokers = df[df['sex'] == 'female'].smoker.value_counts()[1]  # number of female smokers
male_smokers = df[df['sex'] == 'male'].smoker.value_counts()[1] # number of male smokers
n_females = df.sex.value_counts()[1] # number of females in the data
n_males = df.sex.value_counts()[0] #number of males in the data

In [94]:
print([female_smokers, male_smokers] , [n_females, n_males])
print(f' Proportion of smokers in females, males = {round(115/662,2)}%, {round(159/676,2)}% respectively')

[115, 159] [662, 676]
 Proportion of smokers in females, males = 0.17%, 0.24% respectively


The proportions are different but are they statistically significant?

In [102]:
from statsmodels.stats.proportion import proportions_ztest

stat, pval = proportions_ztest([female_smokers, male_smokers] , [n_females, n_males])

if pval < 0.05:
    print(f'With a p-value of {round(pval,4)} the difference is significant. aka |We reject the null|')
else:
    print(f'With a p-value of {round(pval,4)} the difference is not significant. aka |We fail to reject the null|')

With a p-value of 0.0053 the difference is significant. aka |We reject the null|


# Test of Variance

For chi square table: https://people.smp.uq.edu.au/YoniNazarathy/stat_models_B_course_spring_07/distributions/chisqtab.pdf

In [123]:
nineteen = df[df['age'] == 19]
nineteen.sex.value_counts()

male      35
female    33
Name: sex, dtype: int64

In [109]:
sample_male = nineteen[nineteen['sex'] == 'male'].bmi.iloc[:-2]   #excluding the last two elements to match the size 2 samples
sample_female = nineteen[nineteen['sex'] == 'female'].bmi


In [118]:
v1, v2 = np.var(sample_female) , np.var(sample_male)
print(v1,v2)

30.765708585858583 35.05058650137741


#### Variances of bmi of men is higher than it is for women. But is the difference statistically significant?

#### Ho : Variation in bmi of men and women is equal same
#### Ha : Variation in bmi of men is greater than it is in women

In [120]:
n = 33  # number of samples
dof = n - 1  # degrees of freedom
alpha = 0.05  # significance level
chi_critical = 46.19    # critical chi_squared statistic. From the table 

In [122]:
chi = (dof*v1)/v2

if chi < chi_critical:
    print("Since the test statistic is less than the critical value, we fail to reject the null")
else:
    print("Since the test statistic is more than the critical value, we reject the null")

Since the test statistic is less than the critical value, we fail to reject the null
