

# STATISTICS FORMULAS



In [None]:
import statistics
import math
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest

## Measure of spread (dispersion)

In [None]:
# Dispersion measures for a small salary data set (note the outlier, 102).
salary = [102, 33, 26, 27, 30, 25, 33, 33, 24]

# Range = largest value minus smallest value.
highest, lowest = np.max(salary), np.min(salary)
print("Range: ", highest - lowest)

# np.var / np.std use their default settings here.
print("Variance: ", np.var(salary))
print("Std: ", np.std(salary))

# Quartiles; Q2 is also called the median.
for label, pct in (("Q1:", 25), ("Q2:", 50), ("Q3:", 75)):
    print(label, np.percentile(salary, pct))

print("IQR:", stats.iqr(salary))

## Covariance - Correlation

In [None]:
# Paired observations: temperature and number of people on the same days.
temp = [93, 84, 82, 78, 98, 70]
number_of_people = [13, 10, 11, 8, 15, 9]

# np.cov and np.corrcoef both return 2x2 matrices for two input sequences.
cov_matrix = np.cov(temp, number_of_people)
print("covariance: ", cov_matrix)

corr_matrix = np.corrcoef(temp, number_of_people)
print("correlation: ", corr_matrix)

pearson_result = stats.pearsonr(temp, number_of_people)
print("correlation coefficient and p-value: ", pearson_result)


## 3 MEANS

In [None]:
# Toy data: the population is the integers 1..15; the sample holds four of them.
population = list(range(1, 16))
sample = [1, 3, 8, 11]

In [None]:
# Population mean: µ = ∑X / N
mu = sum(population) / len(population)

# Sample mean: x̄ = Σx / n
x_bar = sum(sample) / len(sample)

# Mean of the sampling distribution of the sample means (SDSM): µ(x̄) = µ
mu_x_bar = mu

for label, value in (
    ("Population mean: ", mu),
    ("Sample mean: ", x_bar),
    ("SDSM mean: ", mu_x_bar),
):
    print(label, value)


## 3 STANDARD DEVIATIONS

1. Population standard deviation (σ-sigma)


![Screenshot%20from%202024-04-06%2018-16-42.png](attachment:Screenshot%20from%202024-04-06%2018-16-42.png)

2. Sample standard deviation (s)

![Screenshot%20from%202024-04-06%2018-18-08.png](attachment:Screenshot%20from%202024-04-06%2018-18-08.png)

3. The standard deviation of the sample means

Standard Error of the Mean (SEM / σ(x̄) / sigma_x_bar) = Population std (σ) / sqrt(n)

sem = sigma / sqrt(n)  

In [None]:
# Population Standard Deviation: sigma = sqrt( Σ(x - µ)² / N )

# Sum the squared deviations of every population value from the population
# mean `mu` (computed in the "3 MEANS" section).
sum_squared_diff = sum((value - mu) ** 2 for value in population)

# Divide by N (not N - 1) because the whole population is available.
population_std_dev = math.sqrt(sum_squared_diff / len(population))

print("Population Standard Deviation:", population_std_dev)

In [None]:
# Same quantity via the standard library: statistics.pstdev is the
# population standard deviation of its argument.
population_std_dev = statistics.pstdev(population)

print("Population Standard Deviation:", population_std_dev)

In [None]:
# Sample Standard Deviation
# statistics.stdev is the sample standard deviation of its argument.

sample_std_dev = statistics.stdev(sample)

print("Sample Standard Deviation:", sample_std_dev)

In [None]:
# Sample standard deviation by hand: s = sqrt( Σ(x - x̄)² / (n - 1) )

# Sum the squared deviations of every sample value from the sample mean
# `x_bar` (computed in the "3 MEANS" section).
sum_squared_diff = sum((value - x_bar) ** 2 for value in sample)

# Divide by n - 1 (Bessel's correction) because this is a sample.
sample_variance = sum_squared_diff / (len(sample) - 1)
sample_std_dev = sample_variance ** 0.5

print("Sample Standard Deviation:", sample_std_dev)

In [None]:
# Standard Error of the Mean (SEM)

# Work on the `sample` list defined in the "3 MEANS" section.
sample_mean = statistics.mean(sample)
sample_std_dev = statistics.stdev(sample)
sample_size = len(sample)

# The sample standard deviation stands in for the population sigma here.
sem = sample_std_dev / math.sqrt(sample_size)

print("Standard Error of the Mean (SEM):", sem)


## Z Score

In [None]:
def calculate_z_score(x, mean, std_dev):
    """Return how many standard deviations ``x`` lies away from ``mean``.

    Args:
        x: The observed value.
        mean: Mean of the distribution.
        std_dev: Standard deviation of the distribution.

    Returns:
        ``(x - mean) / std_dev``.
    """
    deviation = x - mean
    return deviation / std_dev


# Example: the value 60 in a distribution with mean 50 and standard deviation 10.
mean, std_dev, x = 50, 10, 60

z_score = calculate_z_score(x, mean, std_dev)
print("Z-score:", z_score)


## Z Score for sampling distribution of sample mean

In [None]:
# Formula template: assign `sigma` (population standard deviation) and `n`
# (sample size) before running; `x_bar` and `mu` come from the "3 MEANS" section.
z_score = (x_bar - mu) / (sigma / math.sqrt(n))

In [None]:
def calculate_z_score_sample_mean(sample_mean, population_mean, population_std_dev, sample_size):
    """Return the z-score of a sample mean within its sampling distribution.

    Args:
        sample_mean: Observed mean of the sample.
        population_mean: Mean of the population.
        population_std_dev: Standard deviation of the population.
        sample_size: Number of observations in the sample.

    Returns:
        ``(sample_mean - population_mean) / (population_std_dev / sqrt(sample_size))``.
    """
    standard_error = population_std_dev / math.sqrt(sample_size)
    return (sample_mean - population_mean) / standard_error


# Example values
sample_mean, population_mean = 100, 95
population_std_dev, sample_size = 10, 30

z_score = calculate_z_score_sample_mean(
    sample_mean, population_mean, population_std_dev, sample_size
)
print("Z-score for Sampling Distribution of Sample Mean:", z_score)


## Confidence Interval for the mean

In [None]:
# Point estimate is x_bar: The sample mean, x_bar, is a point estimate of the population mean mu!
# The point estimate is simply the midpoint of the confidence interval.
# Confidence Interval = x_bar ± margin of error
# Margin of error = Critical Value(Z Score/Reliability Factor) * Standard Error 

In [None]:
# Template with example values -- replace them with your own sample statistics.
x_bar = 100              # sample mean (the point estimate)
std = 10                 # standard deviation
n = 30                   # sample size
confidence_level = 0.95

# The margin of error uses the two-sided critical value for the chosen
# confidence level (about 1.96 for 95%), not the test statistic of the sample.
z_score = stats.norm.ppf(1 - (1 - confidence_level) / 2)

sem = std / math.sqrt(n)  # standard error
moe = z_score * sem       # margin of error

In [None]:
# (lower, upper) bounds of the interval, using the same names as the template
# cell above: x_bar, z_score (critical value), std and n.
CI = (x_bar - z_score * std / math.sqrt(n), x_bar + z_score * std / math.sqrt(n))

In [None]:
# 95% confidence interval for the mean of the `sample` list defined in the
# "3 MEANS" section.
sample_mean = statistics.mean(sample)
sample_std_dev = statistics.stdev(sample)
sample_size = len(sample)
confidence_level = 0.95

z_score = 1.96  # For a 95% confidence interval

# Margin of error = critical value * standard error.
standard_error = sample_std_dev / math.sqrt(sample_size)
margin_of_error = z_score * standard_error

# The point estimate (sample mean) is the midpoint of the interval.
lower_bound = sample_mean - margin_of_error
upper_bound = sample_mean + margin_of_error

print("95% Confidence Interval:", (lower_bound, upper_bound))


In [None]:
# calculate with scipy

# The first argument is the confidence level (e.g. 0.95), not the significance
# level `alpha` used in the hypothesis-test sections below.
# confidence_level, sample_mean and standard_error come from the previous cell.
ci_z = stats.norm.interval(confidence_level, loc=sample_mean, scale=standard_error)

# loc = mean, scale = sem


## Confidence Interval for the proportion

In [None]:
# point estimate p_hat: sample proportion
# The point estimate is simply the midpoint of the confidence interval.
# Confidence Interval = p_hat ± margin of error
# Margin of error = Critical Value(Z Score/Reliability Factor) * Standard Error 

![Screenshot%20from%202024-04-07%2000-23-29.png](attachment:Screenshot%20from%202024-04-07%2000-23-29.png)

![Screenshot%20from%202024-04-07%2000-54-52.png](attachment:Screenshot%20from%202024-04-07%2000-54-52.png)

In [None]:
# Inputs: observed sample proportion (p_hat), sample size and confidence level.
sample_proportion = 0.6
sample_size = 100
confidence_level = 0.95

# Critical value for a 95% confidence interval.
z_score = 1.96

# Standard error of a proportion: sqrt(p_hat * (1 - p_hat) / n).
standard_error = math.sqrt(sample_proportion * (1 - sample_proportion) / sample_size)
margin_of_error = z_score * standard_error

# The interval is centred on the point estimate p_hat.
lower_bound, upper_bound = (
    sample_proportion - margin_of_error,
    sample_proportion + margin_of_error,
)

print("95% Confidence Interval for Proportion:", (lower_bound, upper_bound))

In [None]:
# calculate with scipy

# Template: assign `n` (sample size), `loc` and `scale` before running.
# The first argument is the confidence level (e.g. 0.95), not the significance
# level `alpha` used in the hypothesis-test sections below.
ci_t = stats.t.interval(confidence_level, df=n - 1, loc=loc, scale=scale)

# loc = mean, scale = sem, df = degrees of freedom (n - 1)


## One sample z test

In [None]:
# Performed when the population mean and standard deviation are known.

In [None]:
# 1. set the hypothesis
# Ho: 
# Ha: 

In [None]:
# 2. calculate the test statistic

# Template: x_bar, mu, sigma and n must be assigned before running this cell.
z = (x_bar - mu) / (sigma / math.sqrt(n))

In [None]:
# 3. calculate the p value

# Right-tailed p-value P(Z > z). The survival function equals 1 - cdf but
# keeps precision for large z, where 1 - cdf rounds to 0.
p_value = stats.norm.sf(z)

In [None]:
# 4. check the hypothesis and make a decision

alpha = 0.05  # significance level

# Reject Ho only when the p-value is strictly below alpha.
print(
    (
        'At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'
        if p_value < alpha
        else 'At {} level of significance, we fail to reject the null hypothesis.'
    ).format(alpha)
)


## One sample t test

In [None]:
# Performed when the population standard deviation is not known; typically used when the sample size is less than 30.
# It can also be performed when the sample size is above 30.

In [None]:
# 1. set the hypothesis
# Ho: 
# Ha: 

In [None]:
# 2. calculate the test statistic

# Template: x_bar, mu, s (sample standard deviation) and n must be assigned
# before running this cell.
t = (x_bar - mu) / (s / math.sqrt(n))

In [None]:
# 3. calculate the p value

# Cumulative probability P(T <= t) with n - 1 degrees of freedom.
p_value = stats.t.cdf(t, df=n - 1)

In [None]:
# 4. check the hypothesis and make a decision

# `alpha` (significance level) and `p_value` must already be defined.
print(
    (
        'At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'
        if p_value < alpha
        else 'At {} level of significance, we fail to reject the null hypothesis.'
    ).format(alpha)
)


## One sample t test with ttest_1samp

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html#scipy-stats-ttest-1samp

In [None]:
# Calculate the T-test for the mean of ONE group of scores.
# This is a test for the null hypothesis that the expected value (mean) of a sample of independent observations a is equal to the given population mean, popmean.

In [None]:
# 1. set the hypothesis
# Ho: 
# Ha: 

In [None]:
# 2. calculate the p value

# `alternative` must be passed by keyword: the third positional parameter of
# ttest_1samp is `axis`, so a positional string would be taken as the axis.
onesample = stats.ttest_1samp(a, popmean, alternative=alternative)

# a (sample observations)
# popmean (Expected value in null hypothesis)
# alternative (Defines the alternative hypothesis: 'two-sided', 'less', 'greater')
# result is t_statistics and p_value

In [None]:
# 3. check the hypothesis and make a decision

alpha = 0.05  # significance level

# Reject Ho only when the p-value is strictly below alpha.
print(
    (
        'At {} level of significance, we can reject the null hypothesis in favor of alternative hypothesis.'
        if onesample.pvalue < alpha
        else 'At {} level of significance, we fail to reject the null hypothesis.'
    ).format(alpha)
)


## Independent Samples T Test

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html#scipy.stats.ttest_ind

In [None]:
# Calculate the T-test for the means of two independent samples of scores.
# This is a test for the null hypothesis that 2 independent samples have identical average (expected) values. 
# This test assumes that the populations have identical variances by default.

In [None]:
# 1. set the hypothesis
# Ho: 
# Ha: 

In [None]:
# 2. Perform Levene's test for equal variances

# H0: The population variances are equal.
# H1: There is a difference between the variances in the population.
# A small p-value suggests that the populations do not have equal variances (use equal_var=False).
# A large p-value suggests that the populations have equal variances (use equal_var=True).

# Template: `group1` and `group2` are the two samples being compared.
leveneTest = stats.levene(group1, group2)
leveneTest

In [None]:
# 3. Calculate the T-test for the means of two independent samples of scores.
# Calculate test statistics and P VALUE

# Feed the Levene result into the t-test: assume equal variances only when
# Levene's test does not reject them at the 0.05 level; otherwise scipy runs
# Welch's t-test (equal_var=False).
equal_variances = leveneTest.pvalue >= 0.05
indTest = stats.ttest_ind(group1, group2, equal_var=equal_variances)

# result is t_statistics and p_value and df

In [None]:
# 4. check the hypothesis and make a decision

alpha = 0.05  # significance level

if indTest.pvalue < alpha:
    # .format(alpha) fills the "{}" placeholder with the significance level.
    print("Reject the null. At the α={}, we have sufficient evidence that two groups differ from each other.".format(alpha))
else:
    print("Fail to reject the null.")


## Paired (Dependent) Samples T Test

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html#scipy.stats.ttest_rel

In [None]:
# Calculate the t-test on TWO RELATED samples of scores, a and b.
# This is a test for the null hypothesis that two related or repeated samples have identical average (expected) values.

In [None]:
# 1. set the hypothesis
# Ho: 
# Ha: 

In [None]:
# 2. Calculate p_value

# `alternative` must be passed by keyword: the third positional parameter of
# ttest_rel is `axis`, so a positional string would be taken as the axis.
pairedtest = stats.ttest_rel(group1, group2, alternative=alternative)

# alternative{‘two-sided’, ‘less’, ‘greater’}, optional, Defines the alternative hypothesis. Default is ‘two-sided’
# result is t_statistics and p_value and df

In [None]:
# 3. check the hypothesis and make a decision

alpha = 0.05  # significance level

# Reject Ho only when the p-value is strictly below alpha.
print(
    "Reject the Null. The data indicates there is difference between two groups."
    if pairedtest.pvalue < alpha
    else "Fail to reject!"
)


## ANOVA

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html#scipy.stats.f_oneway

In [None]:
# The one-way ANOVA,tests the null hypothesis that two or more groups have the same population mean. 
# The test is applied to samples from two or more groups, possibly with differing sizes.

In [None]:
# 1. set the hypothesis
# H0: μ1 = μ2 = μ3 = μ4
# Ha: At least one μi is different

In [None]:
# 2. Calculate p_value

# f_oneway takes each group as its own positional argument, so the collection
# of groups is unpacked (`samples` is expected to hold one array-like per group).
anova_test = stats.f_oneway(*samples)

# result is F statistics and the associated p-value from the F distribution.

In [None]:
# 3. Check the hypothesis and make a decision

alpha = 0.05  # significance level

# Reject H0 only when the p-value is strictly below alpha.
print(
    "We can reject H0 at 0.05 significance level and conclude that the differences among group means are statistically significant"
    if anova_test.pvalue < alpha
    else "Fail to reject"
)


## CHI_SQUARE

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html#scipy.stats.chi2_contingency

In [None]:
# Chi-square test of independence of variables in a contingency table.

# This function computes the chi-square statistic and p-value for the hypothesis test of independence of the observed frequencies in the contingency table observed.

In [None]:
# 1. Set the hypothesis
# Ho: The two categorical variables are independent.
# Ha: The two categorical variables are dependent.

In [None]:
# 2. Make a cross tab

# Template: `index` and `columns` are the two categorical variables to tabulate.
crosstab = pd.crosstab(index=index, columns=columns)

In [None]:
# 3. Calculate p_value

# `crosstab` is the contingency table built in the previous step.
(stat, p, dof, expected) = stats.chi2_contingency(observed=crosstab)

# The result unpacks into: test statistic, p-value, degrees of freedom,
# expected frequencies.

In [None]:
# 4. Check the hypothesis and make a decision

print('stat=%.3f, p=%.4f' % (stat, p))

# Ho is kept when the p-value is above the 0.05 significance level.
print(
    'Fail to reject Ho'
    if p > 0.05
    else 'We reject the null hypothesis. The two categorical variables are dependent.'
)


## A/B TEST

In [None]:
# 1. set the hypothesis
# Ho: There is no significant difference between success rates
# Ha: There is significant difference between success rates

In [None]:
# 2. Calculate success rate, needed in manual calculation
# Template: `success` and `observation` are placeholders to be assigned first
# (presumably the number of successes and the number of observations per group -- confirm).
success / observation

In [None]:
# 3. Calculate p_value
# use ztest for proportions from statsmodels.stats.proportion

stat, pval = proportions_ztest(count=counts, nobs=nobs)

# Returns the z statistic and the p-value.
# counts: the number of successes in nobs trials for each independent sample.
# nobs: the number of trials or observations, with the same length as counts.

In [None]:
# 4. Check the hypothesis and make a decision

# Ho is kept when the p-value is above the 0.05 significance level; otherwise
# the conclusion is the A/B test's alternative hypothesis (success rates differ).
if pval > 0.05:
    print('Fail to reject Ho')
else:
    print('We reject the null hypothesis. There is a significant difference between success rates.')

https://www.statsmodels.org/dev/generated/statsmodels.stats.proportion.proportions_ztest.html

In [None]:
# proportions_ztest can run both one-sample and two-sample z-tests for proportions.