# Descriptive Statistics

## Mean

In [None]:
# Arithmetic mean: the total of the observations divided by how many there are.
sample = [1, 3, 2, 5, 7, 0, 2, 3]
observation_count = len(sample)
mean = sum(sample) / observation_count
print(mean)

2.875


## Weighted Mean

In [None]:
# Weighted mean: every score is multiplied by its weight, and the weighted
# total is divided by the sum of the weights.
sample = [90, 80, 63, 87]
weights = [.20, .20, .20, .40]
weighted_total = sum(score * weight for score, weight in zip(sample, weights))
weighted_mean = weighted_total / sum(weights)
print(weighted_mean)

81.4


## Median

In [None]:
sample = [0, 1, 5, 7, 9, 10, 14]


def median(values):
    """Return the median (middle value) of ``values``.

    The values are sorted first. With an odd count the single middle value
    is returned as-is; with an even count the two middle values are averaged
    and the result is a float.

    Args:
        values: an iterable of numbers; it is not modified.

    Returns:
        The middle value, or the average of the two middle values.

    Raises:
        ValueError: if ``values`` is empty.
    """
    ordered = sorted(values)
    n = len(ordered)
    if n == 0:
        raise ValueError("median() requires at least one value")

    mid = n // 2
    if n % 2 == 0:
        # Even count: there is no single middle element, so average the
        # two elements that straddle the centre.
        return (ordered[mid - 1] + ordered[mid]) / 2.0
    return ordered[mid]


print(median(sample))

## Mode

In [None]:
from collections import defaultdict
sample = [1, 3, 2, 5, 7, 0, 2, 3]


def mode(values):
    """Return a list of the value(s) that occur most often in ``values``.

    Every value tied for the highest occurrence count is included. The
    result follows the iteration order of ``set(values)``.
    """
    tally = defaultdict(int)
    for item in values:
        tally[item] += 1

    highest = max(tally.values())
    return [candidate for candidate in set(values) if tally[candidate] == highest]


print(mode(sample))

[2, 3]


## Variance
### Measure of how spread out the data is.


In [None]:
data = [0, 1, 5, 7, 9, 10, 14]


def variance(values):
    """Return the variance of ``values``.

    This is the average squared distance from the mean, dividing by the
    full number of values (``len(values)``).
    """
    count = len(values)
    average = sum(values) / count
    squared_deviations = [(value - average) ** 2 for value in values]
    return sum(squared_deviations) / count


print(variance(data))

21.387755102040813


## Standard Deviation

### The square root of the variance gives us the standard deviation.

In [None]:
from math import sqrt
# Number of pets each person owns
data = [0, 1, 5, 7, 9, 10, 14]


def variance(values):
    """Return the mean squared deviation of ``values`` from their mean.

    The sum of squared deviations is divided by ``len(values)``.
    """
    mu = sum(values) / len(values)
    total_squared_error = sum((x - mu) ** 2 for x in values)
    return total_squared_error / len(values)


def std_dev(values):
    """Return the standard deviation: the square root of ``variance(values)``."""
    spread = variance(values)
    return sqrt(spread)


print(std_dev(data))

4.624689730353898


## Normal Distribution


In [None]:
def normal_pdf(x: float, mean: float, std_dev: float) -> float:
    """Return the normal-distribution probability density at ``x``.

    Evaluates ``1 / sqrt(2*pi*std_dev**2) * exp(-(x - mean)**2 / (2*std_dev**2))``.

    Args:
        x: the point at which to evaluate the density.
        mean: the mean of the distribution.
        std_dev: the standard deviation of the distribution.

    Returns:
        The density value at ``x``.

    Raises:
        ZeroDivisionError: if ``std_dev`` is 0.
    """
    # Imported here so the function works on its own: no import of the
    # `math` module itself is visible in the surrounding cells.
    import math

    variance = std_dev ** 2
    normalizer = 1.0 / math.sqrt(2.0 * math.pi * variance)
    exponent = -((x - mean) ** 2) / (2.0 * variance)
    return normalizer * math.exp(exponent)

## Random Number Generation

In [None]:
import random
from scipy.stats import norm
# Generate 1000 random weights: draw a uniform probability between 0.0 and 1.0
# and map it through the normal inverse CDF (ppf) with loc=64.43, scale=2.99.
draw_count = 1000
for draw in range(draw_count):
    random_p = random.uniform(0.0, 1.0)
    random_weight = norm.ppf(random_p, loc=64.43, scale=2.99)
    print(random_weight)

65.28603065767983
60.81660263982803
60.73225725100315
65.00044733295799
62.662049112103126
69.42846908678362
57.65749027142839
60.41244996212469
63.469193403454376
60.42247100140582
64.70866936641636
64.87098412345075
66.02507675730956
60.97782254076579
63.8653388577827
64.31843032904214
63.70805078678285
62.11516903181444
64.83570612790197
70.37460986129709
65.03522611047957
58.91343874458489
67.5237422237811
67.79517923176506
61.95122954303889
67.26528939381393
62.27102025679862
63.81787577152884
60.54567108439991
63.11032331576096
67.19865476141914
60.963448931764866
62.66125163114793
64.42050736836576
68.57375920502753
63.866289279566956
60.111310842286194
64.62837729458725
62.1295646737763
69.46639519763092
61.455346607357086
65.69532350866268
62.830211284064674
65.92609284968754
66.57163083882978
66.51798302610108
65.28996753770652
64.12531346524919
62.86160429953919
63.28013369164582
62.526537785258995
66.69677988856452
62.8213063355154
65.9861181399734
68.38849742505413
67.2768

## Z Scores

In [None]:
def z_score(x, mean, std):
    """Return how many standard deviations ``x`` lies from ``mean``."""
    deviation = x - mean
    return deviation / std


def z_to_x(z, mean, std):
    """Convert a z-score back to a value: ``mean`` plus ``z`` times ``std``."""
    return mean + z * std


mean = 140000
std_dev = 3000
x = 150000

# Round trip: value -> z-score -> value.
z = z_score(x, mean, std_dev)
back_to_x = z_to_x(z, mean, std_dev)
print("Z-Score: {}".format(z))  # approximately 3.333
print("Back to X: {}".format(back_to_x))

## Central Limit Theorem

Taken individually, the random numbers in the samples do not follow a normal distribution.
Their distribution is flat, with every number equally likely
(known as a uniform distribution).
But when we group them into samples and average each sample, the averages form a normal distribution.
This is because of the central limit theorem.

In [None]:
import random
import plotly.express as px
sample_size = 31
sample_count = 1000


def _uniform_sample_mean(size):
    """Average ``size`` random numbers drawn uniformly between 0.0 and 1.0."""
    return sum([random.uniform(0.0, 1.0) for _ in range(size)]) / size


# Central limit theorem: 1000 sample means, each computed from 31
# random numbers between 0.0 and 1.0.
x_values = [_uniform_sample_mean(sample_size) for _ in range(sample_count)]

# Every sample mean carries a y value of 1 in the histogram.
y_values = [1] * sample_count
px.histogram(x=x_values, y=y_values, nbins=20).show()

## Confidence Intervals
A confidence interval is a range, calculated from a sample, that shows how confident we are that the population mean (or another parameter) lies within that range around the sample mean.

### ***Requirements***
- Critical z Value
- Level of Confidence
- Margin of Error


In [None]:
from math import sqrt
from scipy.stats import norm
def critical_z_value(p):
    """Return the (lower, upper) standard-normal z values for confidence level ``p``.

    The area outside the central ``p`` is split evenly between the two
    tails, and each tail boundary is looked up with the inverse CDF (ppf)
    of a normal distribution with loc=0.0 and scale=1.0.
    """
    standard_normal = norm(loc=0.0, scale=1.0)
    tail_area = (1.0 - p) / 2.0
    return standard_normal.ppf(tail_area), standard_normal.ppf(1.0 - tail_area)


def confidence_interval(p, sample_mean, sample_std, n):
    """Return the (lower, upper) bounds of the confidence interval.

    Each critical z value is scaled by ``sample_std / sqrt(n)`` and added
    to ``sample_mean``.
    """
    # Sample size must be greater than 30
    lower_z, upper_z = critical_z_value(p)
    standard_error = sample_std / sqrt(n)
    return sample_mean + lower_z * standard_error, sample_mean + upper_z * standard_error


print(confidence_interval(p=.95, sample_mean=64.408, sample_std=2.05, n=31))


# t distribution
from scipy.stats import t
# get critical value range for 95% confidence
# with a sample size of 25
n = 25
degrees_of_freedom = n - 1
# Look up the t value at the .025 and .975 cumulative probabilities.
lower, upper = (t.ppf(tail, df=degrees_of_freedom) for tail in (.025, .975))
print(lower, upper)


(63.68635915701992, 65.12964084298008)
-2.063898561628021 2.0638985616280205


*“Based on my sample of 31 golden retriever weights with a sample mean of 64.408 and a sample standard deviation of 2.05, I am 95% confident the population mean lies between 63.686 and 65.130.”*

## Hypothesis Testing

### - One-Tailed Test

In [None]:
from scipy.stats import norm

# Cold has 18 day mean recovery, 1.5 std dev
mean, std_dev = 18, 1.5
lower_days, upper_days = 15, 21

# Probability that recovery takes between 15 and 21 days: the CDF at the
# upper bound minus the CDF at the lower bound (roughly 95%).
below_upper = norm.cdf(upper_days, mean, std_dev)
below_lower = norm.cdf(lower_days, mean, std_dev)
x = below_upper - below_lower

print(x)

0.9544997361036416


### - Two-Tailed Test
Two-tailed tests are preferable in most cases.
They tend to be more reliable and not bias the hypothesis in just one direction.

In [1]:
from scipy.stats import norm

# Cold has 18 day mean recovery, 1.5 std dev
mean, std_dev = 18, 1.5
lower_days, upper_days = 16, 20

# Lower tail: probability of recovering in 16 days or fewer.
p1 = norm.cdf(lower_days, mean, std_dev)

# Upper tail: probability of recovering in 20 days or more, taken as the
# complement of the CDF at 20.
p2 = 1.0 - norm.cdf(upper_days, mean, std_dev)

# The two-tailed p-value is the combined area of both tails.
p_value = p1 + p2
print(p_value)

0.18242243945173575


## Value range with a T-distribution

In [None]:
from scipy.stats import t

# get critical value range for 95% confidence
# with a sample size of 25
n = 25
dof = n - 1  # degrees of freedom passed to the t distribution

# .025 in each tail leaves the central 95% between the two values.
lower = t.ppf(.025, df=dof)
upper = t.ppf(.975, df=dof)
print(lower, upper)