# Essential Math for Data Science

## 3. Descriptive and Inferential Statistics

### Mean and Weighted Mean

In [1]:
# Pets owned by each person in the sample (one entry per person)
sample = [1, 3, 2, 5, 7, 0, 2, 3]

# Arithmetic mean: the total of all observations divided by how many there are
pet_total = sum(sample)
people_count = len(sample)
mean = pet_total / people_count

print(mean)

2.875


In [2]:
# Exam scores: three regular exams weighted 0.20 each, then a final weighted 0.40
sample = [90, 80, 63, 87]
weights = [0.20, 0.20, 0.20, 0.40]

# Weighted mean: multiply each score by its weight, add the products up,
# then divide by the total weight.
weighted_total = sum(score * weight for score, weight in zip(sample, weights))
weighted_mean = weighted_total / sum(weights)

print(weighted_mean)

81.4


In [3]:
# Same exam scores, but with relative weights: each of the three exams counts
# once and the final exam counts twice. Dividing by sum(weights) normalizes
# them, so the weights do not have to add up to 1.
sample = [90, 80, 63, 87]
weights = [1, 1, 1, 2]

weighted_total = sum(score * weight for score, weight in zip(sample, weights))
weighted_mean = weighted_total / sum(weights)

print(weighted_mean)

81.4


### Median

In [4]:
# Number of pets each person owns
sample = [0, 1, 5, 7, 9, 10, 14]

def median(values):
    """Return the median of ``values``.

    The values are copied into a sorted list, which is printed before the
    result is computed. With an odd number of values the middle element is
    returned as-is; with an even number the two middle elements are averaged
    and the result is a float.
    """
    ordered = list(values)
    ordered.sort()
    print(ordered)

    # half is the index of the upper-middle element; is_odd is 1 for odd counts
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2.0

print(median(sample))

[0, 1, 5, 7, 9, 10, 14]
7


### Mode

In [5]:
from collections import defaultdict

sample = [1, 3, 2, 5, 7, 0, 2, 3]

def mode(values):
    """Return a list of the value(s) that occur most often in ``values``.

    Every value whose occurrence count equals the highest count is included,
    so ties produce more than one entry.
    """
    tally = defaultdict(int)
    for value in values:
        tally[value] += 1

    highest = max(tally.values())
    # Walk the distinct values and keep those that reach the highest count
    return [candidate for candidate in set(values) if tally[candidate] == highest]

print(mode(sample))

[2, 3]


In [8]:
from collections import defaultdict

# A defaultdict whose factory returns 0, so a key that has not been seen yet
# starts at 0 the first time it is accessed
counts = defaultdict(lambda: 0)

# Tally each fruit in turn; 'apple' shows up twice, 'banana' once
for fruit in ('apple', 'banana', 'apple'):
    counts[fruit] += 1

print(counts)

defaultdict(<function <lambda> at 0x000001BDF917B550>, {'apple': 2, 'banana': 1})


### Population Variance and Standard Deviation

In [11]:
data = [0, 1, 5, 7, 9, 10, 14]

def variance(values):
    """Return the population variance of ``values``.

    This is the average of the squared differences between each value and
    the mean, dividing by the number of values.
    """
    n = len(values)
    center = sum(values) / n
    squared_deviations = [(v - center) ** 2 for v in values]
    return sum(squared_deviations) / n

print(round(variance(data), 2))

21.39


In [12]:
from math import sqrt

data = [0, 1, 5, 7, 9, 10, 14]

def variance(values):
    """Return the population variance: the mean squared deviation from the mean."""
    n = len(values)
    center = sum(values) / n
    squared_total = sum((v - center) ** 2 for v in values)
    return squared_total / n

def std_dev(values):
    """Return the population standard deviation, the square root of the variance."""
    population_variance = variance(values)
    return sqrt(population_variance)

print(round(std_dev(data), 2))

4.62


### Sample Variance and Standard Deviation

In [28]:
from math import sqrt

data = [0, 1, 5, 7, 9, 10, 14]

def variance(values, is_sample: bool = False):
    """Return the variance of ``values``.

    The sum of squared deviations from the mean is divided by ``len(values)``
    by default (population variance). When ``is_sample`` is truthy it is
    divided by ``len(values) - 1`` instead (sample variance).
    """
    center = sum(values) / len(values)
    squared_total = sum((v - center) ** 2 for v in values)

    if is_sample:
        denominator = len(values) - 1
    else:
        denominator = len(values)
    return squared_total / denominator

def std_dev(values, is_sample: bool = False):
    """Return the square root of ``variance(values, is_sample)``."""
    chosen_variance = variance(values, is_sample)
    return sqrt(chosen_variance)

print(f'VARIANCE = {round(variance(data, is_sample=True), 2)}')
print(f'STD DEV = {round(std_dev(data, is_sample=True), 2)}')

VARIANCE = 24.95
STD DEV = 5.0


### The Probability Density Function (PDF)

In [30]:
# The function below uses math.pi and math.exp, so the module itself has to be
# imported in this cell for it to work on a fresh kernel.
import math

def normal_pdf(x: float, mean: float, std_dev: float) -> float:
    """Return the normal-distribution probability density at ``x``.

    Evaluates ``1 / sqrt(2 * pi * std_dev**2) * exp(-(x - mean)**2 / (2 * std_dev**2))``.
    The result is a density (a likelihood), not a probability.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std_dev: Standard deviation of the distribution. A value of 0 raises
            ZeroDivisionError because the formula divides by it.

    Returns:
        The density value as a float.
    """
    return (1.0 / (2.0 * math.pi * std_dev ** 2) ** 0.5) * math.exp(-1.0 * ((x - mean) ** 2 / (2.0 * std_dev ** 2)))

### The Cumulative Distribution Function (CDF)

In [31]:
from scipy.stats import norm

# Parameters of the normal distribution being queried
mean = 64.43
std_dev = 2.99

# Cumulative probability up to 64.43, which is the same value as the mean
x = norm.cdf(64.43, loc=mean, scale=std_dev)

print(x)

0.5


In [38]:
from scipy.stats import norm

# Parameters of the normal distribution being queried
mean = 64.43
std_dev = 2.99

# Cumulative probability of a value of 67 or less
x = norm.cdf(67, loc=mean, scale=std_dev)

print(f'{round(x*100,2)}%')

80.5%


In [40]:
from scipy.stats import norm

# Parameters of the normal distribution being queried
mean = 64.43
std_dev = 2.99

# Probability of a value between 62 and 66: the CDF at the upper bound minus
# the CDF at the lower bound
p_up_to_66 = norm.cdf(66, loc=mean, scale=std_dev)
p_up_to_62 = norm.cdf(62, loc=mean, scale=std_dev)
x = p_up_to_66 - p_up_to_62

print(f'{round(x*100,2)}%')

49.2%
