In [1]:
from pandas import Series
import numpy as np
import math


# Standard deviation... deviations

Pandas defaults to the sample standard deviation (dividing by n-1),
rather than using the classic (population) standard deviation calculation of `math.sqrt(((s - s.mean()) ** 2).sum() / s.count())`.
NumPy, by contrast, does not use the sample standard deviation by default: `np.std` divides by n (`ddof=0`).
You can get the population standard deviation in pandas by passing `ddof=0` as an argument to `std`: `s.std(ddof=0)`.
The `ddof` (delta degrees of freedom) is the number subtracted from `count` before dividing, so the pandas default is `1`.

## Reminder about what stddev is...

In a normal distribution we expect:

- 68% of data to be within 1 stddev of the mean
- 95% of data to be within 2 stddev of the mean
- 99.7% of data to be within 3 stddev of the mean


# Aggregate methods in pandas

- `s.std()` - standard deviation
- `s.mean()` - mean, average
- `s.sum()`
- `s.count()`


In [2]:
# Surprising aggregates caused by the data type: the values below are text
# (dtype object), so sum() joins them end to end instead of adding numbers.
s = Series("1234 5678 9012".split())
print(s)
joined = s.sum()
print(f"The sum of this series is: {joined}")

# Historical note: s.mean() used to "work" on this series by concatenating
# the strings, converting the result to an int and dividing by 3. That no
# longer seems to be the case; it now raises a TypeError instead, which is
# the sensible thing to do, so the call is deliberately not executed here.


0    1234
1    5678
2    9012
dtype: object
The sum of this series is: 123456789012


In [3]:
# Series values use fixed-width NumPy dtypes rather than Python ints, so they
# can overflow: int8 tops out at 127, and 127 + 1 wraps around to -128.
s = Series([124, 125, 127], dtype=np.int8)
incremented = s + 1
print(incremented)


0    125
1    126
2   -128
dtype: int8


In [4]:
# A dtype change produces a new Series: astype() returns the converted copy,
# which is rebound to the same name here.
s = Series("10 20 30".split())
print(s)
s = s.astype("int64")
print(s)


0    10
1    20
2    30
dtype: object
0    10
1    20
2    30
dtype: int64


In [5]:
# Exercise 2 - generate 10 test scores between 40 and 60 (inclusive), indexed
# by the months Sep through Jun. Then find the mean and shift every score by
# the gap between that mean and 80, so the adjusted scores average 80.

g = np.random.default_rng(0)  # fixed seed keeps the scores reproducible
months = "Sep Oct Nov Dec Jan Feb Mar Apr May Jun"
# integers() excludes the upper bound, hence 61 to allow a score of 60.
s = Series(g.integers(40, 61, 10), index=months.split())
real_mean = s.mean()
print(f"Series mean is {real_mean} for\n{s}")

shift = 80 - real_mean
adjusted = s + shift
print(f"Adjusted scores for mean of 80 are:\n{adjusted}")


Series mean is 47.2 for
Sep    57
Oct    53
Nov    50
Dec    45
Jan    46
Feb    40
Mar    41
Apr    40
May    43
Jun    57
dtype: int64
Adjusted scores for mean of 80 are:
Sep    89.8
Oct    85.8
Nov    82.8
Dec    77.8
Jan    78.8
Feb    72.8
Mar    73.8
Apr    72.8
May    75.8
Jun    89.8
dtype: float64


In [None]:
# Extension 1. Grading against the curve.
# Grade bands relative to the mean (m) and the standard deviation (sd):
#   A: score >  m + sd
#   B: m      < score <= m + sd
#   C: m - sd < score <= m
#   D: score <= m - sd
g = np.random.default_rng(0)
months = "Sep Oct Nov Dec Jan Feb Mar Apr May Jun"
s = Series(g.integers(40, 61, 10), index=months.split())
real_mean = s.mean()
std_dev = s.std()  # pandas default, i.e. the sample standard deviation

one_sd_above = real_mean + std_dev
one_sd_below = real_mean - std_dev
is_a = s > one_sd_above
is_b = (s > real_mean) & (s <= one_sd_above)
is_c = (s > one_sd_below) & (s <= real_mean)
is_d = s <= one_sd_below

print(f"Mean: {real_mean}, SD: {std_dev:.2f}")
print(f"A scores:\n{s[is_a]}")
print(f"B scores:\n{s[is_b]}")
print(f"C scores:\n{s[is_c]}")
print(f"D scores:\n{s[is_d]}")

# Extension 2. What months were the scores outside 2 SD?
two_sd = std_dev * 2
is_outside_two_sd = (s < real_mean - two_sd) | (s > real_mean + two_sd)
print("Months outside 2 standard deviations.")
print(s[is_outside_two_sd])

# Extension 3. How close were the mean and median?
median = s.median()
print(f"Series mean: {real_mean} vs Series Median: {median}")
# 47.2 vs 45.5, so the two are quite close.
# A large gap between them would point to a skewed distribution or to
# outliers pulling the mean away from the middle of the data.


Mean: 47.2, SD: 6.66
A scores:
Sep    57
Jun    57
dtype: int64
B scores:
Oct    53
Nov    50
dtype: int64
C scores:
Dec    45
Jan    46
Mar    41
May    43
dtype: int64
D scores:
Feb    40
Apr    40
dtype: int64
Months outside 2 standard deviations.
Series([], dtype: int64)
Series mean: 47.2 vs Series Median: 45.5
