In [1]:
from pandas import Series
import numpy as np
import math


In [None]:
# Draw a reproducible sample, summarise it, then show how a single extreme
# value shifts the summary statistics.
g = np.random.default_rng(0)
# Generator.normal's args are (mean, std dev, number of values)
s = Series(g.normal(0, 100, 100_000))

# Descriptive stats of the untouched sample. Mean and median are computed
# once and reused for the delta line.
s_mean, s_median = s.mean(), s.median()
print(f"Mean: {s_mean}")
print(f"Median: {s_median}")
print(f"Std dev: {s.std()}")
print(f"Delta: {abs(s_mean - s_median)}")

# describe() reports count, mean, std, min, quartiles and max in one call.
print(s.describe())
# Individual quantiles are also available via s.quantile(q),
# e.g. s.quantile(0.25) for the 25th percentile.

# Replace the minimum value with 5 times the maximum -- on a NEW Series.
# mask() returns a modified copy, so `s` keeps the original sample and the
# artificial outlier cannot leak into other cells that read `s`, whatever
# order the cells are executed in.
s_outlier = s.mask(s == s.min(), s.max() * 5)

outlier_mean, outlier_median = s_outlier.mean(), s_outlier.median()
print(f"New mean: {outlier_mean}")
print(f"New median: {outlier_median}")
print(f"New std dev: {s_outlier.std()}")
print(f"New delta: {abs(outlier_mean - outlier_median)}")

print(s_outlier.describe())


Mean: -0.09082507731206121
Median: -0.4146992783863419
Std dev: 100.01335047331702
Delta: 0.32387420107428067
count    100000.000000
mean         -0.090825
std         100.013350
min        -449.411704
25%         -67.292120
50%          -0.414699
75%          67.636542
max         473.195769
dtype: float64
New mean: -0.06267117182870444
New median: -0.4092886611480314
New std dev: 100.28277012615403
New delta: 0.346617489319327
count    100000.000000
mean         -0.062671
std         100.282770
min        -402.315865
25%         -67.288054
50%          -0.409289
75%          67.640758
max        2365.978844
dtype: float64


In [22]:
# Extension questions.
# 1. Show that roughly 68%, 95% and 99.7% of the values lie within
#    1, 2 and 3 standard deviations of the mean.

# Rebuild the sample from the same seed so this cell starts from the
# original, unmodified values.
g = np.random.default_rng(0)
# Generator.normal's args are (mean, std dev, number of values)
s = Series(g.normal(0, 100, 100_000))

# The statistics are fixed for the whole cell, so compute them once.
center = s.mean()
spread = s.std()


def _within_std_devs(k):
    """Return the values of `s` within k standard deviations of the mean (bounds inclusive)."""
    return s.loc[s.between(center - spread * k, center + spread * k)]


dev_one, dev_two, dev_three = (_within_std_devs(k) for k in (1, 2, 3))

# Share of the sample inside each band, as a percentage with two decimals.
for k, band in enumerate((dev_one, dev_two, dev_three), start=1):
    print(f"Within {k} std dev: {len(band) / len(s) * 100:.2f}")

Within 1 std dev: 68.40
Within 2 std dev: 95.46
Within 3 std dev: 99.71


In [26]:
# 2. Calculate the mean of the numbers below the overall mean.
#    Calculate the mean of the numbers above the overall mean.
#    Is the mean of those two numbers the same as the overall mean?
#
# Note: the plain average of the two group means only matches the overall
# mean when both groups contain the same number of values.

overall_mean = s.mean()

# Strict comparisons: a value exactly equal to the mean is in neither group.
below_mean = s[s < overall_mean]
above_mean = s[s > overall_mean]
lower_half = below_mean.mean()
upper_half = above_mean.mean()

report = (
    ("Upper", upper_half),
    ("Lower", lower_half),
    ("Mean of means", (upper_half + lower_half) / 2),
    ("Original mean", overall_mean),
)
for label, value in report:
    print(f"{label}: {value}")

Upper: 79.92646167808225
Lower: -79.66763213378562
Mean of means: 0.12941477214831565
Original mean: -0.09082507731206121


In [None]:
# 3. What is the mean of the numbers beyond 3 std dev?
# Values this far from the mean are generally considered outliers, so
# their mean can give a sense of how the data might be skewed.

mu = s.mean()
three_sigma = s.std() * 3
lower_bound = mu - three_sigma
upper_bound = mu + three_sigma

# Keep only the values strictly outside the [lower_bound, upper_bound] band.
is_outer = (s < lower_bound) | (s > upper_bound)
outers = s.loc[is_outer]
print(f"Mean out outer values: {outers.mean()}")

Mean out outer values: -11.606040282602287
