In [1]:
import math
import statistics

import numpy as np
import pandas as pd
import scipy.stats

In [2]:
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = [8.0, 1, 2.5, math.nan, 4, 28.]

x_arr = np.array(x)
x_arr_with_nan = np.array(x_with_nan)

x_series = pd.Series(x)
x_series_with_nan = pd.Series(x_with_nan)

print(x)
print(x_with_nan)

print(x_arr)
print(x_arr_with_nan)

print(x_series)
print(x_series_with_nan)

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]
[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


In [3]:
native_mean = sum(x) / len(x)
stats_mean = statistics.mean(x)
np_mean = np.mean(x)
pd_mean = x_series.mean()

print("native mean:", native_mean)
print("stats mean:", stats_mean)
print("numpy mean:", np_mean)
print("pandas mean:", pd_mean)
print("np array mean:", x_arr.mean())

native mean: 8.7
stats mean: 8.7
numpy mean: 8.7
pandas mean: 8.7
np array mean: 8.7


In [4]:
print("mean of x with nan:", sum(x_with_nan) / len(x_with_nan))
print("stats mean of x with nan:", statistics.mean(x_with_nan))
print("numpy mean of x with nan:", np.mean(x_with_nan))

np_mean_with_nan = np.nanmean(x_with_nan)
pd_mean_with_nan = x_series_with_nan.mean()

print("np array mean of x with nan:", x_arr.mean())
print("numpy mean nan:", np_mean_with_nan)
print(
    "pandas mean nan:", pd_mean_with_nan,
    "pandas mean skipna=False:", x_series_with_nan.mean(skipna=False)
)

mean of x with nan: nan
stats mean of x with nan: nan
numpy mean of x with nan: nan
np array mean of x with nan: 8.7
numpy mean nan: 8.7
pandas mean nan: 8.7 pandas mean skipna=False: nan


In [5]:
bool(np.nan), bool("")

(True, False)

In [6]:
x_new = [2, 4, 8]
weight = [.2, .5, .3]

native_weighted_mean = sum(w * x_item for w, x_item in zip(x_new, weight)) / sum(weight)

print("native weighted mean:", native_weighted_mean)
print(
    sum([2, 2, 4, 4, 4, 4, 4, 8, 8, 8]) / len([2, 2, 4, 4, 4, 4, 4, 8, 8, 8])
)

native weighted mean: 4.8
4.8


**ekuivalen:**
$ (2 \cdot 2 / 10 + 4 \cdot 5 / 10) / (2/10 + 5/10)$

In [7]:
x_new = [8.0, 1, 2.5, 4, 28.0]
weights = [.1, .2, .3, .25, .15]
x_new_arr = np.array(x_new)
weights_arr = np.array(weights)

sum_of_weighted_x = sum(weight * item for weight, item in zip(weights, x_new))
sum_weight = sum(weights)

print("weighted mean:", sum_of_weighted_x / sum_weight)

np_weighted_mean = np.average(x_new, weights=weights)
arr_weighted_mean = (x_new_arr * weights_arr).sum() / weights_arr.sum()

print("numpy weighted mean:", np_weighted_mean)
print("numpy array weighted mean:", arr_weighted_mean)

weighted mean: 6.95
numpy weighted mean: 6.95
numpy array weighted mean: 6.95


In [8]:
native_hmean = len(x) / sum(1/value for value in x)
stats_hmean = statistics.harmonic_mean(x)
scipy_hmean = scipy.stats.hmean(x)
# arr_hmean = x_arr.shape[0] / (1 / x_arr).sum()
arr_hmean = 1 / (1 / x_arr).mean()

print("native harmonic mean:", native_hmean)
print("stats harmonic mean:", stats_hmean)
print("scipy harmnonic mean:", scipy_hmean)
print("array (workaround) harmnonic mean:", arr_hmean)

native harmonic mean: 2.7613412228796843
stats harmonic mean: 2.7613412228796843
scipy harmnonic mean: 2.7613412228796843
array (workaround) harmnonic mean: 2.7613412228796843


In [9]:
print("x:", x)
multiplication = 1
for value in x:
    multiplication *= value
gmean = multiplication ** (1/len(x))

print("native gmean:", gmean)

x: [8.0, 1, 2.5, 4, 28.0]
native gmean: 4.677885674856041


In [10]:
scipy_gmean = scipy.stats.gmean(x)
stats_gmean = statistics.geometric_mean(x)   # python version >= 3.8

print("scipy gmean:", scipy_gmean)
print("stats gmean:", stats_gmean)

scipy gmean: 4.67788567485604
stats gmean: 4.67788567485604


In [11]:
print("x:", x)

n = len(x)
sorted_x = sorted(x)
if n % 2:    #  odd sequence
    median = sorted_x[int(.5 * n-1)]
else:    # even sequence
    med_index = int(.5 * n)
    median = (sorted_x[med_index-1] + sorted_x[med_index]) / 2

print("native median:", median)

x: [8.0, 1, 2.5, 4, 28.0]
native median: 2.5


In [12]:
x_new = [19, 3, 2, 2.0, .3, 5, 1, 6, 5, 1]
sorted_x_new = sorted(x_new)
print("x:", x_new, "sorted x:", sorted_x_new)

n_new = len(x_new)
if n_new % 2:    #  odd sequence
    median = sorted_x_new[int(.5 * n_new-1)]
else:    # even sequence
    med_index = int(.5 * n_new)
    median = (sorted_x_new[med_index-1] + sorted_x_new[med_index]) / 2

print("native median:", median)

x: [19, 3, 2, 2.0, 0.3, 5, 1, 6, 5, 1] sorted x: [0.3, 1, 1, 2, 2.0, 3, 5, 5, 6, 19]
native median: 2.5


In [13]:
np_median = np.median(x_new)
stats_median = statistics.median(x_new)
stats_median_low = statistics.median_low(x_new)
stats_median_high = statistics.median_high(x_new)

print("numpy median:", np_median)
print("stats median:", stats_median)
print("stats median low:", stats_median_low)
print("stats median high:", stats_median_high)

numpy median: 2.5
stats median: 2.5
stats median low: 2.0
stats median high: 3


In [14]:
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]
print("u:", u, "v:", v)

mode_u = max([(u.count(value), value) for value in u])[1]
mode_v = max([(v.count(value), value) for value in v])[1]
print([(v.count(value), value) for value in v])

print("native mode u:", mode_u)
print("native mode v:", mode_v)

u: [2, 3, 2, 8, 12] v: [12, 15, 12, 15, 21, 15, 12]
[(3, 12), (3, 15), (3, 12), (3, 15), (1, 21), (3, 15), (3, 12)]
native mode u: 2
native mode v: 15


In [15]:
u_series = pd.Series(u)
v_series = pd.Series(v)

stats_mode = statistics.mode(v)
scipy_mode = scipy.stats.mode(v)
series_mode = v_series.mode()

print("stats mode:", stats_mode)
print("scipy mode:", scipy_mode, scipy_mode.mode[0], scipy_mode.count[0])
print("series mode:", series_mode)

stats mode: 12
scipy mode: ModeResult(mode=array([12]), count=array([3])) 12 3
series mode: 0    12
1    15
dtype: int64


In [16]:
print("x:", x)
mean_x = sum(x) / len(x)
variance = sum((value - mean_x)**2 for value in x) / (len(x)-1)

print("native variance:", variance)

x: [8.0, 1, 2.5, 4, 28.0]
native variance: 123.19999999999999


In [17]:
stats_var = statistics.variance(x)
np_var = np.var(x, ddof=1)
series_var = x_series.var(ddof=1)

print("stats var:", stats_var)
print("numpy var:", np_var)
print("series var:", series_var)
print("scipy sample variance:", scipy.stats.tvar(x))

stats var: 123.2
numpy var: 123.19999999999999
series var: 123.19999999999999
scipy sample variance: 123.19999999999999


In [18]:
print("x:", x, "mean x:", mean_x)
native_std = variance ** .5
np_std = np.std(x, ddof=1)
series_std = x_series.std(ddof=1)

print("native std:", native_std)
print("numpy std:", np_std)
print("series std:", series_std)

x: [8.0, 1, 2.5, 4, 28.0] mean x: 8.7
native std: 11.099549540409285
numpy std: 11.099549540409285
series std: 11.099549540409285


In [19]:
scipy_skew = scipy.stats.skew(x, bias=False)
series_skew = x_series.skew()

print("scipy skew:", scipy_skew)
print("series skew:", series_skew)

scipy skew: 1.9470432273905927
series skew: 1.9470432273905924


In [20]:
x = [-5., -1.1, .1, 2., 8., 12.8, 21., 25.8, 41.]
print(x)

print(statistics.quantiles(x, method="inclusive"))
print(statistics.quantiles(x, method="exclusive"))
print(np.percentile(x, [0, 25, 50, 75, 100]))

[-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]
[0.1, 8.0, 21.0]
[-0.5, 8.0, 23.4]
[-5.   0.1  8.  21.  41. ]


In [21]:
print(np.ptp(x))
print(max(x) - min(x))

46.0
46.0


In [22]:
print(np.percentile(x, 75) - np.percentile(x, 25))
print(statistics.quantiles(x, method="inclusive")[-1] - statistics.quantiles(x, method="inclusive")[0])

20.9
20.9


In [23]:
x = list(range(-10, 11))
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]
x_arr = np.array(x)
y_arr = np.array(y)
x_series = pd.Series(x)
y_series = pd.Series(y)

In [24]:
n = len(x)
mean_x, mean_y = sum(x) / n, sum(y) / n

cov_xy = sum((val_x - mean_x)*(val_y - mean_y) for val_x, val_y in zip(x, y)) / (n - 1)
print(cov_xy)

19.95


In [25]:
np_cov = np.cov(x, y)
print(np_cov)

[[38.5        19.95      ]
 [19.95       13.91428571]]


In [26]:
var_x, var_y = np.var(x, ddof=1), np.var(y, ddof=1)
print(var_x)
print(var_y)

38.5
13.914285714285711


In [27]:
corr_xy = np_cov[0, 1] / (var_x**.5 * var_y**.5)
print(corr_xy)

0.861950005631606


In [28]:
print("x:", x)
print("y:", y)

x: [-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y: [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]


In [29]:
z = list(range(20, -1, -1))
print("z:", z, len(z))
cov_xz = np.cov(x, z)
corr_xz = cov_xz[0, 1] / (cov_xz[0, 0]**.5 * cov_xz[1, 1]**.5)

print(corr_xz)

z: [20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] 21
-1.0
