# Descriptive Statistics

In [1]:
import math
import statistics

import numpy as np
import pandas as pd
import scipy.stats

In [2]:
# Sample data: a clean list, plus the same values with one NaN spliced in
# after the third element to stand in for a missing observation.
x = [8., 1, 2.5, 4, 28.]
x_with_nan = x[:3] + [math.nan] + x[3:]

# The same data as NumPy arrays and as pandas Series.
np_x = np.array(x)
np_x_with_nan = np.array(x_with_nan)
pd_x = pd.Series(x)
pd_x_with_nan = pd.Series(x_with_nan)

print("x:", x)
print("x with nan:", x_with_nan)

print("array x:", np_x)
print("array x with nan:", np_x_with_nan)

print("series x:", pd_x, sep="\n")
print("series x with nan:", pd_x_with_nan, sep="\n")

x: [8.0, 1, 2.5, 4, 28.0]
x with nan: [8.0, 1, 2.5, nan, 4, 28.0]
array x: [ 8.   1.   2.5  4.  28. ]
array x with nan: [ 8.   1.   2.5  nan  4.  28. ]
series x:
0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
series x with nan:
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Arithmetic Mean

In [3]:
# native python -> no library
# Arithmetic mean: the sum of the values divided by how many there are.
mean_x = sum(x) / len(x)
print("mean x:", mean_x)

mean x: 8.7


In [4]:
# native python
# `x_with_nan` contains math.nan and the built-in sum() does not skip it,
# so the NaN propagates and the printed mean is nan.
mean_x_with_nan = sum(x_with_nan) / len(x_with_nan)
print("mean x with nan:", mean_x_with_nan)

mean x with nan: nan


In [5]:
# use `statistics` library
# statistics.mean gives the same value as the manual sum/len computation.
stats_mean_x = statistics.mean(x)
print("mean x with statistics:", stats_mean_x)

mean x with statistics: 8.7


In [6]:
# use `statistics` library
# As with the manual computation, a NaN in the data yields nan here.
stats_mean_x_with_nan = statistics.mean(x_with_nan)
print("mean x with nan with statistics:", stats_mean_x_with_nan)

mean x with nan with statistics: nan


In [7]:
# use `numpy` library
# np.mean accepts a plain list; ndarray.mean is the equivalent method form.
np_mean_x = np.mean(x)
np_mean_np_x = np_x.mean()
# np.mean propagates NaN, whereas np.nanmean leaves NaN entries out.
np_mean_x_with_nan = np.mean(x_with_nan)
np_nanmean_x_with_nan = np.nanmean(x_with_nan)

print("mean x with numpy:", np_mean_x)
print("mean array x with numpy:", np_mean_np_x)
print("mean x with nan with numpy:", np_mean_x_with_nan)
print("nanmean x with nan with numpy:", np_nanmean_x_with_nan)

mean x with numpy: 8.7
mean array x with numpy: 8.7
mean x with nan with numpy: nan
nanmean x with nan with numpy: 8.7


In [8]:
# use `Series.mean` from pandas
# Series.mean skips NaN by default, so both Series report the same mean.
pd_mean_x = pd_x.mean()
pd_mean_x_with_nan = pd_x_with_nan.mean()

print("mean x with series:", pd_mean_x)
print("mean x with nan with series:", pd_mean_x_with_nan)

mean x with series: 8.7
mean x with nan with series: 8.7


In [9]:
# skipna=False turns off the default NaN skipping, so the result is nan
pd_x_with_nan.mean(skipna=False)

nan

## Weighted Average

In [10]:
# recall the sample data before weighting it
print(x)

[8.0, 1, 2.5, 4, 28.0]


In [11]:
# one weight per element of x; the weights add up to 1
weights = [.1, .2, .3, .25, .15]
print("weights:", weights, "sum:", sum(weights))

weights: [0.1, 0.2, 0.3, 0.25, 0.15] sum: 1.0


In [12]:
# native python
# Weighted mean = sum(w_i * x_i) / sum(w_i), pairing each weight with its value.
weighted_mean_x = sum(w * value for w, value in zip(weights, x)) / sum(weights)
print("weighted mean x:", weighted_mean_x)

weighted mean x: 6.95


In [13]:
# recall the data that contains a missing value
print(x_with_nan)

[8.0, 1, 2.5, nan, 4, 28.0]


In [14]:
# six weights, one per entry of x_with_nan (the NaN slot included)
weights_with_nan = [.1, .2, .3, .1, .15, .15]

In [15]:
# use `numpy`
np_weighted_mean_x = np.average(x, weights=weights)
# np.average does not skip NaN: a NaN in the data makes the result nan
np_weighted_mean_x_with_nan = np.average(x_with_nan, weights=weights_with_nan)

print("weighted mean x with numpy:", np_weighted_mean_x)
print("weighted mean x with nan with numpy:", np_weighted_mean_x_with_nan)

weighted mean x with numpy: 6.95
weighted mean x with nan with numpy: nan


## Harmonic Mean

In [16]:
# recall the sample data
print(x)

[8.0, 1, 2.5, 4, 28.0]


In [17]:
# reciprocal of data
# (1 / value for every element; these are the terms the harmonic mean sums)
[1/item for item in x]

[0.125, 1.0, 0.4, 0.25, 0.03571428571428571]

In [18]:
# native python
# Harmonic mean: number of values divided by the sum of their reciprocals.
harmonic_mean_x = len(x) / sum(1/item for item in x)
print("harmonic mean x:", harmonic_mean_x)

harmonic mean x: 2.7613412228796843


In [19]:
# use `statistics` library
# statistics.harmonic_mean matches the manual n / sum(1/x) result.
stats_harmonic_mean_x = statistics.harmonic_mean(x)
print("harmonic mean x with statistics:", stats_harmonic_mean_x)

harmonic mean x with statistics: 2.7613412228796843


In [20]:
# use `scipy` library
# scipy.stats.hmean gives the same harmonic mean as the two approaches above.
scipy_harmonic_mean_x = scipy.stats.hmean(x)
print("harmonic mean x with scipy:", scipy_harmonic_mean_x)

harmonic mean x with scipy: 2.7613412228796843


## Geometric Mean

In [21]:
# recall the sample data
print(x)

[8.0, 1, 2.5, 4, 28.0]


In [22]:
# native python
# Geometric mean: the n-th root of the product of all n values.
geo_mean_x = math.prod(x) ** (1/len(x))
print("geometric mean x:", geo_mean_x)

geometric mean x: 4.677885674856041


In [23]:
# use `scipy` library
# Agrees with the manual product/root result up to floating-point rounding.
scipy_geo_mean_x = scipy.stats.gmean(x)
print("geometric mean x with scipy:", scipy_geo_mean_x)

geometric mean x with scipy: 4.67788567485604


## Median

In [24]:
# recall the sample data
print(x)

[8.0, 1, 2.5, 4, 28.0]


In [25]:
# the median is read off the data in ascending order
sorted(x)

[1, 2.5, 4, 8.0, 28.0]

In [26]:
# number of observations; whether it is odd or even decides how the median is taken
len(x)

5

In [27]:
# native python
def native_median(values):
    """Return the median of `values` using only built-ins.

    The data is sorted first. For an odd number of items the middle element
    is returned unchanged; for an even number the two middle elements are
    averaged.

    Raises:
        ValueError: if `values` is empty.
    """
    ordered = sorted(values)
    count = len(ordered)
    if count == 0:
        raise ValueError("median requires at least one data point")
    # Integer division gives the middle index directly. The earlier
    # round(.5 * n - 1) rounded halves to the nearest even number, which
    # selected the wrong element for n = 3, 7, 11, ...
    mid = count // 2
    if count % 2:
        # odd count: single middle element
        return ordered[mid]
    # even count: average of the two middle elements
    return .5 * (ordered[mid - 1] + ordered[mid])


x = [8.0, 1, 2.5, 4, 28.0]
print(x, sorted(x))

n = len(x)
median_x = native_median(x)

print("median x:", median_x)

[8.0, 1, 2.5, 4, 28.0] [1, 2.5, 4, 8.0, 28.0]
median x: 4


In [28]:
# use `statistics` library
# Odd number of values: the median is the middle element.
x = [8.0, 1, 2.5, 4, 28.0]
print("x:", x)
stats_median_x = statistics.median(x)
print("median x with statistics:", stats_median_x)

# Even number of values (last element dropped): median_low / median_high
# return the two middle elements and median returns their average.
x = x[:-1]
print("x:", x)
stats_median_left_x = statistics.median_low(x)
stats_median_right_x = statistics.median_high(x)
stats_median_x = statistics.median(x)
print("median left x:", stats_median_left_x, "median right:", stats_median_right_x)
print("median x with statistics:", stats_median_x)

x: [8.0, 1, 2.5, 4, 28.0]
median x with statistics: 4
x: [8.0, 1, 2.5, 4]
median left x: 2.5 median right: 4
median x with statistics: 3.25


In [29]:
# Odd-length sample, rebuilt as list, array and Series.
x = [8.0, 1, 2.5, 4, 28.0]
np_x, pd_x = np.array(x), pd.Series(x)

# with numpy array
np_median_x = np.median(x)
print("median x with numpy:", np_median_x)

# with pandas series
pd_median_x = pd_x.median()
print("median x with series:", pd_median_x)

median x with numpy: 4.0
median x with series: 4.0


In [30]:
# Even-length sample, rebuilt as list, array and Series.
x = [8.0, 1, 2.5, 4]
np_x, pd_x = np.array(x), pd.Series(x)

# with numpy array
np_median_x = np.median(x)
print("median x with numpy:", np_median_x)

# with pandas series
pd_median_x = pd_x.median()
print("median x with series:", pd_median_x)

median x with numpy: 3.25
median x with series: 3.25


## Mode

In [31]:
# u has a single most frequent value (2); in v, 12 and 15 each occur three times
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

print("u:", u)
print("v:", v)

u: [2, 3, 2, 8, 12]
v: [12, 15, 12, 15, 21, 15, 12]


In [32]:
# how many 2s in u? (list.count returns the number of occurrences of a value)
u.count(2)

2

In [33]:
# how many 15s in v? (the value 12 occurs the same number of times)
v.count(15)

3

In [34]:
# native python
# Rank each distinct value by (frequency, value) and keep the top one; when
# frequencies tie, the larger value therefore wins.
mode_u = max(set(u), key=lambda data: (u.count(data), data))
mode_v = max(set(v), key=lambda data: (v.count(data), data))

print("mode u:", mode_u)
print("mode v:", mode_v)

mode u: 2
mode v: 15


In [35]:
# use `statistics` library
# v has two equally frequent values (12 and 15); statistics.mode reports 12 here.
stats_mode_u = statistics.mode(u)
stats_mode_v = statistics.mode(v)

print("mode u with statistics:", stats_mode_u)
print("mode v with statistics:", stats_mode_v)

mode u with statistics: 2
mode v with statistics: 12


In [36]:
# pandas series
# Series.mode returns every most-frequent value, so tied data yields several rows.
pd_u = pd.Series(u)
pd_v = pd.Series(v)

print(pd_u.mode())
print(pd_v.mode())

0    2
dtype: int64
0    12
1    15
dtype: int64


## Variance

In [37]:
# reset x and np_x to the five-value sample (the median section left x with four values)
x = [8.0, 1, 2.5, 4, 28.0]
np_x = np.array(x)
print(x)

[8.0, 1, 2.5, 4, 28.0]


In [38]:
# native python
# Sample variance: squared deviations from the mean, summed and divided by n - 1.
mean_x = sum(x) / len(x)

var_x = sum((data - mean_x)**2 for data in x) / (len(x)-1)
print("variance x:", var_x)

variance x: 123.19999999999999


In [39]:
# statistics
stats_var_x = statistics.variance(x)
# numpy ddof=1 for sample data (function form on the list, method form on the array)
np_var_x = np.var(x, ddof=1)
np_var_arr_x = np_x.var(ddof=1)

print("variance x with statistics:", stats_var_x)
print("variance x with numpy:", np_var_x)
print("variance array x with numpy:", np_var_arr_x)

variance x with statistics: 123.2
variance x with numpy: 123.19999999999999
variance array x with numpy: 123.19999999999999


## Standard Deviation

In [40]:
# native python
# The standard deviation is the square root of the sample variance computed above.
std_x = var_x ** .5
print("std x:", std_x)

std x: 11.099549540409285


In [41]:
# statistics
stats_std_x = statistics.stdev(x)
# numpy (ddof=1 for sample data)
np_std_x = np.std(x, ddof=1)
np_std_arr_x = np_x.std(ddof=1)
# pandas series, rebuilt from the current five-value x
pd_x = pd.Series(x)
pd_std_x = pd_x.std()

print("std x with statistics:", stats_std_x)
print("std x with numpy:", np_std_x)
print("std array x with numpy:", np_std_arr_x)
print("std x with series:", pd_std_x)

std x with statistics: 11.099549540409287
std x with numpy: 11.099549540409285
std array x with numpy: 11.099549540409285
std x with series: 11.099549540409285


## Skewness

In [42]:
# scipy (bias=False requests the bias-corrected sample skewness)
scipy_skew_x = scipy.stats.skew(x, bias=False)
# pandas series
pd_skew_x = pd_x.skew()

print("skewness x with scipy:", scipy_skew_x)
print("skewness x with series:", pd_skew_x)

skewness x with scipy: 1.9470432273905927
skewness x with series: 1.9470432273905924


## Quantiles, Percentiles, Quartiles

In [43]:
# recall the data as list and array, plus its sorted order
print(x, np_x, sorted(x))

[8.0, 1, 2.5, 4, 28.0] [ 8.   1.   2.5  4.  28. ] [1, 2.5, 4, 8.0, 28.0]


In [44]:
# statistics: cut points splitting the data into 4 groups (quartiles) and
# 100 groups (percentiles)
for n_groups in (4, 100):
    print(statistics.quantiles(x, n=n_groups, method="inclusive"))

# numpy: a single quantile, then several at once
for q in (.5, [.25, .5, .75]):
    print(np.quantile(x, q))

[2.5, 4.0, 8.0]
[1.06, 1.12, 1.18, 1.24, 1.3, 1.36, 1.42, 1.48, 1.54, 1.6, 1.66, 1.72, 1.78, 1.84, 1.9, 1.96, 2.02, 2.08, 2.14, 2.2, 2.26, 2.32, 2.38, 2.44, 2.5, 2.56, 2.62, 2.68, 2.74, 2.8, 2.86, 2.92, 2.98, 3.04, 3.1, 3.16, 3.22, 3.28, 3.34, 3.4, 3.46, 3.52, 3.58, 3.64, 3.7, 3.76, 3.82, 3.88, 3.94, 4.0, 4.16, 4.32, 4.48, 4.64, 4.8, 4.96, 5.12, 5.28, 5.44, 5.6, 5.76, 5.92, 6.08, 6.24, 6.4, 6.56, 6.72, 6.88, 7.04, 7.2, 7.36, 7.52, 7.68, 7.84, 8.0, 8.8, 9.6, 10.4, 11.2, 12.0, 12.8, 13.6, 14.4, 15.2, 16.0, 16.8, 17.6, 18.4, 19.2, 20.0, 20.8, 21.6, 22.4, 23.2, 24.0, 24.8, 25.6, 26.4, 27.2]
4.0
[2.5 4.  8. ]


## Range

In [45]:
# native
# Range = largest value minus smallest value.
min_data = min(x)
max_data = max(x)
print("range x:", max_data - min_data)

range x: 27.0


In [46]:
# numpy
# np.ptp ("peak to peak") gives the same 27.0 as max - min above
np.ptp(x)

27.0

## Inter-quartile Range (IQR)

In [47]:
# series pandas
# Series.quantile with a list returns a Series indexed by the requested
# quantiles, so Q1 and Q3 are looked up by label.
quartiles = pd_x.quantile([.25, .75])
q1, q3 = quartiles.loc[.25], quartiles.loc[.75]
IQR = q3 - q1
print("Q1:", q1)
print("Q3:", q3)
print("IQR:", IQR)

Q1: 2.5
Q3: 8.0
IQR: 5.5


In [48]:
# show the Series the quartiles were computed from
pd_x

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [49]:
# numpy
# np.quantile returns the requested quantiles in order: [Q1, Q3].
np_quartiles = np.quantile(x, [.25, .75])
q1, q3 = np_quartiles
IQR = q3 - q1
print("Q1:", q1)
print("Q3:", q3)
print("IQR:", IQR)

Q1: 2.5
Q3: 8.0
IQR: 5.5


## Covariance

In [50]:
# Paired data for covariance / correlation: x runs over the integers -10..10
# and y holds one observation per x value.
x = [*range(-10, 11)]
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

print(len(x), len(y))

21 21


In [51]:
# NumPy array and pandas Series versions of the paired data.
np_x = np.array(x)
np_y = np.array(y)
pd_x = pd.Series(x)
pd_y = pd.Series(y)

In [52]:
# native python
# Sample covariance: products of the mean-centred pairs, summed and divided by n - 1.
n = len(x)
mean_x = sum(x) / n
mean_y = sum(y) / len(y)
cov_xy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y)) / (n-1)

print("covariance x-y:", cov_xy)

covariance x-y: 19.95


In [53]:
# numpy
# np.cov returns the 2x2 covariance matrix; its off-diagonal entries are cov(x, y).
np_cov_xy = np.cov(x, y, ddof=1)
# pandas
# Series.cov returns the single covariance value.
pd_cov_xy = pd_x.cov(pd_y)

print(np_cov_xy)
print(pd_cov_xy)

[[38.5        19.95      ]
 [19.95       13.91428571]]
19.95


## Correlation

In [54]:
# covariance matrix from the previous section; its diagonal is reused below
np_cov_xy

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [55]:
# mostly native python: the variances are read from numpy's covariance matrix
# The diagonal of that matrix holds var(x) and var(y); their square roots are
# the standard deviations that scale the covariance into the correlation.
cov_x = np_cov_xy[0, 0]
cov_y = np_cov_xy[1, 1]
std_x = cov_x**.5
std_y = cov_y**.5

corr_xy = cov_xy / (std_x*std_y)
corr_xy

0.861950005631606

In [56]:
# numpy: 2x2 correlation matrix, with r(x, y) off the diagonal
np_corr_xy = np.corrcoef(x,  y)
# scipy: correlation coefficient together with its p-value
r, p = scipy.stats.pearsonr(x, y)

print(r, p)
print(np_corr_xy)

0.8619500056316061 5.122760847201135e-07
[[1.         0.86195001]
 [0.86195001 1.        ]]
