# Descriptive Statistics

In [1]:
!pip install scipy



In [2]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [4]:
# Small sample used throughout the first half of the notebook.
x = [8.0, 1, 2.5, 4, 28.0]
# The same sample with a NaN spliced in at index 3 to show missing-value handling.
x_with_nan = x[:3] + [math.nan] + x[3:]

print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [5]:
# NumPy array versions of the two samples (clean, and with the NaN).
y = np.array(x)
y_with_nan = np.array(x_with_nan)

print(y, y_with_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]


In [6]:
# pandas Series versions of the two samples (clean, and with the NaN).
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

print(z, z_with_nan, sep="\n")

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


## Measures of Central Tendency

In [9]:
# Mean

In [10]:
# Pure-Python mean: total of the values divided by how many there are.
mean_ = sum(x) / len(x)
mean_

8.7

In [13]:
mean_ = statistics.mean(x)
mean_

8.7

In [14]:
# A single NaN in the input propagates: statistics.mean() returns nan here.
mean_ = statistics.mean(x_with_nan)
mean_

nan

In [15]:
# The built-in sum() is poisoned by NaN as well, so any mean built on it is nan.
sum(x_with_nan)

nan

In [16]:
mean_ = np.mean(y)
mean_

8.7

In [17]:
y.mean()

8.7

In [18]:
y_with_nan.mean()

nan

In [19]:
# np.nanmean() skips NaN entries, giving the same 8.7 as the NaN-free sample.
np.nanmean(y_with_nan)

8.7

In [20]:
# Weighted Mean

In [21]:
# One weight per element of x; the five weights add up to 1.
w = [0.1, 0.2, 0.3, 0.25, 0.15]

In [22]:
w = np.array(w)

In [23]:
# Weighted mean: sum(w_i * y_i) / sum(w_i).
np.average(y, weights=w)

6.95

In [None]:
# Median

In [24]:
x

[8.0, 1, 2.5, 4, 28.0]

In [28]:
x[:-1]

[8.0, 1, 2.5, 4]

In [30]:
statistics.median(x)

4

In [31]:
x

[8.0, 1, 2.5, 4, 28.0]

In [32]:
# Even number of points: the median is the average of the two middle values
# (2.5 and 4), which is not itself a member of the data.
statistics.median(x[:-1])

3.25

In [33]:
# median_low() picks the smaller of the two middle values instead of averaging.
statistics.median_low(x[:-1])

2.5

In [34]:
# median_high() picks the larger of the two middle values instead of averaging.
statistics.median_high(x[:-1])

4

In [35]:
# Caution: no nan and no error here, but 6.0 is not a meaningful median. NaN
# compares False against every value, so sorting leaves it sitting among the
# ordered numbers and shifts the middle; the median of the five real values is 4.
statistics.median(x_with_nan)

6.0

In [36]:
# Numpy
np.median(y)

4.0

In [37]:
# Mode

In [38]:
# u has one most-common value (2 appears twice); v has a tie
# (12 and 15 each appear three times).
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

In [39]:
statistics.mode(u)

2

In [40]:
# `v` has two equally common values, and on the interpreter this notebook was
# run with statistics.mode() raises StatisticsError for such data. Catch and
# print the error so the demonstration is kept but "Run All" is not interrupted.
# On interpreters where mode() returns a value instead, that value is printed.
try:
    print(statistics.mode(v))
except statistics.StatisticsError as exc:
    print("StatisticsError:", exc)

StatisticsError: no unique mode; found 2 equally common values

In [41]:
u, v = np.array(u), np.array(v)

In [42]:
scipy.stats.mode(u)

ModeResult(mode=array([2]), count=array([2]))

In [43]:
# For the tied sample, only one mode is reported: the smaller value, 12.
scipy.stats.mode(v)

ModeResult(mode=array([12]), count=array([3]))

## Measures of Variability

In [48]:
## Variance

In [45]:
# statistics: sample variance (agrees with the ddof=1 results from NumPy and pandas)
statistics.variance(x)

123.2

In [46]:
# NumPy: sample variance (ddof=1 divides by n - 1). Uses the array `y` like the
# other NumPy cells; at this point it holds the same values as the list `x`.
np.var(y, ddof=1)

123.19999999999999

In [47]:
# Pandas
z.var(ddof=1)

123.19999999999999

In [49]:
## Standard Deviation

In [50]:
# statistics: sample standard deviation (square root of the sample variance)
statistics.stdev(x)

11.099549540409287

In [51]:
# Numpy
np.std(y, ddof=1)

11.099549540409285

In [52]:
# Pandas
z.std(ddof=1)

11.099549540409285

In [53]:
## Skewness

In [54]:
x

[8.0, 1, 2.5, 4, 28.0]

In [55]:
y

array([ 8. ,  1. ,  2.5,  4. , 28. ])

In [56]:
z

0     8.0
1     1.0
2     2.5
3     4.0
4    28.0
dtype: float64

In [57]:
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [58]:
# As with the mean, a NaN in the input makes the result nan.
scipy.stats.skew(y_with_nan, bias=False)

nan

In [59]:
z.skew()

1.9470432273905924

In [60]:
# pandas skips the NaN here, so the result matches z.skew() on the clean sample.
z_with_nan.skew()

1.9470432273905924

In [61]:
# Percentiles

In [62]:
# New, larger sample for the percentile examples. This rebinds `x`, so any
# earlier cell re-run after this one will use this list, not the original five values.
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 12.8, 21.0, 25.0, 41.0]

In [64]:
# statistics.quantiles() does not exist on the interpreter this notebook was
# run with (it was added in Python 3.8). Report the problem instead of raising
# so that "Run All" continues on to the NumPy and pandas cells.
try:
    print(statistics.quantiles(x, n=2))
except AttributeError as exc:
    print("AttributeError:", exc)

AttributeError: module 'statistics' has no attribute 'quantiles'

In [65]:
y = np.array(x)

In [66]:
np.percentile(y,5)

-3.245

In [67]:
np.percentile(y, 95)

33.79999999999998

In [68]:
np.percentile(y, [25, 50, 75])

array([ 0.575, 10.4  , 18.95 ])

In [69]:
np.percentile(y, 1)

-4.649

In [70]:
np.median(y)

10.4

In [71]:
z = pd.Series(y)

In [72]:
z.quantile(0.05)

-3.245

In [73]:
z.quantile(0.95)

33.79999999999998

In [74]:
z.quantile([0.25, 0.5, 0.75])

0.25     0.575
0.50    10.400
0.75    18.950
dtype: float64

In [75]:
# A negative quantile is invalid and pandas rejects it with ValueError. Catch
# and print the error so the demonstration stays but "Run All" is not stopped.
try:
    print(z.quantile(-0.95))
except ValueError as exc:
    print("ValueError:", exc)

ValueError: percentiles should all be in the interval [0, 1]. Try -0.0095 instead.

In [76]:
# Range

In [77]:
np.ptp(y)

46.0

In [78]:
# Passing the Series itself makes NumPy delegate to the Series' own ptp method,
# which is presumably what emitted the stray warning in the earlier recorded
# output (Series.ptp was deprecated in pandas). Compute the range on the
# underlying NumPy array instead.
np.ptp(z.to_numpy())

46.0

In [79]:
np.ptp(x)

46.0

In [80]:
# Covariance and Correlation

In [81]:
# The 21 integers from -10 to 10 inclusive, as a plain list.
x = [*range(-10, 11)]

In [82]:
x

[-10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [83]:
# Second variable, paired element-wise with x (21 values each). Note that `y`
# was a NumPy array in earlier cells and is a plain list from here on.
y = [0,2,2,2,2,3,3,6,7,4,7,6,6,9,4,5,5,10,11,12,14]

In [84]:
x_, y_ = np.array(x), np.array(y)

In [85]:
x__, y__ = pd.Series(x_), pd.Series(y_)

In [86]:
# Covariance matrix: the diagonal holds the sample variances of x_ and y_,
# the off-diagonal entries hold their covariance (19.95).
np.cov(x_, y_)

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [87]:
x__.cov(y__)

19.95

In [88]:
np.corrcoef(x_,y_)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [89]:
np.corrcoef(x,y)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [90]:
np.corrcoef(x__,y__)

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

In [91]:
# Returns a pair: the correlation coefficient r and the p-value.
scipy.stats.pearsonr(x_,y_)

(0.8619500056316061, 5.122760847201132e-07)

In [92]:
x__.corr(y__)

0.8619500056316061