In [1]:
import math
import statistics
import numpy as np
import scipy.stats
import pandas as pd

In [2]:
# Base sample, plus a copy of it with a NaN inserted in front of the value 4
x = [8.0, 1, 2.5, 4, 28.0]
x_with_nan = x[:3] + [math.nan] + x[3:]

print(x, x_with_nan, sep="\n")

[8.0, 1, 2.5, 4, 28.0]
[8.0, 1, 2.5, nan, 4, 28.0]


In [4]:
# NumPy arrays built from the two lists
y = np.array(x)
y_with_nan = np.array(x_with_nan)

# pandas Series built from the same lists
z = pd.Series(x)
z_with_nan = pd.Series(x_with_nan)

print(y, y_with_nan, z_with_nan, sep="\n")

[ 8.   1.   2.5  4.  28. ]
[ 8.   1.   2.5  nan  4.  28. ]
0     8.0
1     1.0
2     2.5
3     NaN
4     4.0
5    28.0
dtype: float64


### Mean

In [5]:
# manual: arithmetic mean as the sum of the values divided by their count
mean_ = sum(x) / len(x)
mean_

8.7

In [7]:
# using the statistics package from the standard library
mean_ = statistics.mean(x)
mean_

8.7

In [8]:
# a NaN in the input propagates: the result is nan rather than an error
mean_ = statistics.mean(x_with_nan)
mean_

nan

In [9]:
# using numpy: module-level function form
mean_ = np.mean(y)
mean_

8.7

In [10]:
# same computation through the ndarray method
mean_ = y.mean()
mean_

8.7

In [11]:
# nanmean skips NaN entries, so the result matches the mean of the NaN-free data
np.nanmean(y_with_nan)

8.7

### Weighted Mean

In [13]:
x = [8.0, 1, 2.5, 4, 28.0]
w = [0.1, 0.2, 0.3, 0.25, 0.15]

# Manual weighted mean: sum(w_i * x_i) / sum(w_i).
# Variant 1: index-based iteration over the two parallel lists.
wmean = sum(w[i] * x[i] for i in range(len(x))) / sum(w)
print(wmean)

# Variant 2: pairwise iteration with zip. The values are paired from the
# local list x so that this cell is self-contained and runs on a fresh kernel.
wmean = sum(x_ * w_ for (x_, w_) in zip(x, w)) / sum(w)
print(wmean)

6.95
6.95


In [16]:
# Convert to NumPy / pandas containers; note that `w` is rebound from a list to an array
y, z, w = np.array(x), pd.Series(x), np.array(w)

# np.average divides the weighted sum by the sum of the weights
wmean = np.average(y, weights=w)
print(wmean)

# a pandas Series is accepted as input as well
wmean = np.average(z, weights=w)
print(wmean)

6.95
6.95


### Harmonic Mean

In [17]:
# manual: number of items divided by the sum of their reciprocals
hmean = len(x) / sum(1 /  item for item in x)
hmean

2.7613412228796843

In [18]:
# standard-library harmonic mean; gives the same value as the manual formula
hmean = statistics.harmonic_mean(x)
hmean

2.7613412228796843

In [19]:
# SciPy version, applied to the NumPy array
scipy.stats.hmean(y)

2.7613412228796843

### Geometric Mean

In [20]:
# Manual geometric mean: the n-th root of the product of the n values.
# math.prod performs the running multiplication, so no loop variable is
# left behind in the notebook namespace.
gmean = math.prod(x) ** (1 / len(x))
gmean

4.677885674856041

In [21]:
# SciPy version; can differ from the manual result in the last digit (floating-point rounding)
scipy.stats.gmean(y)

4.67788567485604

### Median

In [22]:
# Manual median: the middle element of the sorted data when n is odd,
# otherwise the average of the two middle elements.
n = len(x)

if n % 2 == 0:
    x_ord, index = sorted(x), n // 2
    median_ = 0.5 * (x_ord[index - 1] + x_ord[index])
else:
    median_ = sorted(x)[n // 2]

median_

4

In [24]:
# x[:-1] has an even number of items; median_low returns the smaller of the two middle values
statistics.median_low(x[:-1])

2.5

In [25]:
# x[:-1] has an even number of items; median_high returns the larger of the two middle values
statistics.median_high(x[:-1])

4

In [26]:
# numpy median of the full array
median_ = np.median(y)
median_

4.0

### Mode

In [27]:
# u has a single most frequent value (2); in v both 12 and 15 occur three times
u = [2, 3, 2, 8, 12]
v = [12, 15, 12, 15, 21, 15, 12]

# statistics.mode returns a single most common value
mode_ = statistics.mode(u)
mode_

2

In [29]:
# rebind u and v as NumPy arrays for SciPy
u, v = np.array(u), np.array(v)

# scipy.stats.mode returns a ModeResult carrying the mode and its count
mode_ = scipy.stats.mode(u)
mode_

ModeResult(mode=array([2]), count=array([2]))

In [30]:
# rebind as pandas Series; note that `w` is reused here for a small Series containing NaN
u, v, w = pd.Series(u), pd.Series(v), pd.Series([2,2,math.nan])

# Series.mode returns a Series, so the tie in v yields two rows;
# the NaN in w is not reported as a mode
print(u.mode())
print(v.mode())
print(w.mode())

0    2
dtype: int32
0    12
1    15
dtype: int32
0    2.0
dtype: float64


# Measures of Variability

### Variance

In [31]:
# sample variance (n - 1 in the denominator)
var_ = statistics.variance(x)
var_

123.2

In [32]:
# ddof=1 selects the sample variance; numpy's default (ddof=0) is the population variance
var_ = np.var(y, ddof=1)
var_

123.19999999999999

In [33]:
# same computation through the ndarray method
var_ = y.var(ddof=1)
var_

123.19999999999999

### Standard Deviation

In [34]:
# sample standard deviation (square root of the sample variance)
std_ = statistics.stdev(x)
std_

11.099549540409287

In [35]:
# ddof=1 gives the sample standard deviation, matching statistics.stdev
np.std(y, ddof=1)

11.099549540409285

In [36]:
# same computation through the ndarray method
y.std(ddof=1)

11.099549540409285

### Skewness

In [37]:
# bias=False applies the sample-size bias correction
scipy.stats.skew(y, bias=False)

1.9470432273905927

In [38]:
# with the default settings a NaN in the input propagates to the result
scipy.stats.skew(y_with_nan, bias=False)

nan

In [39]:
# rebuild the Series from the current x and x_with_nan
z, z_with_nan = pd.Series(x), pd.Series(x_with_nan)

# Series.skew skips NaN by default, so both lines print the same value
print(z.skew())
print(z_with_nan.skew())

1.9470432273905924
1.9470432273905924


### Percentile

In [41]:
# new data set for the percentile examples (rebinds x)
x = [-5.0, -1.1, 0.1, 2.0, 8.0, 12.8, 21.0, 25.8, 41.0]

# n=2 yields a single cut point, the median
statistics.quantiles(x, n=2)

[8.0]

In [42]:
# quartiles (n=4) computed with the 'inclusive' method
statistics.quantiles(x, n=4, method='inclusive')

[0.1, 8.0, 21.0]

In [43]:
# rebind y to the percentile data as a NumPy array
y = np.array(x)
# 5th percentile
np.percentile(y, 5)

-3.44

In [44]:
# 95th percentile
np.percentile(y, 95)

34.919999999999995

In [46]:
# a sequence of percentiles returns an array; the 50th percentile equals the median
print(np.percentile(y, [25,50,75]))
print(np.median(y))

[ 0.1  8.  21. ]
8.0


### Ranges

In [47]:
# range of the data: maximum minus minimum ("peak to peak")
np.ptp(y)

46.0

In [48]:
# a NaN in the array propagates to the result
np.ptp(y_with_nan)

nan

In [50]:
# z still holds the five-value Series built before x was rebound for the percentile examples
result = z.describe()
result

count     5.00000
mean      8.70000
std      11.09955
min       1.00000
25%       2.50000
50%       4.00000
75%       8.00000
max      28.00000
dtype: float64

# Measures of Correlation Between Pairs of Data

In [52]:
# Paired sample: x runs over the integers -10..10, y holds one value per x
x = list(range(-10, 11))
y = [0, 2, 2, 2, 2, 3, 3, 6, 7, 4, 7, 6, 6, 9, 4, 5, 5, 10, 11, 12, 14]

# NumPy arrays of the two lists
x_ = np.array(x)
y_ = np.array(y)

# pandas Series wrapping the arrays
x__ = pd.Series(x_)
y__ = pd.Series(y_)

### Covariance

In [53]:
# 2x2 covariance matrix: variances on the diagonal, cov(x, y) off the diagonal
cov_matrix = np.cov(x_, y_)
cov_matrix

array([[38.5       , 19.95      ],
       [19.95      , 13.91428571]])

In [54]:
# upper-right element: covariance of x and y
cov_xy = cov_matrix[0,1]
cov_xy

19.95

In [55]:
# the matrix is symmetric, so the lower-left element holds the same value
cov_xy = cov_matrix[1,0]
cov_xy

19.95

In [56]:
# pandas equivalent: covariance between two Series
cov_xy = x__.cov(y__)
cov_xy

19.95

In [57]:
# swapping the two Series gives the same covariance
cov_xy = y__.cov(x__)
cov_xy

19.95

### Correlation Coefficient

In [58]:
# pearsonr returns the correlation coefficient and the p-value
r, p = scipy.stats.pearsonr(x_, y_)
print(r)
print(p)

0.861950005631606
5.122760847201171e-07


In [59]:
# 2x2 correlation matrix; the off-diagonal entries are the Pearson r
corr_matrix = np.corrcoef(x_, y_)
corr_matrix

array([[1.        , 0.86195001],
       [0.86195001, 1.        ]])

# Working With 2D Data

In [60]:
a = np.array([
    [1, 1, 1],
    [2, 3, 1],
    [4, 9, 2],
    [8, 27, 4],
    [16, 1, 1],
])

# With no axis argument each statistic is computed over all elements of the array
print(np.mean(a), a.mean(), np.median(a), a.var(ddof=1), sep="\n")

5.4
5.4
2.0
53.40000000000001


In [62]:
np.mean(a, axis=0) # mean of each column (one value per column)

array([6.2, 8.2, 1.8])

In [63]:
np.mean(a, axis=1) # mean of each row (one value per row)

array([ 1.,  2.,  5., 13.,  6.])

In [64]:
# gmean defaults to axis=0: one geometric mean per column
scipy.stats.gmean(a)

array([4.        , 3.73719282, 1.51571657])

In [65]:
# axis=1: one geometric mean per row
scipy.stats.gmean(a, axis=1)

array([1.        , 1.81712059, 4.16016765, 9.52440631, 2.5198421 ])

### Data Frames

In [66]:
# Attach row and column labels to the 2-D array `a`
row_names = 'first second third fourth fifth'.split()
col_names = list('ABC')
df = pd.DataFrame(a, columns=col_names, index=row_names)
df

Unnamed: 0,A,B,C
first,1,1,1
second,2,3,1
third,4,9,2
fourth,8,27,4
fifth,16,1,1


In [68]:
# DataFrame statistics default to axis=0: one value per column
print(df.mean())
print('\n')
print(df.var())

A    6.2
B    8.2
C    1.8
dtype: float64


A     37.2
B    121.2
C      1.7
dtype: float64


In [70]:
# axis=1: one value per row
print(df.mean(axis=1))
print('\n')
print(df.var(axis=1))

first      1.0
second     2.0
third      5.0
fourth    13.0
fifth      6.0
dtype: float64


first       0.0
second      1.0
third      13.0
fourth    151.0
fifth      75.0
dtype: float64
