In [1]:
import numpy as np
import pandas as pd

from scipy import stats

In [2]:
fish_length = np.array([2,3,3,4,4,4,4,5,5,6])
fish_length

array([2, 3, 3, 4, 4, 4, 4, 5, 5, 6])

In [3]:
fish_length_df = pd.DataFrame({
    'length': fish_length,
})
fish_length_df

Unnamed: 0,length
0,2
1,3
2,3
3,4
4,4
5,4
6,4
7,5
8,5
9,6


In [5]:
sample_size = len(fish_length_df)
sample_size

10

In [6]:
df_as_array = fish_length_df.to_numpy()
np.sum(df_as_array)

40

In [7]:
sample_mean = np.sum(df_as_array) / sample_size
sample_mean

4.0

In [11]:
sample_variance = np.sum((df_as_array - sample_mean) ** 2) / sample_size
print(sample_variance)

1.2


In [12]:
s2 = np.var(df_as_array, ddof=0)
s2

1.2

In [13]:
unbiased_variance = sample_variance = np.sum((df_as_array - sample_mean) ** 2) / (sample_size - 1)
unbiased_variance

1.3333333333333333

In [14]:
round(unbiased_variance, 3) # u^2 as u2

1.333

In [15]:
round(np.var(df_as_array, ddof=1), 3)

1.333

In [19]:
standard_deviation = np.sqrt(s2)
round(standard_deviation, 3)

1.095

In [18]:
round(np.std(df_as_array, ddof=0), 3)

0.0

In [21]:
# Coefficient of Variation => 변동계수
# CV = s(standard deviation) / x (sample mean)
cv = sample_variance / sample_mean
round(cv, 3)

0.333

In [24]:
round(stats.variation(df_as_array)[0], 3) # get CV of length of fishes

0.274

In [26]:
round(stats.variation(df_as_array, ddof=1)[0], 3)

0.289

In [27]:
x_bar = np.mean(df_as_array)
x_bar

4.0

In [30]:
z = (df_as_array - x_bar) / standard_deviation
np.round(z, 3)

array([[-1.826],
       [-0.913],
       [-0.913],
       [ 0.   ],
       [ 0.   ],
       [ 0.   ],
       [ 0.   ],
       [ 0.913],
       [ 0.913],
       [ 1.826]])

In [31]:
round(np.mean(z), 3)

0.0

In [32]:
np.std(z, ddof=0)

1.0

### 정규화(Standardization or z-score normalization)

$$ z_i = (x_i - mean) / standardDeviation $$

mean = 0.0   
variance = 1.0

In [33]:
np.round(stats.zscore(df_as_array, ddof=0), 3)

array([[-1.826],
       [-0.913],
       [-0.913],
       [ 0.   ],
       [ 0.   ],
       [ 0.   ],
       [ 0.   ],
       [ 0.913],
       [ 0.913],
       [ 1.826]])

In [34]:
np.amin(df_as_array)

2

In [35]:
np.amax(df_as_array)

6

In [36]:
np.median(df_as_array)

4.0

In [37]:
fish_length2 = np.array([2,3,3,4,4,4,4,5,5,100]) # outlier is 100

print(f"mean: {np.mean(fish_length2)}") 
print(f"median: {np.median(fish_length2)}") # 4분위수의 값은 크게 변하지 않음

mean: 13.4
median: 4.0


In [38]:
print(f'제 1 사분위 수: {np.quantile(fish_length2, q=0.25)}')
print(f'제 3 사분위 수: {np.quantile(fish_length2, q=0.75)}')

제 1 사분위 수: 3.25
제 3 사분위 수: 4.75


In [39]:
fish_length3 = np.arange(0, 101, 1)
fish_length3

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100])

In [40]:
print(f'제 1 사분위 수: {np.quantile(fish_length3, q=0.25)}')
print(f'제 3 사분위 수: {np.quantile(fish_length3, q=0.75)}')

제 1 사분위 수: 25.0
제 3 사분위 수: 75.0


In [41]:
stats.mode(fish_length)

ModeResult(mode=4, count=4)

In [42]:
print(fish_length_df.describe())

          length
count  10.000000
mean    4.000000
std     1.154701
min     2.000000
25%     3.250000
50%     4.000000
75%     4.750000
max     6.000000
