In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from plt_rce import *

## 정규분포를 따르는 무작위 값 생성

In [None]:
# Seed NumPy's legacy global random state so the random draws in the following cells are reproducible.
np.random.seed(1234)

In [None]:
# Draw 5,000 values from a normal distribution with loc=175 and scale=5.
heights = stats.norm.rvs(loc=175, scale=5, size=5000)

# Mean of the drawn values (recorded output below).
heights.mean()
# np.float64(175.12138663450918)
# Standard deviation of the drawn values, using NumPy's default ddof=0.
heights.std()
# np.float64(4.9588737381446055)

In [None]:
# Same distribution (loc=175, scale=5) with a 100x larger draw of 500,000 values.
heights1 = stats.norm.rvs(loc=175, scale=5, size=500000)

# Per the recorded outputs, both statistics land closer to 175 and 5 than in the 5,000-value draw.
heights1.mean()
# np.float64(175.0016360108597)
heights1.std()
# np.float64(4.9991504073228725)

## 부트스트래핑

In [None]:
# Reseed so the resampling below is reproducible.
np.random.seed(1234)

# Collect the 1,000 resample means in a plain list and build the Series once.
# Enlarging an empty, untyped Series with .loc on every iteration reallocates
# it each time and leaves the dtype to inference.
sample_means = []

for i in range(1000):
    # Resample 100 heights with replacement and keep that resample's mean.
    smpl = np.random.choice(a=heights, size=100, replace=True)
    sample_means.append(smpl.mean())

avgs = pd.Series(sample_means, dtype='float64')

# Summary of the distribution of the 1,000 resample means.
avgs.describe().round(2)
# count    1000.00
# mean      175.10
# std         0.49
# min       173.19
# 25%       174.76
# 50%       175.10
# 75%       175.45
# max       176.69
# dtype: float64

- 주사위를 굴려서 얻는 실현값은 이산균등분포를 따름
- 멀쩡한 주사위를 100번 굴려서 평균을 낸 표본 통계량의 기대값은 모평균인 3.5와 같고
- 표본평균의 표준편차(표준오차)는 모표준편차(약 1.71)를 표본 크기인 100의 제곱근(10)으로 나눈 값(약 0.171)과 같음

In [None]:
# Reseed so the simulated dice rolls are reproducible.
np.random.seed(1234)

# Collect the 1,000 sample means in a plain list and build the Series once.
# Enlarging an empty, untyped Series with .loc on every iteration reallocates
# it each time and leaves the dtype to inference.
sample_means = []

for i in range(1000):
    # Roll a fair six-sided die 100 times (faces 1..6) and keep the mean.
    smpl = np.random.choice(a=np.arange(1, 7), size=100, replace=True)
    sample_means.append(smpl.mean())

avgs = pd.Series(sample_means, dtype='float64')

# Summary of the distribution of the 1,000 sample means.
avgs.describe()

In [None]:
# The six faces of a die as an integer array: 1 through 6 (the stop value 7 is excluded).
dice = np.arange(1, 7)
dice

In [None]:
# Mean of the six faces, i.e. the population mean of a single fair roll.
dice.mean()

In [None]:
# Standard deviation of the six faces; ndarray.std defaults to ddof=0, so this is the population value.
dice.std()

## 정규분포 확률밀도

In [None]:
# Normal density evaluated at the mean (x == loc == 175) for three scales.
# Per the recorded outputs, the peak density rises as the scale shrinks.
stats.norm.pdf(x=175, loc=175, scale=15)
# np.float64(0.02659615202676218)

stats.norm.pdf(x=175, loc=175, scale=10)
# np.float64(0.03989422804014327)

stats.norm.pdf(x=175, loc=175, scale=5)
# np.float64(0.07978845608028654)

In [None]:
# Density of each of five observations under a normal with loc=175, scale=5.
stats.norm.pdf(x=[169, 173, 175, 178, 182], loc=175, scale=5)
# array([0.03883721, 0.07365403, 0.07978846, 0.06664492, 0.02994549])

# Product of the five densities.
stats.norm.pdf(x=[169, 173, 175, 178, 182], loc=175, scale=5).prod()
# np.float64(4.5549448486331744e-07)

# Sum of the log densities, i.e. the log of the product above.
np.log(stats.norm.pdf(x=[169, 173, 175, 178, 182], loc=175, scale=5)).sum()
# np.float64(-14.601882228193865)

In [None]:
# Same five observations with loc=185: the recorded sum of log densities is lower (-23.80 vs -14.60 for loc=175).
np.log(stats.norm.pdf(x=[169, 173, 175, 178, 182], loc=185, scale=5)).sum()
# np.float64(-23.80188222819387)

## 확률밀도곡선 시각화

In [None]:
# Integer grid from 125 to 225 inclusive on which the densities are evaluated.
x = range(125, 226)

# Normal densities centred at 175 for scales 15, 10 and 5, in that order.
y1, y2, y3 = (stats.norm.pdf(x=x, loc=175, scale=sd) for sd in (15, 10, 5))

# Draw one labelled line per density curve.
for density, label in zip((y1, y2, y3), ('Scale: 15', 'Scale: 10', 'Scale: 5')):
    sns.lineplot(x=x, y=density, label=label)

plt.legend()
plt.show()

## 누적확률

In [None]:
# Cumulative probability up to x=185 for a normal centred at 175 with three scales.
# Per the recorded outputs, the smaller the scale, the more mass lies below 185.
stats.norm.cdf(x=185, loc=175, scale=15)
# np.float64(0.7475074624530771)

stats.norm.cdf(x=185, loc=175, scale=10)
# np.float64(0.8413447460685429)

stats.norm.cdf(x=185, loc=175, scale=5)
# np.float64(0.9772498680518208)

In [None]:
# Cumulative probabilities at 165 and 185 (loc=175, scale=5, so loc -/+ 2 * scale).
cdfs = stats.norm.cdf(x=[165, 185], loc=175, scale=5)
# Difference of the two values: the probability mass between 165 and 185.
np.diff(cdfs)
# array([0.95449974])

## 확률변수값

In [None]:
# Quantile function (inverse of the cdf): the x value at which the cumulative probability equals q.
# Each q is approximately the cumulative probability at 185 for that scale, so each result is close to 185.
stats.norm.ppf(q=0.748, loc=175, scale=15)
# np.float64(185.02313949588586)

stats.norm.ppf(q=0.841, loc=175, scale=10)
# np.float64(184.9857627061566)

stats.norm.ppf(q=0.977, loc=175, scale=5)
# np.float64(184.9769665508391)

## 왜도와 첨도

In [None]:
# Sample skewness of the simulated heights; the recorded value is close to 0.
stats.skew(heights)
# np.float64(-0.03668062034777025)

In [None]:
# Sample kurtosis of the simulated heights. scipy's default is Fisher's definition
# (excess kurtosis, 0 for a normal distribution); the recorded value is close to 0.
stats.kurtosis(heights)
# np.float64(-0.07067499523641407)

## 정규성 검정

In [None]:
# Shapiro-Wilk normality test on the simulated heights.
# statistic = test statistic
# pvalue = p-value (significance probability)
stats.shapiro(heights)
# ShapiroResult(statistic=np.float64(0.9995954653825436), pvalue=np.float64(0.40487292793217))

In [None]:
# Reseed, then rebind `heights` to a fresh draw of 10,000 values.
# NOTE: this replaces the 5,000-value array created earlier; every cell below uses the 10,000-value array.
np.random.seed(1234)
heights = stats.norm.rvs(loc=175, scale=5, size=10000)

# Anderson-Darling test; in the recorded output the statistic (0.353) is below every listed critical value.
stats.anderson(heights)
# AndersonResult(statistic=np.float64(0.35265258232175256), critical_values=array([0.576, 0.656, 0.787, 0.918, 1.092]), significance_level=array([15. , 10. ,  5. ,  2.5,  1. ]), fit_result=  params: FitParams(loc=np.float64(175.08063230023598), scale=np.float64(4.9761483416071846))
#  success: True
#  message: '`anderson` successfully fit the distribution to the data.')

## 표준화 함수 생성

In [None]:
def scale(x, loc, scale):
    """Standardize ``x``: subtract ``loc``, then divide by ``scale``.

    Args:
        x: Value(s) to standardize; anything that supports ``-`` and ``/``
            (the notebook passes both plain numbers and NumPy arrays).
        loc: Centre to subtract (the mean).
        scale: Spread to divide by (the standard deviation). Inside the
            function this parameter shadows the function's own name.

    Returns:
        ``(x - loc) / scale``, with the same shape as ``x``.
    """
    centered = x - loc
    return centered / scale

In [None]:
# The same raw value (185) maps to different standardized scores depending on the spread.
scale(x=185, loc=175, scale=15)
# 0.6666666666666666

scale(x=185, loc=175, scale=10)
# 1.0

scale(x=185, loc=175, scale=5)
# 2.0

# Two values from differently centred and scaled settings, put on the same standardized scale.
scale(x=90, loc=75, scale=15)
# 1.0

scale(x=55, loc=40, scale=10)
# 1.5

## 이상치 탐지

In [None]:
# Standardize every height using loc=175 and scale=5 (the parameters the data was drawn with).
scaled = scale(x=heights, loc=175, scale=5)
scaled
# array([ 0.47143516, -1.19097569,  1.43270697, ..., -1.05246069,
#        -0.4976931 , -0.2560062 ], shape=(10000,))

In [None]:
# Boolean mask: True where the standardized value is more than 3 away from 0 in either direction.
cond = np.abs(scaled) > 3
cond
# array([False, False, False, ..., False, False, False], shape=(10000,))

In [None]:
# Select the original height values flagged by the mask (20 values in the recorded output).
outliers = heights[cond]
outliers
# array([157.1824167 , 190.62817574, 190.54817675, 155.59550794,
#        158.61347962, 191.10284213, 158.83247735, 157.82590423,
#        159.91806595, 159.24619206, 190.00573485, 158.67898086,
#        157.0030016 , 158.68108413, 159.25844267, 191.43893949,
#        158.34281695, 155.70665632, 190.18821417, 159.73908333])

In [None]:
# Positions of the flagged values. With a single argument np.where returns a tuple
# holding one index array, so the positions themselves are locs[0].
locs = np.where(cond)
locs
# (array([  81, 1307, 1333, 1670, 2053, 2289, 3136, 3924, 4477, 4599, 4712,
#         5274, 5903, 7040, 7510, 7743, 8578, 8650, 9406, 9888]),)

In [None]:
# Scatter every height against its position and highlight the flagged outliers in red.
plt.figure(figsize=(12, 4))

# Take the x positions from the data itself instead of a hard-coded 10000, so the
# plot keeps working if the sample size used to draw `heights` changes.
sns.scatterplot(
    x=range(len(heights)), y=heights, fc='0.9', s=25
)
# Overlay the outliers at their original positions (locs is a 1-tuple of index arrays).
sns.scatterplot(
    x=locs[0], y=outliers, fc='red'
)
# Solid line at the centre (175); dashed lines at 160 and 190, i.e. 175 -/+ 3 * 5.
plt.axhline(
    y=175, linestyle='-', linewidth=1, color='0'
)
plt.axhline(
    y=160, linestyle='--', linewidth=1, color='red'
)
plt.axhline(
    y=190, linestyle='--', linewidth=1, color='red'
)

plt.show()