# 산포통계

In [40]:
import numpy as np
from scipy import stats
import pandas as pd

## 분산, 표준편차 구하기

### 분산 계산

In [9]:
# Sample data for the variance examples.
x = [1, 2, 3, 4, 5]

# Three ways to compute a variance, printed one per line:
#   1. np.var with ddof=1           -> sample variance
#   2. ndarray.var() with defaults  -> population variance (ddof=0),
#                                      the same value np.var(x) would give
#   3. pandas Series.var with ddof=0 -> population variance
print(
    np.var(x, ddof=1),
    np.asarray(x).var(),
    pd.Series(x).var(ddof=0),
    sep="\n",
)

2.5
2.0
2.0


### 표준편차 계산

In [10]:
# Sample data for the standard-deviation examples.
x = [1, 2, 3, 4, 5]

# Sample standard deviation (ddof=1) via the NumPy function.
print(np.std(x, ddof=1))

# Population standard deviation (ddof=0) via the ndarray method.
print(np.asarray(x).std(ddof=0))

# Sample standard deviation (ddof=1) via a pandas Series.
print(pd.Series(x).std(ddof=1))

1.5811388300841898
1.4142135623730951
1.5811388300841898


### 변동계수의 필요성

In [11]:
# Two datasets with the same shape but different scales: x2 is x1 times 10.
x1 = np.arange(1, 6)
x2 = 10 * x1

# The sample standard deviation (ddof=1) grows with the scale of the data,
# so the two values cannot be compared directly.
print(x1.std(ddof=1))
print(x2.std(ddof=1))

1.5811388300841898
15.811388300841896


In [12]:
# Coefficient of variation (standard deviation / mean) of each dataset,
# one value per line. With its default arguments stats.variation uses the
# population standard deviation (ddof=0).
print(
    stats.variation(x1),
    stats.variation(x2),
    sep="\n",
)

0.47140452079103173
0.4714045207910317


In [27]:
# Coefficient of variation computed by hand from the sample standard
# deviation (ddof=1) divided by the mean, one value per line.
print(
    x1.std(ddof=1) / x1.mean(),
    x2.std(ddof=1) / x2.mean(),
    sep="\n",
)

0.5270462766947299
0.5270462766947299


### 스케일링

In [14]:
import numpy as np
import pandas as pd

In [15]:
# Base data for the scaling examples; x2 holds the same values times 10.
x1 = np.arange(1, 6)
x2 = 10 * x1

In [16]:
# Display x1 to check its contents.
x1

array([1, 2, 3, 4, 5])

In [17]:
# Display x2 to check its contents.
x2

array([10, 20, 30, 40, 50])

In [21]:
# Standardize each array to z-scores: subtract the mean, then divide by the
# standard deviation (np.std default, i.e. ddof=0). Both arrays end up with
# the same standardized values despite their different scales.
z1 = (x1 - np.mean(x1)) / np.std(x1)
z2 = (x2 - np.mean(x2)) / np.std(x2)
print(z1, z2, sep="\n")

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


In [23]:
# Min-max scaling: shift each array so its minimum is 0, then divide by the
# range (max - min, which is what np.ptp returns) so its maximum is 1.
z1 = (x1 - x1.min()) / np.ptp(x1)
z2 = (x2 - x2.min()) / np.ptp(x2)
print(z1, z2, sep="\n")

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


## 데이터 표준화 하기

### 데이터 프레임 만들기

In [15]:
import pandas as pd

In [23]:
# Build a two-column DataFrame; column X2 holds the X1 values times 10.
x = pd.DataFrame(
    data={
        "X1": [1, 2, 3, 4, 5],
        "X2": [10, 20, 30, 40, 50],
    }
)
# Display the DataFrame.
x

Unnamed: 0,X1,X2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


### scikit-learn을 활용한 데이터 표준화하기

In [20]:
!pip install scikit-learn



In [22]:
# MinMaxScaler 메모리에 로딩
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Create a MinMaxScaler, fit it to the columns of x, then use the fitted
# scaler to rescale x.
scaler = MinMaxScaler()
scaler.fit(x)
scaled = scaler.transform(x)

In [25]:
# Display the scaled values returned by the scaler.
scaled

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])

In [None]:
# docstring 불러오기: shift + tab
# 자동완성: tab

In [26]:
# Wrap the scaled array in a DataFrame so the column names are shown again.
pd.DataFrame(data=scaled, columns=["X1", "X2"])

Unnamed: 0,X1,X2
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


## StandardScaler로 표준화하기

In [27]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Create a StandardScaler, fit it to the columns of x, then use the fitted
# scaler to standardize x.
ss_scaler = StandardScaler()
ss_scaler.fit(x)
Z = ss_scaler.transform(x)

# Show the standardized values as a DataFrame with the original column names.
pd.DataFrame(data=Z, columns=["X1", "X2"])

Unnamed: 0,X1,X2
0,-1.414214,-1.414214
1,-0.707107,-0.707107
2,0.0,0.0
3,0.707107,0.707107
4,1.414214,1.414214


## 범위와 사분위 범위 계산하기

In [31]:
import numpy as np

In [None]:
# x (소문자): 컬럼이 1개일 경우
# X (대문자): 컬럼이 2개 이상일 경우

In [33]:
# np.random.normal(mean, standard deviation, size=number of samples)
# Draw 1000 values from a normal distribution with mean 100 and
# standard deviation 20. No seed is set, so the values change on each run.
x = np.random.normal(loc=100, scale=20, size=1000)
# Display the sampled values.
x

array([ 84.60377537,  79.91156601,  82.27858007,  96.50300394,
       112.1366555 ,  79.52043616,  79.08835866,  91.83195875,
        82.48441572, 114.98443971, 111.64844169, 120.37112757,
       116.62433738,  81.52961551, 129.02150944,  97.65129441,
        77.15713165,  73.3968482 ,  82.04695875, 122.29041176,
       136.97966125, 136.96466581, 106.2893964 ,  94.91496086,
       122.01954342, 134.72857891, 100.31222435,  88.63999894,
       102.05571788,  70.93570427,  95.70981093, 102.21975728,
       102.61842979, 107.50986359, 146.41813285, 119.07681736,
       105.76191895, 104.71080485,  99.57133399, 103.96666081,
        99.85525025,  82.35546237,  94.11190891,  67.61966542,
       109.65808609, 116.64533868,  77.06080269,  95.83361204,
        94.13727571,  89.22229271,  81.74488352,  84.03619634,
        97.41412428,  89.98842098,  91.57444502, 121.21117256,
        83.41811606,  96.05195753, 123.77788078, 109.26486011,
        80.15548676,  94.97781946, 126.53465014, 113.90

## 범위 계산

단축키
- m: 문자 셀로 변환
- y: 코드 셀로 변환

In [42]:
# Range of x computed two ways, one value per line:
# np.ptp ("peak to peak") and the explicit maximum minus minimum.
print(
    np.ptp(x),
    np.max(x) - np.min(x),
    sep="\n",
)

122.51978274065985
122.51978274065985


In [43]:
# Interquartile range of x computed two ways, one value per line:
# the 75th percentile minus the 25th percentile, and scipy's stats.iqr.
# (Alternative import style: `import scipy.stats as st`, then `st.iqr(x)`.)
print(
    np.percentile(x, 75) - np.percentile(x, 25),
    stats.iqr(x),
    sep="\n",
)

27.722108932359006
27.722108932359006
