In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from plt_rce import *
import pingouin as pg

In [None]:
# Show the kernel's current working directory before it is changed below.
os.getcwd()

In [None]:
# Move into the sibling `data` directory so the bare file names used by the
# later read/write cells resolve there. This changes process-wide state for
# every following cell.
os.chdir('../data')

In [None]:
# List the entries of the current working directory in sorted order.
sorted(os.listdir())

In [None]:
# Load the used-cars frame from a pickle in the current working directory.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle('Used_Cars.pkl')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

## 상관분석

### 피어슨 상관분석

In [None]:
# Pearson correlation between Age and Price with SciPy; the result carries the
# correlation coefficient (`statistic`) and its p-value.
stats.pearsonr(x=df['Age'], y=df['Price'])
# Recorded output:
# PearsonRResult(statistic=np.float64(-0.796544729051927), pvalue=np.float64(5.8796109422549815e-279))

In [None]:
# Same correlation with pingouin, which returns a one-row table that also
# reports the sample size, confidence interval, Bayes factor and power.
pg.corr(x=df['Age'], y=df['Price'])
# Recorded output:
#               n	    r	    CI95%	        p-val	        BF10	    power
# pearson	1268	-0.796545	[-0.82, -0.78]	5.879611e-279	6.169e+274	1.0

### 함수로 만들어서 사용

In [None]:
# Bind the Age column to `x` to prototype the expression that the helper
# defined further down wraps.
x = df['Age']

In [None]:
# Keep only the p-value column of the pingouin correlation table.
pg.corr(x=x, y=df['Price'])['p-val']

In [None]:
def corr(x):
    """Return the p-value of the Pearson correlation between ``x`` and Price.

    Defined with ``def`` instead of binding a lambda to a name, so the function
    has a real name in tracebacks and can carry this documentation.

    Parameters
    ----------
    x
        Values forwarded to ``pg.corr`` as its ``x`` argument; the cells in
        this notebook pass DataFrame columns.

    Returns
    -------
    The ``'p-val'`` column of the table returned by ``pg.corr``.

    Notes
    -----
    ``df`` is the notebook-level frame and is looked up when the function is
    called, so the result follows whatever ``df`` is bound to at that moment.
    """
    return pg.corr(x=x, y=df['Price'])['p-val']

In [None]:
# Sanity check: the helper reproduces the p-value obtained directly above.
corr(x=df['Age'])
# Recorded output:
# pearson    5.879611e-279
# Name: p-val, dtype: float64

### apply 메서드 활용

In [None]:
# Keep only the integer and float columns so every column can be fed to the
# correlation helpers. The recorded output shows Price itself is among them.
df_num = df.select_dtypes(include=[int, float])
df_num.head()
# Recorded output:
# Price	Age	KM	HP	CC	Doors	Weight
# 0	13500	23	46986	90	2000	3	1165
# 1	13750	23	72937	90	2000	3	1165
# 2	13950	24	41711	90	2000	3	1165
# 3	14950	26	48000	90	2000	3	1165
# 4	13750	30	38500	90	2000	3	1170

In [None]:
# Run the p-value helper on every numeric column and flag those significant at
# the 5% level. Price is tested against itself here, so its flag carries no
# information.
df_num.apply(func=corr).lt(0.05)
# Recorded output:
#           Price	Age	    KM	    HP	    CC	    Doors	Weight
# pearson	True	True	True	True	False	True	True

In [None]:
def coef(x):
    """Return the Pearson correlation coefficient between ``x`` and Price.

    Defined with ``def`` instead of binding a lambda to a name, so the function
    has a real name in tracebacks and can carry this documentation.

    Parameters
    ----------
    x
        Values forwarded to ``pg.corr`` as its ``x`` argument; the cells in
        this notebook pass DataFrame columns.

    Returns
    -------
    The ``'r'`` column of the table returned by ``pg.corr``.

    Notes
    -----
    ``df`` is the notebook-level frame and is looked up when the function is
    called, so the result follows whatever ``df`` is bound to at that moment.
    """
    return pg.corr(x=x, y=df['Price'])['r']

In [None]:
# Correlation coefficient of every numeric column with Price (the Price column
# itself is included in `df_num`).
df_num.apply(func=coef)

## 독립표본 t-검정
### 두 집단의 기술통계량 확인

In [None]:
# Count, mean and standard deviation of Price for each MetColor group.
# Because `aggfunc` is a list, the columns form a two-level index
# (statistic, value column); `pvt.columns.droplevel(1)` would flatten it.
pvt = df.pivot_table(
    values='Price',
    index='MetColor',
    aggfunc=['count', 'mean', 'std'],
)
pvt

### 정규성 검정

In [None]:
# Inspect the distinct MetColor labels before filtering on them.
df['MetColor'].unique()

In [None]:
# First sample: prices of the rows whose MetColor label is the string '0'.
y1 = df.loc[df['MetColor'] == '0', 'Price']
y1

In [None]:
# Second sample: prices of the rows whose MetColor label is the string '1'.
y2 = df.loc[df['MetColor'] == '1', 'Price']
y2

In [None]:
# Shapiro-Wilk normality test on each group's prices. A notebook cell renders
# only the value of its final expression, so the two results are bundled into
# one tuple; as two separate statements the first result was never displayed.
stats.shapiro(y1), stats.shapiro(y2)
# Recorded results (y1 first, then y2):
# ShapiroResult(statistic=np.float64(0.9880744781614216), pvalue=np.float64(0.0014308065586837325))
# ShapiroResult(statistic=np.float64(0.9747554209498401), pvalue=np.float64(7.0629424881052e-11))

In [None]:
# Per-group normality test with pingouin.
# dv: dependent variable; group: column that defines the groups.
pg.normality(data=df, dv='Price', group='MetColor')
# Recorded output (normality is rejected for both groups):
#          W	        pval	normal
# MetColor			
# 1	0.974755	7.062942e-11	False
# 0	0.988074	1.430807e-03	False

### 등분산성 검정

In [None]:
# Test whether Price has equal variance across the MetColor groups.
pg.homoscedasticity(data=df, dv='Price', group='MetColor')
# Recorded output (Levene's test; equal variances are rejected):
#                   W	pval	equal_var
# levene	5.761315	0.016526	False

### t-검정

In [None]:
# Independent two-sample t-test of Price between the two MetColor groups.
# correction=True applies the unequal-variance (Welch) correction, matching
# the Levene result above that rejected equal variances.
pg.ttest(x=y1, y=y2, correction=True)
# Recorded output:
#                   T	        dof	alternative	p-val	        CI95%	        cohen-d	    BF10	power
# T-test	-2.983633	935.893141	two-sided	0.002922	[-576.0, -118.92]	0.171735	5.313	0.82371

### 맨-휘트니 U 검정

In [None]:
# Mann-Whitney U test on the same two samples: a rank-based alternative to the
# t-test, relevant here because normality was rejected for both groups above.
pg.mwu(x=y1, y=y2)

### 실습

In [None]:
# Exercise: check the group sizes of the Automatic column first.
df['Automatic'].value_counts()

In [None]:
# Split Price into two samples by the Automatic label (the strings '0' and
# '1', in that order).
auto_y1, auto_y2 = (
    df.loc[df['Automatic'] == flag, 'Price'] for flag in ('0', '1')
)

In [None]:
# Normality test of Price within each Automatic group.
pg.normality(data=df, dv='Price', group='Automatic')

In [None]:
# Equal-variance test of Price across the Automatic groups.
pg.homoscedasticity(data=df, dv='Price', group='Automatic')

In [None]:
# Independent two-sample t-test without the unequal-variance correction.
# NOTE(review): correction=False is only appropriate if the equal-variance
# test in the previous cell did not reject; its output is not recorded here.
pg.ttest(x=auto_y1, y=auto_y2, correction=False)

In [None]:
# Rank-based Mann-Whitney U test on the two Automatic samples.
pg.mwu(x=auto_y1, y=auto_y2)

## 모평균 검정

In [None]:
# Download the sample data for the mean tests from a shortened URL; this cell
# needs network access. The cells below use its `before` and `after` columns.
df1 = pd.read_csv('https://bit.ly/sample_ttest')

In [None]:
# Preview the first rows of the sample data.
df1.head()

In [None]:
# Print column names, non-null counts and dtypes of the sample data.
df1.info()

In [None]:
# Summary statistics of the sample data's numeric columns.
df1.describe()

In [None]:
# Overlay the density estimates of the `before` and `after` columns, then mark
# each column's mean with a vertical line (solid blue / dashed orange).
for column, label in (('before', 'Before'), ('after', 'After')):
    sns.kdeplot(data=df1, x=column, label=label)

for column, color, linestyle in (('before', 'blue', '-'), ('after', 'orange', '--')):
    plt.axvline(x=df1[column].mean(), color=color, linestyle=linestyle)

plt.legend()

plt.show()

## 단일표본 t-검정

In [None]:
# Normality test of the `before` column ahead of the one-sample t-test.
pg.normality(df1['before'])
# Recorded output (normality is not rejected):
#               W	pval	normal
# before	0.985883	0.951249	True

In [None]:
# One-sample t-test: passing a scalar as `y` tests the mean of `before`
# against 90 (two-sided by default).
pg.ttest(x=df1['before'], y=90)

In [None]:
# One-sided version: the alternative hypothesis is that the mean of `before`
# is less than 90.
pg.ttest(x=df1['before'], y=90, alternative='less')

## 대응표본 t-검정

In [None]:
# Normality test of the `after` column.
# NOTE(review): a paired t-test assumes the pairwise differences are normal;
# consider also testing df1['before'] - df1['after'].
pg.normality(df1['after'])
# Recorded output (normality is not rejected):
#               W	pval	normal
# after	0.985143	0.9396	True

In [None]:
# Paired t-test between the `before` and `after` measurements (two-sided).
pg.ttest(x=df1['before'], y=df1['after'], paired=True)

In [None]:
# One-sided paired t-test: the alternative hypothesis is that `before` is
# greater than `after`.
pg.ttest(x=df1['before'], y=df1['after'], paired=True, alternative='greater')

## 분산분석

### 세 집단의 기술통계량 확인

In [None]:
# Count, mean and standard deviation of Price for each FuelType group. The
# second column level (the value-column name) is dropped so the columns read
# simply count / mean / std.
pvt = (
    df.pivot_table(
        values='Price',
        index='FuelType',
        aggfunc=['count', 'mean', 'std'],
    )
    .droplevel(1, axis=1)
)
pvt

In [None]:
# Normality test of Price within each FuelType group.
pg.normality(data=df, dv='Price', group='FuelType')

In [None]:
# Equal-variance test of Price across the FuelType groups.
pg.homoscedasticity(data=df, dv='Price', group='FuelType')

In [None]:
# Welch's one-way ANOVA of Price by FuelType; unlike the classic ANOVA it
# does not assume equal group variances.
pg.welch_anova(data=df, dv='Price', between='FuelType')

In [None]:
# Kruskal-Wallis test of Price by FuelType: the rank-based alternative to the
# one-way ANOVA.
pg.kruskal(data=df, dv='Price', between='FuelType')

- 표본 크기가 작으면, 큰 차이도 유의하지 않다고 나올 수 있고
- 표본 크기가 크면, 작은 차이도 유의하다고 나올 수 있음

In [None]:
# Price density per FuelType, with one vertical line per fuel marking that
# group's mean price.
sns.kdeplot(data=df, x='Price', hue='FuelType', fill=True)
for fuel, color in (('CNG', 'green'), ('Diesel', 'blue'), ('Petrol', 'orange')):
    plt.axvline(x=df.loc[df['FuelType'].eq(fuel), 'Price'].mean(), color=color)

plt.show()

## 사후분석

In [None]:
import scikit_posthocs as sp

In [None]:
# Pairwise post-hoc comparison of Price between FuelType groups (Tukey).
sp.posthoc_tukey(a=df, val_col='Price', group_col='FuelType')

In [None]:
# Pairwise post-hoc comparison of Price between FuelType groups (Scheffe).
sp.posthoc_scheffe(a=df, val_col='Price', group_col='FuelType')

In [None]:
# Pairwise post-hoc comparison of Price between FuelType groups (Tamhane T2,
# intended for groups with unequal variances).
sp.posthoc_tamhane(a=df, val_col='Price', group_col='FuelType')

In [None]:
# Pairwise post-hoc comparison of Price between FuelType groups (Nemenyi, a
# rank-based test that pairs with the Kruskal-Wallis test).
sp.posthoc_nemenyi(a=df, val_col='Price', group_col='FuelType')

## CNG 제거 후 검정

In [None]:
# Keep only the rows whose FuelType is not 'CNG'. This rebinds `df`, so every
# later cell (including the final save) works on the filtered frame.
df = df[df['FuelType'] != 'CNG']

In [None]:
# Re-run the equal-variance test on the frame with the CNG rows removed.
pg.homoscedasticity(data=df, dv='Price', group='FuelType')

In [None]:
# Price samples for the two remaining fuel types (Diesel first, then Petrol).
# This reuses the names `y1` / `y2` from the MetColor comparison.
y1, y2 = (
    df.loc[df['FuelType'] == fuel, 'Price'] for fuel in ('Diesel', 'Petrol')
)

In [None]:
# Independent two-sample t-test of Price, Diesel vs Petrol, with the
# unequal-variance (Welch) correction enabled.
pg.ttest(x=y1, y=y2, correction=True)

## 카이제곱 검정

In [None]:
# Download the sample data for the chi-square test from a shortened URL; this
# cell needs network access. The cells below use its `Coupon` and `Purchase`
# columns.
df2 = pd.read_csv('https://bit.ly/sample_cross')

In [None]:
# Preview the first rows of the cross-tabulation sample.
df2.head()

In [None]:
# Print column names, non-null counts and dtypes of the sample.
df2.info()

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df2.describe(include=object)

### 교차 테이블 확인

In [None]:
# Cross-tabulate Coupon (rows) against Purchase (columns). normalize='index'
# converts each row to proportions of that row; margins adds a total entry
# labelled with `margins_name`.
pd.crosstab(
    index=df2['Coupon'],
    columns=df2['Purchase'],
    margins=True,
    margins_name='합계',
    normalize='index'
)

### 카이제곱 검정 실행

In [None]:
# Chi-square test of independence between Coupon and Purchase with the
# continuity correction enabled. The call returns a 3-tuple; its last element
# (the table of test statistics) is displayed.
test = pg.chi2_independence(df2, x='Coupon', y='Purchase', correction=True)
test[-1]

## 모비율 검정

In [None]:
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# One-sample proportion z-test: is the observed 30 / 1000 = 0.03 different
# from the hypothesised proportion 0.02? Returns (z statistic, p-value).
proportions_ztest(count=30, nobs=1000, value=0.02)
# Recorded output:
# (np.float64(1.8537599944001615), np.float64(0.06377350427039058))

In [None]:
# Two-sample proportion z-test with a hypothesised difference of 0. Both
# samples have the same proportion (30/1000 = 45/1500 = 0.03), hence z = 0
# and p = 1 in the recorded output.
proportions_ztest(count=[30, 45], nobs=[1000, 1500], value=0.0)
# Recorded output:
# (np.float64(0.0), np.float64(1.0))

## 변수 제거 및 외부 파일로 저장

In [None]:
# Remove the CC and Automatic columns and renumber the rows from 0 (the index
# has gaps after the CNG rows were filtered out). Re-running this cell raises
# KeyError, because the columns are already gone from the rebound `df`.
df = df.drop(columns=['CC', 'Automatic']).reset_index(drop=True)
df.shape

In [None]:
# Save the prepared frame as a pickle in the current working directory.
df.to_pickle('Used_Cars_Prep.pkl')

In [None]:
# List the working directory again to confirm the new pickle file exists.
sorted(os.listdir())