In [None]:
import pandas as pd


In [None]:
# Column names applied to the CSV, in file order (they are passed to
# read_csv via `names`, so the file's own first row is treated as data).
cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
adult_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# NOTE: string values are read as-is; later cells look labels up with a
# leading space (e.g. ' <=50K'), so no whitespace stripping is done here.
data = pd.read_csv(adult_url, names=cols)
data.head()

## normality test

In [None]:
from scipy.stats import normaltest

# scipy's normaltest returns a test statistic and a p-value; the null
# hypothesis is that the sample comes from a normal distribution.
ages = data['age']
k2, p = normaltest(ages)

print(k2,p)
# Reject normality only at a strict 0.001 significance level.
verdict = 'not normal' if p < 0.001 else 'normal'
print(verdict)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
data['age'].hist()
plt.show()

## correlation check

In [None]:
import numpy as np

# Fixed seed so the synthetic sample is reproducible.
np.random.seed(1)

sample_size = 1000

# Random integers drawn from [0, 50) -- the upper bound is exclusive.
x = np.random.randint(low=0, high=50, size=sample_size)

# y follows x plus Gaussian noise (mean 0, std 10), giving a positive
# but imperfect correlation.
noise = np.random.normal(loc=0, scale=10, size=sample_size)
y = x + noise

# 2x2 Pearson correlation matrix of x and y.
np.corrcoef(x, y)

In [None]:
# Scatter plot of the synthetic sample: x on the horizontal axis,
# y on the vertical axis.
plt.scatter(x=x, y=y)
plt.show()

In [None]:
# Subset of columns used for the correlation analysis below.
corr_columns = ['age', 'education-num', 'capital-gain']
adata = data[corr_columns]

# Pairwise correlation matrix of the selected columns.
adata.corr()

In [None]:
# Show the column labels of the loaded frame (handy when picking columns
# for the analyses in the other cells).
data.columns

In [None]:
# Heatmap of the correlation matrix, with the column names as tick labels
# on both axes and a colorbar for the scale.
corr_matrix = adata.corr()
tick_labels = adata.columns
tick_positions = range(len(tick_labels))

plt.matshow(corr_matrix)
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)
plt.colorbar()
plt.show()

In [None]:
# Pairwise scatter plots of the selected columns in a 6x6-inch figure.
matrix_figsize = (6, 6)
pd.plotting.scatter_matrix(frame=adata, figsize=matrix_figsize)
plt.show()

## t-test and Mann-Whitney U test

In [None]:
# Target column whose classes are compared in the tests below.
targetcol = 'income'

# Number of rows per target class.
target_values = data[targetcol]
vc = target_values.value_counts()
print(vc)

# Class labels, in the same order as the counts above.
vc.index

In [None]:
from scipy.stats import ttest_ind, mannwhitneyu
import matplotlib.pyplot as plt

col = 'age'

# Split `col` by target class so its distribution can be compared between
# the two classes (vc.index[0] and vc.index[1] are the class labels).
g1 = data.loc[data[targetcol] == vc.index[0], col]
g2 = data.loc[data[targetcol] == vc.index[1], col]

# If the difference between the groups is significant, the feature can be
# expected to help with classification as well.
t, ttestp = ttest_ind(g1, g2)

# Mann-Whitney U test: checks the difference without assuming normality.
# `alternative` is passed explicitly because the default differs between
# SciPy versions (older releases return a one-sided p-value by default).
u, up = mannwhitneyu(g1, g2, alternative='two-sided')

print(t,ttestp,u,up)
if up < 0.001 : print('not same dist')
else : print('same dist')

# Overlay the two histograms; label each group so they can be told apart.
plt.hist(g1, alpha=0.5, label=str(vc.index[0]))
plt.hist(g2, alpha=0.5, label=str(vc.index[1]))
plt.xlabel(col)
plt.ylabel('count')
plt.legend(title=targetcol)
plt.show()

## proportion test

In [None]:
from statsmodels.stats.proportion import proportions_ztest

col = 'sex'
targetcol = 'income'

# Contingency table: one row per target label, one column per value of `col`.
group_sizes = data.groupby([col, targetcol]).size()
dc = group_sizes.unstack(0)

# Append the per-column totals, then the share of ' <=50K' rows per column.
dc.loc['sum'] = dc.sum()
dc.loc['ratio'] = dc.loc[' <=50K'] / dc.loc['sum']
display(dc)

# Two-sample z-test for proportions: ' <=50K' counts out of the totals,
# one entry per value of `col`.
c = dc.loc[' <=50K'].values
n = dc.loc['sum'].values
print(c,n)
print(proportions_ztest(nobs=n,count=c))


In [None]:
from statsmodels.stats.proportion import proportions_ztest

col = 'marital-status'
targetcol = 'income'

# Row counts per target class. The class at position 1 is treated as the
# "target" class throughout this cell. It is computed here (rather than
# reusing `vc` from an earlier cell) so the cell always matches `targetcol`.
tvc = data[targetcol].value_counts()

# y: `col` for every row; n: `col` restricted to rows of the target class.
y = data[col]
n = data.loc[data[targetcol] == tvc.index[1], col]

# Count each value of `col` overall and within the target class.
# The key and count columns are named explicitly: the default names from
# value_counts().reset_index() differ between pandas versions, and
# pandas >= 2.0 no longer produces an 'index' column to merge on.
yv = y.value_counts().rename_axis('index').reset_index(name='total')
nv = n.value_counts().rename_axis('index').reset_index(name='target')
ynv = yv.merge(nv, how='outer', on='index')
display(ynv)

display(tvc)
# Overall share of the target class; used as the null-hypothesis proportion.
tp = float(tvc.values[1]) / sum(tvc.values)
print(tp)

# Values that never occur in the target class come out of the outer merge
# as NaN; treat them as a count of zero.
c1 = ynv.iloc[:, 1].fillna(0)
c2 = ynv.iloc[:, 2].fillna(0)

# One-sample z-test per value of `col`: does its target share differ from
# the overall target share `tp`?
zs = []
ps = []
for total_count, target_count in zip(c1, c2):
    z_stat, p_value = proportions_ztest(nobs=total_count, count=target_count, value=tp)
    zs.append(z_stat)
    ps.append(round(p_value, 4))

# Result table columns: value, total count, target count, target share
# within the value, share of all target rows, z statistic, p-value.
rd = pd.DataFrame.from_dict({'값': ynv.iloc[:,0],'전체':c1,'타겟':c2,'타겟 비율':c2/c1,'전체 타겟 중 비율': c2/sum(c2),
                           'z' : zs, 'p':ps})
print(col)
display(rd)