## 범주형 변수의 빈도 생성
- 우리가 찾으려는 것의 90%는 빈도분포에서 찾을 수 있다.
- 데이터프레임에 단방향, 양방향의 빈도분포(크로스탭)를 많이 수행할수록 데이터를 더 잘 이해할 수 있다.

### 단방향 빈도분포
- value_counts
- 범주형 변수 그 자체로만 인사이트를 도출해내는 것

In [1]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

# Load nls97.csv and use the personid column as the row index.
nls97 = pd.read_csv('C:/data-cleansing-main/Chapter03/data/nls97.csv')
nls97 = nls97.set_index('personid')

In [2]:
# Convert every object-dtype column to the category dtype.
# Assigning through ``nls97.loc[:, mask] = ...`` writes the new values into the
# existing object columns on recent pandas versions, which leaves the dtype as
# ``object``. Assigning by column label replaces the columns, so the category
# dtype is kept. DataFrame.astype('category') converts column by column, the
# same as apply(lambda x: x.astype('category')).
objcols = nls97.select_dtypes(include=['object']).columns
nls97[objcols] = nls97[objcols].astype('category')

  nls97.loc[:,nls97.dtypes=='object'] = nls97.select_dtypes(['object']).apply(lambda x:x.astype('category'))


In [8]:
# List the category-dtype columns and count the missing values in each.
# select_dtypes picks columns by dtype via include/exclude; .columns keeps
# only the column labels.
catcols = nls97.select_dtypes(include='category').columns
nls97[catcols].isnull().sum()

gender                      0
maritalstatus            2312
weeklyhrscomputer        2274
weeklyhrstv              2273
highestdegree              31
govprovidejobs           7151
govpricecontrols         7125
govhealthcare            7110
govelderliving           7112
govindhelp               7169
govunemp                 7173
govincomediff            7209
govcollegefinance        7109
govdecenthousing         7137
govprotectenvironment    7124
colenrfeb97              7734
colenroct97               483
colenrfeb98               483
colenroct98                96
colenrfeb99               119
colenroct99               133
colenrfeb00               164
colenroct00               179
colenrfeb01               198
colenroct01               226
colenrfeb02               252
colenroct02               286
colenrfeb03               326
colenroct03               362
colenrfeb04               406
colenroct04               438
colenrfeb05               476
colenroct05               513
colenrfeb0

In [9]:
# Frequency of each value in a single column.
nls97['maritalstatus'].value_counts()

Married          3066
Never-married    2766
Divorced          663
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

In [10]:
# sort=False turns off ordering by frequency.
nls97['maritalstatus'].value_counts(sort=False)

Divorced          663
Married          3066
Never-married    2766
Separated         154
Widowed            23
Name: maritalstatus, dtype: int64

In [11]:
# normalize=True reports proportions instead of counts.
nls97['maritalstatus'].value_counts(normalize=True, sort=False)

Divorced         0.099371
Married          0.459532
Never-married    0.414568
Separated        0.023082
Widowed          0.003447
Name: maritalstatus, dtype: float64

In [12]:
# Proportion of each response for every government-responsibility column
# (the columns whose name contains 'gov').
# apply() calls Series.value_counts once per column and passes normalize=True
# through. The top-level pd.value_counts helper does the same thing but is
# deprecated in recent pandas, so the Series method is used instead.
nls97.filter(like='gov').apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,govprovidejobs,govpricecontrols,govhealthcare,govelderliving,govindhelp,govunemp,govincomediff,govcollegefinance,govdecenthousing,govprotectenvironment
1. Definitely,0.247681,0.541689,0.665422,0.700321,0.42865,0.218112,0.324507,0.7344,0.442339,0.668817
2. Probably,0.336607,0.334051,0.271078,0.247863,0.411019,0.403092,0.284507,0.2304,0.433676,0.286559
3. Probably not,0.252046,0.086606,0.045358,0.037927,0.119008,0.262838,0.228732,0.026667,0.100162,0.02957
4. Definitely not,0.163666,0.037655,0.018143,0.013889,0.041322,0.115958,0.162254,0.008533,0.023822,0.015054


In [16]:
# Response proportions for the 'gov' columns, restricted to rows whose
# maritalstatus is "Married" (boolean-mask row selection: dataframe[condition]).
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
married = nls97[nls97.maritalstatus == "Married"]
married.filter(like='gov').apply(pd.Series.value_counts, normalize=True)

Unnamed: 0,govprovidejobs,govpricecontrols,govhealthcare,govelderliving,govindhelp,govunemp,govincomediff,govcollegefinance,govdecenthousing,govprotectenvironment
1. Definitely,0.173315,0.459864,0.56445,0.632293,0.372549,0.147265,0.25889,0.695418,0.356354,0.644505
2. Probably,0.328748,0.380952,0.359566,0.310719,0.445378,0.395512,0.273115,0.261456,0.493094,0.313433
3. Probably not,0.314993,0.112925,0.05156,0.037992,0.142857,0.328191,0.284495,0.033693,0.120166,0.028494
4. Definitely not,0.182944,0.046259,0.024423,0.018996,0.039216,0.129032,0.183499,0.009434,0.030387,0.013569


In [22]:
# Find the counts and proportions for every categorical column.
# Open (create or overwrite) the text file the report is written to; it is
# closed after the loop in the next cell. The encoding is given explicitly so
# the file does not depend on the platform's locale default.
freqout = open('C:/data-cleansing-main/frequencies.txt', 'w', encoding='utf-8')

In [23]:
# Write the counts and proportions of every category-dtype column to the
# report file opened in the previous cell (freqout).
try:
    for col in nls97.select_dtypes(include=['category']).columns:
        counts = nls97[col].value_counts(sort=False)
        shares = nls97[col].value_counts(normalize=True, sort=False)
        # One section per column: name, separator, counts, then proportions.
        print(col, '------------', 'frequencies', counts,
              'percentages', shares,
              sep='\n\n', end='\n\n\n', file=freqout)
finally:
    # Close the handle even if a column fails, so buffered output is flushed
    # and the file handle is released.
    freqout.close()