## 조건에 따른 시리즈 값 변경

In [1]:
import pandas as pd
import numpy as np

# Load nls97b.csv and use its personid column as the row index.
nls97 = pd.read_csv('C:/data-cleansing-main/Chapter06/data/nls97b.csv').set_index('personid')
# Load landtemps2019avgs.csv with the default integer index.
landtemps = pd.read_csv('C:/data-cleansing-main/Chapter06/data/landtemps2019avgs.csv')

### 넘파이 - where
- 범주형 시리즈 생성

In [2]:
# Elevation at the 0.2, 0.4, 0.6, 0.8 and 1.0 quantiles. arange excludes its
# stop value, so a stop of 1.1 is used to make sure 1.0 is generated.
landtemps['elevation'].quantile(q=np.arange(0.2, 1.1, 0.2))

0.2      48.0
0.4     190.5
0.6     393.2
0.8    1066.8
1.0    9999.0
Name: elevation, dtype: float64

In [5]:
# Use np.where to build a categorical series with two values: rows whose
# elevation exceeds the 80th percentile become 'High', every other row 'Low'
# (a missing elevation compares False and therefore also lands in 'Low').
above_p80 = landtemps['elevation'] > landtemps['elevation'].quantile(0.8)
landtemps['elevation_group'] = np.where(above_p80, 'High', 'Low')
landtemps['elevation_group'] = landtemps['elevation_group'].astype('category')
# Summarise elevation per group to see where the cut fell.
landtemps.groupby(['elevation_group'])['elevation'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
elevation_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,2409,1067.0,9999.0
Low,9686,-350.0,1066.8


In [6]:
# Use nested np.where calls to build a categorical series with three values:
# 'High' above the 80th-percentile elevation, 'Median' above the median
# elevation, and 'Low' for everything else.
landtemps['elevation_group'] = np.where(
    landtemps['elevation'] > landtemps['elevation'].quantile(0.8),
    'High',
    # A where inside a where: the inner call supplies the label for rows
    # that fail the outer test.
    np.where(landtemps['elevation'] > landtemps['elevation'].median(), 'Median', 'Low'),
)
landtemps['elevation_group'] = landtemps['elevation_group'].astype('category')
# Summarise elevation per group to see where the cuts fell.
landtemps.groupby(['elevation_group'])['elevation'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
elevation_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
High,2409,1067.0,9999.0
Low,6056,-350.0,271.3
Median,3630,271.4,1066.8


### 넘파이 - select
- 조건 리스트를 순서대로 평가하여, 처음으로 참이 되는 조건에 대응하는 값을 할당 (모든 조건이 거짓이면 기본값 사용)

In [9]:
# np.select walks the condition list in order and, for each row, uses the
# value paired with the first condition that is True; rows matching none of
# them get the default. The combined condition therefore has to come first,
# otherwise the single conditions would claim its rows.
# Rows with a missing gpaoverall never satisfy the "< 2" test.
low_gpa = nls97['gpaoverall'] < 2
no_diploma = nls97['highestdegree'] == '0. None'
test = [low_gpa & no_diploma, no_diploma, low_gpa]
result = ['1. Low GPA and No Diploma', '2. No Diploma', '3. Low GPA']
nls97['hsachieve'] = np.select(test, result, default='4. Did Okay')
nls97[['hsachieve', 'gpaoverall', 'highestdegree']].head()

Unnamed: 0_level_0,hsachieve,gpaoverall,highestdegree
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,4. Did Okay,3.06,2. High School
100139,4. Did Okay,,2. High School
100284,2. No Diploma,,0. None
100292,4. Did Okay,3.45,4. Bachelors
100583,4. Did Okay,2.91,2. High School


In [11]:
# Frequency of each hsachieve label, ordered by label rather than by count.
nls97['hsachieve'].value_counts().sort_index()

1. Low GPA and No Diploma      95
2. No Diploma                 858
3. Low GPA                    459
4. Did Okay                  7572
Name: hsachieve, dtype: int64

### 💥 lambda를 사용해 여러 개의 열을 한 문장으로 테스트

In [15]:
# Test every column whose name contains 'colenr' in a single statement: the
# lambda flags values whose first character is '3', and any(axis=1) marks a
# row True when at least one of those columns is flagged.
# Original note: the colenr columns hold each person's enrollment status per
# year ("2~10월", i.e. February/October) -- TODO confirm against the codebook.
# NOTE(review): presumably codes starting with '3' mean enrollment in a
# bachelor's programme, given the column name -- confirm.
colenr_cols = nls97.filter(like='colenr')
nls97['baenrollment'] = colenr_cols.apply(lambda col: col.str[:1].eq('3')).any(axis=1)
nls97['baenrollment'].value_counts()

False    5085
True     3899
Name: baenrollment, dtype: int64

### 여러 시리즈의 값을 바탕으로 값을 할당하는 함수 생성

In [17]:
def getsleepdeprivedreason(row):
    """Classify why a respondent is, or is not, sleep deprived.

    Args:
        row: An object exposing the attributes ``nightlyhrssleep``,
            ``weeksworked16``, ``weeksworked17``, ``childathome``,
            ``wageincome`` and ``highestgradecompleted`` (for example a
            DataFrame row passed in by ``apply(..., axis=1)``).

    Returns:
        One of "Not Sleep Deprived", "Child Rearing", "Other Reasons",
        "Work Pressure", "Income Pressure" or "Unknown".

    Note:
        Any comparison against NaN is False. NaN hours of sleep therefore
        give "Unknown", and NaN weeks worked fall through to the
        work/income branch.
    """
    hours_slept = row.nightlyhrssleep

    # Six or more hours a night counts as not sleep deprived.
    if hours_slept >= 6:
        return "Not Sleep Deprived"

    # Zero, negative or missing hours cannot be classified.
    if not hours_slept > 0:
        return "Unknown"

    # Fewer than 80 weeks worked across the two years: the cause is looked
    # for at home rather than at work.
    if row.weeksworked16 + row.weeksworked17 < 80:
        return "Child Rearing" if row.childathome > 2 else "Other Reasons"

    # Otherwise split on earnings / education.
    if row.wageincome >= 62000 or row.highestgradecompleted >= 16:
        return "Work Pressure"
    return "Income Pressure"

In [18]:
# Run the function over the whole frame with apply: axis=1 sends each
# DataFrame row to the function. The resulting object series is converted to
# category to reduce memory use.
row_reasons = nls97.apply(getsleepdeprivedreason, axis=1)
nls97['sleepdeprivedreason'] = row_reasons.astype('category')
nls97['sleepdeprivedreason'].value_counts()

Not Sleep Deprived    5595
Unknown               2286
Income Pressure        462
Work Pressure          281
Other Reasons          272
Child Rearing           88
Name: sleepdeprivedreason, dtype: int64