## 열 선택 및 정돈
- 데이터프레임에서 복수의 열을 전달하는 방법
1. [ ]을 이용해서 컬럼명으로 전달
2. loc, iloc 이용해서 전달

### 열 선택

In [19]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns',None)

nls97=pd.read_csv('C:/data-cleansing-main/Chapter03/data/nls97.csv')
nls97.set_index('personid',inplace=True)
# 전체 컬럼 중 객체 데이터형인 것만 골라서 범주형으로 바꾸기
nls97.loc[:,nls97.dtypes=='object']=nls97.select_dtypes(['object']).apply(lambda x:x.astype('category'))


  nls97.loc[:,nls97.dtypes=='object']=nls97.select_dtypes(['object']).apply(lambda x:x.astype('category'))


In [20]:
analysisdemo=nls97['gender']        # 열 이름과 일치하는 문자열을 대괄호 연산자에 전달하면, 시리즈가 반환됨
print(type(analysisdemo))

analysisdemo

<class 'pandas.core.series.Series'>


personid
100061    Female
100139      Male
100284      Male
100292      Male
100583      Male
           ...  
999291    Female
999406      Male
999543    Female
999698    Female
999963    Female
Name: gender, Length: 8984, dtype: category
Categories (2, object): ['Female', 'Male']

In [21]:
analysisdemo=nls97[['gender']]        # 컬럼명 한 개만을 원소로 갖는 리스트를 전달하면, 데이터프레임이 반환됨
print(type(analysisdemo))

analysisdemo

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
100061,Female
100139,Male
100284,Male
100292,Male
100583,Male
...,...
999291,Female
999406,Male
999543,Female
999698,Female


In [25]:
analysisdemo=nls97.loc[:,['gender']]    # loc 사용
type(analysisdemo)
analysisdemo

Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
100061,Female
100139,Male
100284,Male
100292,Male
100583,Male
...,...
999291,Female
999406,Male
999543,Female
999698,Female


In [27]:
analysisdemo=nls97.iloc[:,[0]]      # iloc 사용
type(analysisdemo)
analysisdemo

Unnamed: 0_level_0,gender
personid,Unnamed: 1_level_1
100061,Female
100139,Male
100284,Male
100292,Male
100583,Male
...,...
999291,Female
999406,Male
999543,Female
999698,Female


In [28]:
analysisdemo = nls97[['gender','maritalstatus', 'highestgradecompleted']]       # 대괄호로 복수의 열 불러오기
analysisdemo.shape
analysisdemo.head()

analysisdemo = nls97.loc[:,['gender','maritalstatus','highestgradecompleted']]  # loc로 복수의 열 불러오기, loc[행,열]
analysisdemo.shape
analysisdemo.head()

# use lists to select multiple columns
keyvars = ['gender','maritalstatus', 'highestgradecompleted','wageincome', 'gpaoverall','weeksworked17','colenroct17']  # 열 이름이 많을 경우 따로 변수로 만들기
analysiskeys = nls97[keyvars]
analysiskeys.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   gender                 8984 non-null   category
 1   maritalstatus          6672 non-null   category
 2   highestgradecompleted  6663 non-null   float64 
 3   wageincome             5091 non-null   float64 
 4   gpaoverall             6004 non-null   float64 
 5   weeksworked17          6670 non-null   float64 
 6   colenroct17            6734 non-null   category
dtypes: category(3), float64(4)
memory usage: 377.8 KB


In [33]:
# 컬럼명 필터링 - filter
analysiswork=nls97.filter(like='weeksworked')       # filter를 이용해 weeksworked가 포함된 열 모두 선택
analysiswork.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   weeksworked00  8603 non-null   float64
 1   weeksworked01  8564 non-null   float64
 2   weeksworked02  8556 non-null   float64
 3   weeksworked03  8490 non-null   float64
 4   weeksworked04  8458 non-null   float64
 5   weeksworked05  8403 non-null   float64
 6   weeksworked06  8340 non-null   float64
 7   weeksworked07  8272 non-null   float64
 8   weeksworked08  8186 non-null   float64
 9   weeksworked09  8146 non-null   float64
 10  weeksworked10  8054 non-null   float64
 11  weeksworked11  7968 non-null   float64
 12  weeksworked12  7747 non-null   float64
 13  weeksworked13  7680 non-null   float64
 14  weeksworked14  7612 non-null   float64
 15  weeksworked15  7389 non-null   float64
 16  weeksworked16  7068 non-null   float64
 17  weeksworked17  6670 non-null   float64
dtypes

In [50]:
# 자료형 별 필터링 - select_dtypes
analyisnums=nls97.select_dtypes(include=['number'])     # 숫자 자료형인 열
analyisnums.info()

analyiscats=nls97.select_dtypes(include=['category'])   # 범주 자료형인 열
analyiscats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   highestgradecompleted  6663 non-null   float64
 1   childathome            4791 non-null   float64
 2   childnotathome         4791 non-null   float64
 3   wageincome             5091 non-null   float64
 4   nightlyhrssleep        6706 non-null   float64
 5   birthmonth             8984 non-null   int64  
 6   birthyear              8984 non-null   int64  
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 1

In [39]:
# 열 이름의 리스트를 사용해 열 정돈 - 원하는 순서대로 열을 정렬해서 새로운 데이터프레임 생성
demo = ['gender','birthmonth','birthyear']
highschoolrecord = ['satverbal','satmath','gpaoverall', 'gpaenglish','gpamath','gpascience']
govresp = ['govprovidejobs','govpricecontrols', 'govhealthcare','govelderliving','govindhelp', 'govunemp','govincomediff','govcollegefinance',
  'govdecenthousing','govprotectenvironment']
demoadult = ['highestgradecompleted','maritalstatus', 'childathome','childnotathome','wageincome', 'weeklyhrscomputer','weeklyhrstv','nightlyhrssleep',
  'highestdegree']
weeksworked = ['weeksworked00','weeksworked01', 'weeksworked02','weeksworked03','weeksworked04', 'weeksworked05','weeksworked06',  'weeksworked07',
  'weeksworked08','weeksworked09','weeksworked10', 'weeksworked11','weeksworked12','weeksworked13', 'weeksworked14','weeksworked15','weeksworked16',
  'weeksworked17']
colenr = ['colenrfeb97','colenroct97','colenrfeb98','colenroct98','colenrfeb99',  'colenroct99','colenrfeb00','colenroct00','colenrfeb01',
  'colenroct01','colenrfeb02','colenroct02','colenrfeb03','colenroct03','colenrfeb04','colenroct04','colenrfeb05','colenroct05',
  'colenrfeb06','colenroct06','colenrfeb07','colenroct07','colenrfeb08','colenroct08','colenrfeb09','colenroct09','colenrfeb10',
  'colenroct10','colenrfeb11','colenroct11','colenrfeb12','colenroct12','colenrfeb13','colenroct13',  'colenrfeb14','colenroct14',
  'colenrfeb15','colenroct15','colenrfeb16','colenroct16','colenrfeb17','colenroct17']

nls97 = nls97[demoadult + demo + highschoolrecord + govresp + weeksworked + colenr]
nls97.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 88 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   highestgradecompleted  6663 non-null   float64 
 1   maritalstatus          6672 non-null   category
 2   childathome            4791 non-null   float64 
 3   childnotathome         4791 non-null   float64 
 4   wageincome             5091 non-null   float64 
 5   weeklyhrscomputer      6710 non-null   category
 6   weeklyhrstv            6711 non-null   category
 7   nightlyhrssleep        6706 non-null   float64 
 8   highestdegree          8953 non-null   category
 9   gender                 8984 non-null   category
 10  birthmonth             8984 non-null   int64   
 11  birthyear              8984 non-null   int64   
 12  satverbal              1406 non-null   float64 
 13  satmath                1407 non-null   float64 
 14  gpaoverall             6004 non-n

In [51]:
# select_dtypes 로 특정 열 제외
nls97.select_dtypes(exclude=['category']).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8984 entries, 100061 to 999963
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   highestgradecompleted  6663 non-null   float64
 1   childathome            4791 non-null   float64
 2   childnotathome         4791 non-null   float64
 3   wageincome             5091 non-null   float64
 4   nightlyhrssleep        6706 non-null   float64
 5   birthmonth             8984 non-null   int64  
 6   birthyear              8984 non-null   int64  
 7   satverbal              1406 non-null   float64
 8   satmath                1407 non-null   float64
 9   gpaoverall             6004 non-null   float64
 10  gpaenglish             5798 non-null   float64
 11  gpamath                5766 non-null   float64
 12  gpascience             5684 non-null   float64
 13  weeksworked00          8603 non-null   float64
 14  weeksworked01          8564 non-null   float64
 1

In [52]:
# filter 연산자 심화 - 정규식
nls97.filter(regex='income')        # income이 있는 열 선택

Unnamed: 0_level_0,wageincome,govincomediff
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
100061,12500.0,
100139,120000.0,
100284,58000.0,
100292,,
100583,30000.0,
...,...,...
999291,35000.0,
999406,116000.0,
999543,,
999698,,
