## 파이프라인 구축

In [2]:
import pandas as pd
import pdpipe as pdp

#### 캐글 데이터 USA 주택 가격

In [3]:
df = pd.read_csv('C:/Users/고태영/Downloads/USA_Housing.csv')

In [8]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg. Area Income,5000.0,68583.11,10657.991214,17796.63119,61480.562388,68804.29,75783.34,107701.7
Avg. Area House Age,5000.0,5.977222,0.991456,2.644304,5.322283,5.970429,6.650808,9.519088
Avg. Area Number of Rooms,5000.0,6.987792,1.005833,3.236194,6.29925,7.002902,7.665871,10.75959
Avg. Area Number of Bedrooms,5000.0,3.98133,1.234137,2.0,3.14,4.05,4.49,6.5
Area Population,5000.0,36163.52,9925.650114,172.610686,29403.928702,36199.41,42861.29,69621.71
Price,5000.0,1232073.0,353117.626581,15938.657923,997577.135049,1232669.0,1471210.0,2469066.0


#### House size column 생성

In [14]:
def size(n):
    if n <= 6.3 :
        return 'Small'
    elif 6.3 < n <= 7.7:
        return 'Medium'
    elif 7.7 < n :
        return 'Big'
    
df['House_size'] = df['Avg. Area Number of Rooms'].apply(size)

#### 열 제거 파이프라인 생성

In [17]:
drop_age = pdp.ColDrop('Avg. Area Number of Rooms')
df2 = drop_age(df)

In [18]:
df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg. Area Income,5000.0,68583.11,10657.991214,17796.63119,61480.562388,68804.29,75783.34,107701.7
Avg. Area House Age,5000.0,5.977222,0.991456,2.644304,5.322283,5.970429,6.650808,9.519088
Avg. Area Number of Bedrooms,5000.0,3.98133,1.234137,2.0,3.14,4.05,4.49,6.5
Area Population,5000.0,36163.52,9925.650114,172.610686,29403.928702,36199.41,42861.29,69621.71
Price,5000.0,1232073.0,353117.626581,15938.657923,997577.135049,1232669.0,1471210.0,2469066.0


In [19]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Bedrooms  5000 non-null   float64
 3   Area Population               5000 non-null   float64
 4   Price                         5000 non-null   float64
 5   Address                       5000 non-null   object 
 6   House_size                    5000 non-null   object 
dtypes: float64(5), object(2)
memory usage: 273.6+ KB


&nbsp;

#### 파이프라인 추가 및 제거

열 삭제 + 원핫인코딩 파이프라인

In [24]:
pipeline = pdp.ColDrop('Avg. Area Number of Rooms')
pipeline += pdp.OneHotEncode('House_size')

df3 = pipeline(df)

In [26]:
df3.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,79545.458574,5.682861,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",1,0
1,79248.642455,6.0029,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",1,0
2,61287.067179,5.86589,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,7.188236,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,0,1
4,59982.197226,5.040555,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,0,0


로우 삭제 파이프라인

In [27]:
# 주택가격이 250000 이하면 drop, 초과면 keep 태그 표시
def price_tag(x):
    if x > 250000 :
        return 'keep'
    else :
        return 'drop'

In [28]:
# 태그 표시 함수 적용하는 파이프라인 객체 적용
pipeline += pdp.ApplyByCols('Price', price_tag, 'Price_tag', drop = False)
df4 = pipeline(df)

In [30]:
print('df 로우 수 :',len(df),'\ndf4 로우 수 :',len(df4),sep='')

df 로우 수 :5000
df4 로우 수 :5000


In [31]:
df4.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Price_tag,Address,House_size_Medium,House_size_Small
0,79545.458574,5.682861,4.09,23086.800503,1059034.0,keep,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",1,0
1,79248.642455,6.0029,3.09,40173.072174,1505891.0,keep,"188 Johnson Views Suite 079\nLake Kathleen, CA...",1,0
2,61287.067179,5.86589,5.13,36882.1594,1058988.0,keep,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,7.188236,3.26,34310.242831,1260617.0,keep,USS Barnett\nFPO AP 44820,0,1
4,59982.197226,5.040555,4.23,26354.109472,630943.5,keep,USNS Raymond\nFPO AE 09386,0,0


In [35]:
# ValDrop 메소드로 price tag가 drop인 row 삭제
pipeline += pdp.ValDrop(['drop'], 'Price_tag')
df5 = pipeline(df)
print('df 로우 수 :',len(df),'\ndf5 로우 수 :',len(df5),sep=' ')

df 로우 수 : 5000 
df5 로우 수 : 4990


In [34]:
df5.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Price_tag,Address,House_size_Medium,House_size_Small
0,79545.458574,5.682861,4.09,23086.800503,1059034.0,keep,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",1,0
1,79248.642455,6.0029,3.09,40173.072174,1505891.0,keep,"188 Johnson Views Suite 079\nLake Kathleen, CA...",1,0
2,61287.067179,5.86589,5.13,36882.1594,1058988.0,keep,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,7.188236,3.26,34310.242831,1260617.0,keep,USS Barnett\nFPO AP 44820,0,1
4,59982.197226,5.040555,4.23,26354.109472,630943.5,keep,USNS Raymond\nFPO AE 09386,0,0


In [36]:
# price tag column 삭제
pipeline += pdp.ColDrop('Price_tag')
df6 = pipeline(df)
df6.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,79545.458574,5.682861,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",1,0
1,79248.642455,6.0029,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",1,0
2,61287.067179,5.86589,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,7.188236,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,0,1
4,59982.197226,5.040555,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,0,0


In [39]:
def extract_state(token):
    return str(token[-2])

pipeline_tokenizer = pdp.TokenizeWords('Address')
pipeline_state = pdp.ApplyByCols('Address', extract_state, result_columns='State')
pipeline_state_extract = pipeline_tokenizer + pipeline_state

AttributeError: module 'pdpipe' has no attribute 'TokenizeWords'