# Example of building pandas data pipelines with `pdpipe`
### Dr. Tirthajyoti Sarkar, Fremont, CA, Nov 2019

In [1]:
import pandas as pd
import numpy as np
import pdpipe as pdp

In [2]:
df = pd.read_csv("USA_Housing.csv")

In [3]:
round(df.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
3274,57944.66,5.77,7.02,4.3,45260.52,1137224.58,"357 Hanson Wall Apt. 560\nFranklinport, SD 426..."
3854,70030.02,6.22,7.66,3.05,26782.56,1261715.46,"31625 Forbes Terrace Apt. 550\nMasonview, WY 9..."
1671,78963.89,6.86,5.73,3.2,59974.63,2004396.37,"9461 Matthew Summit\nNorth Kathleen, NC 35103"
3629,81376.1,7.41,6.0,3.03,40151.93,1788829.24,52901 Griffin Spurs Apt. 216\nPort Williamches...
3650,70299.02,5.7,6.56,2.0,33147.37,1172619.6,"4947 Shawn Greens\nSouth Crystalville, NJ 68094"


In [4]:
df.shape

(5000, 7)

In [5]:
df.columns

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

In [6]:
round(df.describe().T,2)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Avg. Area Income,5000.0,68583.11,10657.99,17796.63,61480.56,68804.29,75783.34,107701.75
Avg. Area House Age,5000.0,5.98,0.99,2.64,5.32,5.97,6.65,9.52
Avg. Area Number of Rooms,5000.0,6.99,1.01,3.24,6.3,7.0,7.67,10.76
Avg. Area Number of Bedrooms,5000.0,3.98,1.23,2.0,3.14,4.05,4.49,6.5
Area Population,5000.0,36163.52,9925.65,172.61,29403.93,36199.41,42861.29,69621.71
Price,5000.0,1232072.65,353117.63,15938.66,997577.14,1232669.38,1471210.2,2469065.59


In [7]:
def size(n):
    """Bucket an average room count into a house-size label.

    Returns 'Small' when n <= 4, 'Medium' when 4 < n <= 6, and 'Big'
    for every other value (including NaN, for which both comparisons
    are False).
    """
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((4, 'Small'), (6, 'Medium')):
        if n <= upper_bound:
            return label
    return 'Big'

df['House_size']=df['Avg. Area Number of Rooms'].apply(size)

In [8]:
# Same assignment as the last line of the previous cell; re-running it just
# recomputes the identical 'House_size' column, so this cell is redundant.
df['House_size']=df['Avg. Area Number of Rooms'].apply(size)

In [9]:
round(df.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size
3920,66749.57,7.16,6.1,3.37,46553.98,1394637.65,1630 Castillo Summit Apt. 619\nEast Karenborou...,Big
2267,76992.82,6.53,6.05,4.46,24874.04,1212440.29,"8188 Henry Gardens\nPort Rogerberg, OK 27962-5231",Big
4302,61877.55,5.55,6.8,3.25,28604.94,1083745.37,"03529 Hull Mountains\nLake Zacharyshire, WY 94496",Big
4320,64426.87,5.5,8.95,3.31,18214.31,1055548.68,Unit 4466 Box 0788\nDPO AP 45943,Big
1796,78159.95,5.12,6.18,4.08,31111.62,1264972.44,"3276 Harris Pines Suite 433\nPort Paul, NY 662...",Big


### Drop a column

In [10]:
drop_age = pdp.ColDrop('Avg. Area House Age')

In [11]:
df2 = drop_age(df)

In [12]:
round(df2.sample(5))

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size
1384,43922.0,7.0,5.0,42251.0,747933.0,3112 Wells Parkway Apt. 414\nSouth Emilycheste...,Big
2442,43242.0,7.0,2.0,29479.0,629658.0,860 Graham Meadows Suite 412\nEast Kristenburg...,Big
2395,73612.0,8.0,4.0,38294.0,1487856.0,"827 Christina Divide\nNew Tanya, MO 26789-8710",Big
2768,51928.0,7.0,6.0,32257.0,970177.0,"8889 Ashley Dale\nNorth Robertmouth, IA 68137-...",Big
4820,50442.0,7.0,6.0,33440.0,644142.0,"0645 Arroyo Trail\nEast Michelle, LA 07906-8981",Big


### Chaining stages by adding them up

In [13]:
# Two stages added together form a pipeline: drop the house-age column,
# then one-hot encode the categorical 'House_size' column.
pipeline = (
    pdp.ColDrop('Avg. Area House Age')
    + pdp.OneHotEncode('House_size')
)

In [14]:
df3 = pipeline(df)

In [15]:
round(df3.sample(5))

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
1534,59902.0,8.0,6.0,30211.0,1088549.0,"343 Samuel Pass Suite 476\nKimberlyberg, ME 70939",0,0
1682,64750.0,8.0,4.0,49537.0,1781211.0,"4293 Abigail Tunnel\nWest Brandi, ND 75636",0,0
502,90437.0,8.0,6.0,13340.0,1647279.0,"PSC 8841, Box 4711\nAPO AA 80496-2707",0,0
1069,82185.0,5.0,4.0,29259.0,1162735.0,Unit 4395 Box 2223\nDPO AE 68423-9498,1,0
4052,82115.0,7.0,4.0,46052.0,1796532.0,"591 Fernandez Ports\nNorth Angelaside, MP 72848",0,0


In [16]:
def price_tag(x, threshold=250000):
    """Label a price as 'keep' or 'drop' relative to a cut-off.

    Parameters
    ----------
    x : number
        Price value to classify.
    threshold : number, optional
        Cut-off value. Defaults to 250000, the value that was previously
        hard-coded, so single-argument calls behave exactly as before.

    Returns
    -------
    str
        'keep' when ``x`` is strictly greater than ``threshold``,
        otherwise 'drop' (this includes NaN, for which ``>`` is False).
    """
    return 'keep' if x > threshold else 'drop'

In [17]:
# Rebuild the pipeline with a third stage that derives a 'Price_tag' column
# from 'Price' via price_tag; drop=False keeps the source 'Price' column.
pipeline = (
    pdp.ColDrop('Avg. Area House Age')
    + pdp.OneHotEncode('House_size')
    + pdp.ApplyByCols('Price', price_tag, 'Price_tag', drop=False)
)

In [18]:
df4 = pipeline(df)

In [19]:
df4.shape

(5000, 9)

In [20]:
round(df4.sample(5),2)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Price_tag,Address,House_size_Medium,House_size_Small
4512,71162.38,7.22,3.4,39134.55,1434240.36,keep,"4619 Angela Rest Apt. 586\nPaulport, IL 15870",0,0
2125,75434.55,8.25,4.04,43181.91,1613325.65,keep,"746 Steele Terrace Suite 937\nHughesborough, M...",0,0
2304,72110.5,7.05,5.33,23411.95,1208761.06,keep,"952 Richard Spring Apt. 574\nMoniquemouth, WA ...",0,0
3366,72261.2,7.35,3.07,42681.82,1568433.33,keep,"367 Christopher Isle\nNew Tina, MO 23475-8112",0,0
3916,46285.84,8.76,3.42,42246.02,1153321.6,keep,"4366 Lewis Crescent Suite 780\nPaynefurt, VT 6...",0,0


In [21]:
# Full pipeline: the last three stages together act as a row filter --
# tag each row by price, drop the rows tagged 'drop', then remove the
# helper 'Price_tag' column again.
pipeline = (
    pdp.ColDrop('Avg. Area House Age')
    + pdp.OneHotEncode('House_size')
    + pdp.ApplyByCols('Price', price_tag, 'Price_tag', drop=False)
    + pdp.ValDrop(['drop'], 'Price_tag')
    + pdp.ColDrop('Price_tag')
)

In [22]:
df5 = pipeline(df)

In [23]:
df5.shape

(4990, 8)

### Scikit-learn scaling

In [24]:
# Scaling stage using scikit-learn's StandardScaler. The two one-hot indicator
# columns are explicitly excluded so they stay 0/1; in the output below the
# text 'Address' column is also left unchanged.
pipeline_scale = pdp.Scale('StandardScaler',exclude_columns=['House_size_Medium','House_size_Small'])

In [25]:
df6 = pipeline_scale(df5)

In [26]:
round(df6.sample(5),3)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
2063,2.067,-1.195,-1.379,-0.128,1.366,"06374 Martin Passage\nNew Shawnland, KS 59839-...",1,0
1001,-0.183,0.504,0.84,-1.328,-1.849,"827 Ferguson Isle\nRosebury, AL 61416-3167",0,0
3801,0.044,0.062,1.164,-0.352,0.3,Unit 8410 Box 5521\nDPO AP 20914-6877,0,0
4690,1.528,-0.738,0.071,0.123,0.048,"29600 Garcia Forest Suite 239\nWest Mark, NV 5...",0,0
3797,-1.155,-2.088,-1.443,-1.283,-1.671,"1629 James Pines\nPort John, LA 49420",1,0


### NLTK stages

In [27]:
pipeline_tokenize=pdp.TokenizeWords('Address')

In [28]:
df7 = pipeline_tokenize(df6)

In [29]:
df7.sample(5)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
2801,-2.16379,-0.161473,-1.32201,0.623847,-0.359488,"[525, Ashley, Course, Lake, Michelleville, ,, ...",0,0
646,-0.182429,-0.023628,-1.573084,0.104469,-0.187608,"[8961, Guerra, Motorway, Stephensburgh, ,, AR,...",0,0
838,-0.569561,0.185986,0.314022,0.532345,-0.206402,"[86533, Gould, Hills, Garciachester, ,, DE, 09...",0,0
4409,-0.320249,0.464002,0.014353,-1.373439,-0.458186,"[089, Smith, Gateway, Suite, 155, East, Christ...",0,0
1983,-0.004935,-0.580628,0.087245,0.836355,-0.598361,"[Unit, 8667, Box, 6237, DPO, AE, 76811-0261]",0,0


In [30]:
def extract_state(token):
    """Return the second-to-last element of ``token`` as a string.

    Used on the tokenized 'Address' column, where that position holds the
    state code, e.g. [..., 'DPO', 'AE', '76811-0261'] -> 'AE'.

    Raises IndexError if ``token`` has fewer than two elements.
    """
    second_to_last = token[-2]
    return str(second_to_last)

In [31]:
# Unlike the earlier 'Price_tag' stage, drop=False is not passed here; in the
# output further down, 'State' appears in place of the 'Address' column.
pipeline_state = pdp.ApplyByCols('Address',extract_state,result_columns='State')

In [32]:
df8=pipeline_state(df7)

In [33]:
round(df8.sample(5),3)

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,State,House_size_Medium,House_size_Small
2150,-0.902,0.709,1.156,-1.005,-1.566,AA,0,0
2330,0.042,0.092,-0.698,-0.412,-0.481,GU,0,0
3717,-1.454,0.991,1.011,-0.755,-1.877,CO,0,0
4187,-0.242,0.962,0.865,-0.975,-1.69,WV,0,0
260,-0.701,0.098,0.93,-0.989,-0.345,NV,0,0
