In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

In [20]:
# Read the cleaned train/test splits from the shared clean-data folder
train_csv = '../datasets/clean/train_clean.csv'
test_csv = '../datasets/clean/test_clean.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

## Create polynomial features

In [21]:
# Candidate columns for polynomial expansion: anything with more than two
# distinct values (i.e. not a 0/1 dummy), leaving out the target itself.
p_features = [
    name
    for name in train
    if name != 'SalePrice' and len(train[name].unique()) > 2
]

In [22]:
# Display the candidate (non-dummy, non-target) columns collected in the previous cell
p_features

['2nd Flr SF',
 '3Ssn Porch',
 'Bedroom AbvGr',
 'Bsmt Cond',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Bsmt Qual',
 'Bsmt Unf SF',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Enclosed Porch',
 'Exter Cond',
 'Fireplaces',
 'Full Bath',
 'Garage Area',
 'Garage Qual',
 'Gr Liv Area',
 'Half Bath',
 'Heating QC',
 'Kitchen AbvGr',
 'Kitchen Qual',
 'Lot Frontage',
 'Low Qual Fin SF',
 'Mas Vnr Area',
 'Misc Val',
 'Mo Sold',
 'Open Porch SF',
 'Overall Cond',
 'Overall Qual',
 'Screen Porch',
 'Total Bsmt SF',
 'Wood Deck SF',
 'Yr Sold']

In [23]:
# Hand-picked subset of the candidates; these are the columns that get expanded
# into squared and pairwise-interaction terms in the next cell.
p_features = [
    'Gr Liv Area',
    'Full Bath',
    'Kitchen Qual',
    'Mo Sold',
    'Yr Sold',
    'Lot Frontage',
    'Overall Qual',
    'Mas Vnr Area',
    'Total Bsmt SF',
    'Heating QC',
]

In [24]:
# Expand the selected columns into every degree-2 term (squares and pairwise
# products). include_bias=False leaves out the constant column.
poly_train = PolynomialFeatures(2, include_bias=False)
# The test set must go through the transformer fitted on train, not a freshly
# fitted one. `poly_test` is kept as a name, but it is the same fitted object.
poly_test = poly_train

poly_features = p_features

# Fit on train only; test is transformed with the already-fitted expansion
train_poly = poly_train.fit_transform(train[poly_features])
test_poly = poly_test.transform(test[poly_features])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(poly_train, 'get_feature_names_out'):
    poly_names = list(poly_train.get_feature_names_out(poly_features))
else:
    poly_names = list(poly_train.get_feature_names(poly_features))

# Carry over the source index so the index-based assignment below (and the
# index-based merge later in the notebook) line up row for row.
poly_df = pd.DataFrame(train_poly, columns=poly_names, index=train.index)
poly_df_test = pd.DataFrame(test_poly, columns=poly_names, index=test.index)

# Attach the target so correlations against it can be computed; train only
poly_df['SalePrice'] = train['SalePrice']
poly_df.head()

Unnamed: 0,Gr Liv Area,Full Bath,Kitchen Qual,Mo Sold,Yr Sold,Lot Frontage,Overall Qual,Mas Vnr Area,Total Bsmt SF,Heating QC,...,Overall Qual Mas Vnr Area,Overall Qual Total Bsmt SF,Overall Qual Heating QC,Mas Vnr Area^2,Mas Vnr Area Total Bsmt SF,Mas Vnr Area Heating QC,Total Bsmt SF^2,Total Bsmt SF Heating QC,Heating QC^2,SalePrice
0,1479.0,2.0,3.0,3.0,2010.0,58.285434,6.0,289.0,725.0,4.0,...,1734.0,4350.0,24.0,83521.0,209525.0,1156.0,525625.0,2900.0,16.0,130500
1,2122.0,2.0,3.0,4.0,2009.0,43.0,7.0,132.0,913.0,4.0,...,924.0,6391.0,28.0,17424.0,120516.0,528.0,833569.0,3652.0,16.0,220000
2,1057.0,1.0,3.0,1.0,2010.0,68.0,5.0,0.0,1057.0,2.0,...,0.0,5285.0,10.0,0.0,0.0,0.0,1117249.0,2114.0,4.0,109000
3,1444.0,2.0,2.0,4.0,2010.0,73.0,5.0,0.0,384.0,3.0,...,0.0,1920.0,15.0,0.0,0.0,0.0,147456.0,1152.0,9.0,174000
4,1445.0,2.0,2.0,3.0,2010.0,82.0,6.0,0.0,676.0,2.0,...,0.0,4056.0,12.0,0.0,0.0,0.0,456976.0,1352.0,4.0,138500


In [25]:
# Find generated features with a strong linear relationship to sale price.
# Minimum correlation with SalePrice for a feature to be kept.
CORR_THRESHOLD = .8

# Every column of poly_df other than the target is a generated feature, so the
# names are read from the frame itself instead of asking the transformer
# (get_feature_names() no longer exists in scikit-learn >= 1.2).
candidate_cols = [c for c in poly_df.columns if c != 'SalePrice']

# Compute the correlation matrix once and keep only the SalePrice column
price_corr = poly_df.corr()[['SalePrice']].loc[candidate_cols, :]

# Rows above the threshold, strongest first. NaN correlations fail the
# comparison and are therefore excluded as well.
keepers = price_corr[price_corr['SalePrice'] > CORR_THRESHOLD].sort_values('SalePrice', ascending=False)
keepers

Unnamed: 0,SalePrice
Gr Liv Area Overall Qual,0.872971
Gr Liv Area Kitchen Qual,0.850393
Kitchen Qual Overall Qual,0.8307
Overall Qual^2,0.830575
Overall Qual Total Bsmt SF,0.828402
Gr Liv Area Total Bsmt SF,0.820087
Kitchen Qual Total Bsmt SF,0.812357
Overall Qual,0.803462
Yr Sold Overall Qual,0.803439


In [26]:
# Names of the generated columns to carry forward.
# NOTE(review): the slice drops the last two rows of `keepers` by position. In
# the recorded output those are 'Overall Qual' and 'Yr Sold Overall Qual' --
# confirm that is still the intended pair if the ranking ever changes.
poly_cols = keepers.index[:-2]
poly_cols

Index(['Gr Liv Area Overall Qual', 'Gr Liv Area Kitchen Qual',
       'Kitchen Qual Overall Qual', 'Overall Qual^2',
       'Overall Qual Total Bsmt SF', 'Gr Liv Area Total Bsmt SF',
       'Kitchen Qual Total Bsmt SF'],
      dtype='object')

In [27]:
# Reduce both polynomial frames to the shortlisted columns only
poly_df, poly_df_test = (
    frame.loc[:, poly_cols] for frame in (poly_df, poly_df_test)
)

In [28]:
# Dry run of the index-on-index merge: the row count should be unchanged and
# the column count should grow by the number of polynomial columns.
train.merge(poly_df, left_index=True, right_index=True).shape

(2049, 223)

In [29]:
# Shape of train before the polynomial columns are added, for comparison with the merged shape
train.shape

(2049, 216)

In [30]:
# Append the polynomial columns to each dataset, matching rows by index
index_join = dict(left_index=True, right_index=True)
train = train.merge(poly_df, **index_join)
test = test.merge(poly_df_test, **index_join)

In [31]:
# Both datasets should report the same number of columns
print(train.shape, test.shape, sep='\n')

(2049, 223)
(878, 223)


In [32]:
# Ensure columns are ordered consistently.
# 'SalePrice' is left out of train and 'Id' out of test before comparing.
train_cols = [c for c in train.columns if c != 'SalePrice']
test_cols = [c for c in test.columns if c != 'Id']

# zip() stops at the shorter list, so a missing or extra trailing column would
# otherwise go unnoticed; compare the lengths explicitly first.
if len(train_cols) != len(test_cols):
    print('Column count differs:', len(train_cols), len(test_cols))

# Print every position where the two datasets disagree; no output means the
# compared positions match.
for c1, c2 in zip(train_cols, test_cols):
    if c1 != c2:
        print(c1,c2)

## Save engineered dataset

In [33]:
# Write the engineered training set; index=False leaves the row index out of the file
train.to_csv('../datasets/clean/train_engineered.csv', index=False)

In [34]:
# Write the engineered test set; index=False leaves the row index out of the file
test.to_csv('../datasets/clean/test_engineered.csv', index=False)