# House Prices: Advanced Regression Techniques

In [1]:
import pandas as pd
import numpy as np

In [42]:
# Read the training and test CSVs from the local data/ directory in one pass.
all_train_set, all_test_set = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [43]:
# Peek at the rows at positions 1, 2 and 3 (the slice end is exclusive).
all_train_set.iloc[1:4]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


In [65]:
# Distinct values present in the Alley column (NaN is reported as a value).
all_train_set['Alley'].unique()

array([nan, 'Grvl', 'Pave'], dtype=object)

## Clean Data

In [98]:
# Impute missing LotFrontage with 0.0; every other column keeps its NaNs.
# `assign` returns a new frame, so all_train_set itself is left untouched.
train_set = all_train_set.assign(
    LotFrontage=all_train_set['LotFrontage'].fillna(0.0)
)

## Validate a Model

In [100]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import DictVectorizer

In [116]:
class ColumnSelector(TransformerMixin):
    """Stateless transformer that keeps only the configured columns of ``X``.

    Parameters
    ----------
    columns : list, optional
        Column labels used to index the input as ``X[columns]``. When
        omitted, an empty selection is used.
    """

    def __init__(self, columns=None):
        # Build a fresh list per instance: a literal ``[]`` default is a
        # single object shared by every instance that relies on it.
        self.columns = [] if columns is None else columns

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` also
        work when no target is supplied.
        """
        return self

    def transform(self, X, **transform_params):
        """Return ``X[self.columns]``."""
        return X[self.columns]
    

class FillNaTransformer(TransformerMixin):
    """Stateless transformer that replaces missing values via ``X.fillna``.

    Parameters
    ----------
    fill_with
        Passed unchanged as the argument of ``X.fillna(...)``.
    """

    def __init__(self, fill_with):
        self.fill_with = fill_with

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` also
        work when no target is supplied.
        """
        return self

    def transform(self, X, **transform_params):
        """Return ``X.fillna(self.fill_with)``."""
        return X.fillna(self.fill_with)
    

class DebugTransformer(TransformerMixin):
    """Pass-through step that prints whatever flows through it.

    Intended to be dropped into a pipeline while debugging: ``fit`` and
    ``transform`` print a marker line followed by ``X`` and ``y``, and the
    data is returned unmodified.
    """

    def fit(self, X, y=None):
        """Print the fit inputs and return ``self``.

        ``y`` is optional (matching ``transform``) so that ``fit(X)`` and
        ``fit_transform(X)`` also work when no target is supplied.
        """
        print("fit")
        print(X)
        print(y)
        return self

    def transform(self, X, y=None):
        """Print the transform inputs and return ``X`` unchanged."""
        print("transform")
        print(X)
        print(y)
        return X
    

class FactorExtractor:
    """Turn one categorical column into a list of single-key dicts.

    ``transform`` produces ``[{factor: value}, ...]`` with one dict per entry
    of ``data[factor]``; any entry that is not a string (e.g. NaN or None)
    is replaced by the placeholder ``'_MISSING_'``.

    Parameters
    ----------
    factor
        Key used both to read ``data[factor]`` and as the key of every
        emitted dict.
    """

    def __init__(self, factor):
        self.factor = factor

    def transform(self, data):
        """Return one ``{factor: normalized_value}`` dict per entry of ``data[factor]``."""
        return [{self.factor: self.normalize(value)} for value in data[self.factor]]

    def fit(self, *_):
        """Nothing is learned; accept and ignore any positional arguments."""
        return self

    def normalize(self, tag):
        """Return ``tag`` if it is a string, otherwise ``'_MISSING_'``."""
        # isinstance rather than an exact type comparison, so that str
        # subclasses are kept as real labels instead of being mistaken for
        # missing values.
        return tag if isinstance(tag, str) else '_MISSING_'

In [118]:
# Feature union feeding a linear regression:
#   * LotFrontage / LotArea are passed through as-is,
#   * MSSubClass is one-hot encoded,
#   * Alley is vectorised, with non-string entries mapped to '_MISSING_'
#     by FactorExtractor.
model = Pipeline([
    ('features', FeatureUnion([
        ('continuous', ColumnSelector(['LotFrontage', 'LotArea'])),
        ('MSSubClass', Pipeline([
            ('extract', ColumnSelector(['MSSubClass'])),
            # A held-out fold or the test set can contain MSSubClass codes
            # that were absent from the rows used for fitting; encode those
            # as all-zero rows instead of raising at transform time.
            ('one_hot', OneHotEncoder(handle_unknown='ignore'))
        ])),
        ('Alley', Pipeline([
            ('extract', FactorExtractor('Alley')),
            ('label', DictVectorizer(sparse=False))
        ]))
    ])),
    ('regressor', LinearRegression())
])

# 5-fold cross-validation, printing the RMSE of each held-out fold.
# KFold is left at its default of no shuffling, so the folds are contiguous
# blocks of rows and do not depend on the random seed set below.
kfold = KFold(n_splits=5)
np.random.seed(0)

for (train_idx, cv_idx) in kfold.split(train_set):
    construct = train_set.iloc[train_idx]
    validate = train_set.iloc[cv_idx]

    # Keep the target out of the feature frames so that it can never be
    # picked up as a predictor if the model's column list changes.
    construct_X = construct.drop('SalePrice', axis=1)
    construct_y = construct['SalePrice']

    validate_X = validate.drop('SalePrice', axis=1)
    validate_y = validate['SalePrice']

    model.fit(construct_X, y=construct_y)
    predictions = model.predict(validate_X)
    rmse = np.sqrt(mean_squared_error(validate_y, predictions))
    print(rmse)

59179.59863117269
69839.72582287139
71639.09293732487
59103.6426939941
72506.79352328442


In [137]:
# Inspect what the fitted Alley branch produces for the full training frame.
# The branch is looked up by its name rather than by its position in the
# union, so reordering or adding branches cannot silently select another one.
dict(model.named_steps['features'].transformer_list)['Alley'].transform(all_train_set)

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [138]:
# Raw Alley column, for comparison with the encoded matrix.
all_train_set.loc[:, 'Alley']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       NaN
15       NaN
16       NaN
17       NaN
18       NaN
19       NaN
20       NaN
21      Grvl
22       NaN
23       NaN
24       NaN
25       NaN
26       NaN
27       NaN
28       NaN
29       NaN
        ... 
1430     NaN
1431     NaN
1432    Grvl
1433     NaN
1434     NaN
1435     NaN
1436     NaN
1437     NaN
1438     NaN
1439     NaN
1440     NaN
1441     NaN
1442     NaN
1443     NaN
1444     NaN
1445     NaN
1446     NaN
1447     NaN
1448     NaN
1449     NaN
1450     NaN
1451     NaN
1452     NaN
1453     NaN
1454    Pave
1455     NaN
1456     NaN
1457     NaN
1458     NaN
1459     NaN
Name: Alley, Length: 1460, dtype: object