## Generalised linear model for house price prediction
Fit a generalised linear model (GLM) with a Poisson distribution to predict house prices.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load house price data
data_path = '../../data/house_prices/'
trainset = pd.read_csv(data_path + 'train.csv', low_memory=False)

print('No. of training samples: ', len(trainset))

# Get list of numerical features to use (one column name per line).
# The file lives in the same directory as train.csv, so build its path from
# data_path rather than repeating the directory. Surrounding whitespace is
# stripped and blank lines are skipped so that a trailing empty line or
# Windows line endings cannot turn into a bogus column name.
with open(data_path + 'num_features_0.txt', 'r') as fp:
    num_features = [line.strip() for line in fp if line.strip()]

# Same feature list plus the 'Id' column, used only for the NaN filtering below
num_features_id = num_features + ['Id']

# Remove nans in training set
trainset_dropna = trainset[num_features_id].dropna()
print('No. of houses remaining in training set: ', len(trainset_dropna))

# Split data into features and label (price)
Xtrain = trainset_dropna[num_features]
# dropna() keeps the original index labels of the surviving rows, so the
# matching prices must be selected by label (.loc), not by position (.iloc);
# the two only coincide while trainset has a default 0..n-1 index.
ytrain = trainset.loc[trainset_dropna.index, 'SalePrice']

print('No. of features: ', len(Xtrain.columns))

No. of training samples:  1460
No. of houses remaining in training set:  1121
No. of features:  30


In [12]:
# Poisson GLM via scikit-learn, fitted on standardised features
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling from Xtrain and apply it in one step
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)

# Build the regressor first, then fit it on the scaled features and raw prices
glm = PoissonRegressor(max_iter=1000)
glm.fit(X=Xtrain_scaled, y=ytrain.to_numpy())

In [11]:
# Inspect the standardised feature matrix (Xtrain_scaled is assigned from scaler.transform(Xtrain))
Xtrain_scaled

array([[ 0.09229529, -0.23357027, -0.20588518, ..., -0.35662187,
        -1.61534509,  0.15308355],
       [-0.86569565,  0.38483378, -0.0643585 , ..., -0.35662187,
        -0.49871497, -0.59629052],
       [ 0.09229529, -0.10988946,  0.13870153, ..., -0.35662187,
         0.99012519,  0.15308355],
       ...,
       [ 0.33179303, -0.19234334, -0.13302971, ..., -0.35662187,
        -0.49871497,  1.65183171],
       [-0.86569565, -0.10988946, -0.04995969, ...,  1.47180795,
        -0.87092501,  1.65183171],
       [-0.86569565,  0.17869909, -0.02288502, ..., -0.35662187,
        -0.12650493,  0.15308355]])