In [5]:
# Notes
# 1. Use Pandas to pre-process data
# 2. Process special columns i.e. id and label separately from features
# 3. Normalize/transform features and label separately. Transform label
#    simply so that it's easier to restore predicted label in the 
#    original scale
# 4. Test set may have different features in one-hot encoding because
#    string values (vocabulary) in test set is likely a little different
#    from train set. Create new df of train columns to prepare test data

In [6]:
import pandas as pd
import os

# Load data, cut a sandbox fraction. Do NOT run unless
# you want to change the data used.

# data_path = 'data/california-house-prices'
# train_path = os.path.join(data_path, 'train.csv')
# test_path = os.path.join(data_path, 'test.csv')
# sb_train_path = os.path.join(data_path, 'train_sb.csv')
# sb_test_path = os.path.join(data_path, 'test_sb.csv')
# 
# train = pd.read_csv(train_path)
# sb_train = train.sample(frac=0.1)
# test = pd.read_csv(test_path)
# sb_test = test.sample(frac=0.1)
# sb_train.to_csv(sb_train_path)
# sb_test.to_csv(sb_test_path)

In [7]:
# Process data

# Special columns
case = ['Id']
label = ['Sold Price']
specials = case + label
# Sandbox features
features = [
    'Total interior livable area',
    'Tax assessed value',
    'Listed Price',
    'City',
    'State',
]

columns = specials + features

data_path = 'data/california-house-prices'
train_data_path = os.path.join(data_path, 'train.csv')
train_data = pd.read_csv(train_data_path)

# test.cvs is for submission
# sb_test_path = os.path.join(data_path, 'test_sb.csv')
# test = pd.read_csv(sb_test_path)

def fill_mean_(df):
    # Check which column has any NaN value
    for c in df.columns:
        if df[c].dtype not in ['int', 'float']:
            continue
        if df[c].isnull().any(axis=0):
            df[c].fillna(df[c].mean(), inplace=True)
            
def fill_zero_(df):
    # Check which column has any NaN value
    for c in df.columns:
        if df[c].dtype not in ['int', 'float']:
            continue
        if df[c].isnull().any(axis=0):
            df[c].fillna(0, inplace=True)

def zscore_(df):
    for c in df.columns:
        if df[c].dtype not in ['int', 'float']:
            continue
        df[c] = (df[c] - df[c].mean()) / df[c].std()
        
def transform(df, test=False):
    if test:
        fill_zero_(df)
    else:
        fill_mean_(df)
    zscore_(df)
    return pd.get_dummies(df)

# Label in first column
train_specials = train_data.loc[:, specials]
train_features = transform(train_data.loc[:, features])
# col1: Id, col2: Label (Sold Price), col rest: features
train_df = pd.concat([train_specials, train_features], axis=1)
train = train_df.sample(frac=0.8, random_state=1)
test = train_df.drop(train.index)

# Int64Index: 3795 entries, 1642 to 185
# Columns: 501 entries, Sold Price to State_CA
# dtypes: float64(4), uint8(497)
# memory usage: 2.1 MB
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37951 entries, 13470 to 44438
Columns: 936 entries, Id to State_CA
dtypes: float64(4), int64(1), uint8(931)
memory usage: 35.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9488 entries, 9 to 47438
Columns: 936 entries, Id to State_CA
dtypes: float64(4), int64(1), uint8(931)
memory usage: 8.9 MB


(None, None)

In [8]:
# Train classes

import torch
from torch.utils import data

class HousePriceDataset(data.Dataset):
    def __init__(self, annotated_df, transform=None, 
                 target_transform=None):
        self.examples = annotated_df
        self.transform = transform
        self.target_transform = target_transform
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples.iloc[idx]
        # col1: Id, col2: Label (Sold Price), col rest: features
        X = torch.from_numpy(example[2:].values).float()
        y = torch.from_numpy(example[1:2].values).float()
        
        if self.transform:
            X = self.transform(X)
        if self.target_transform:
            y = self.target_transform(y)
            
        return X, y

import torch.nn as nn
import torch.nn.functional as F

class HousePriceNet(nn.Module):
    def __init__(self):
        super().__init__()
        num_inputs = 934
        num_hiddens1 = 384
        num_outputs = 1
        self.linear1 = nn.Linear(num_inputs, num_hiddens1)
        nn.init.normal_(self.linear1.weight, mean=0, std=0.01)
        nn.init.zeros_(self.linear1.bias)
        self.linear2 = nn.Linear(num_hiddens1, num_outputs)
        nn.init.normal_(self.linear2.weight, mean=0, std=0.01)
        nn.init.zeros_(self.linear2.bias)
        self.relu = nn.ReLU()
        
    def forward(self, X):
        # self.verbose()
        X = self.linear1(X)
        X = self.relu(X)
        return self.linear2(X)
    
    def verbose(self):
        for p in self.linear1.parameters():
            print("linear1 max min\n", p.max(), p.min())
            if p.grad is not None:
                print("linear1 grad max min\n", p.grad.max(), p.grad.min())
        for p in self.linear2.parameters():
            print("linear2 max min\n", p.max(), p.min())
            if p.grad is not None:
                print("linear2 grad max min\n", p.grad.max(), p.grad.min())

In [9]:
# Train process

def train_epoch(net, train_iter, loss, updater):
    train_l = 0
    b = 0
    train_cnt = 0
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y).sum()
        train_l += l
        updater.zero_grad()
        l.backward()
        updater.step()
        b += 1
        train_cnt += y.numel()
        if b % 9 == 0:
            print(f"batch {b}, train loss {train_l/train_cnt:8f}")
        
    return train_l / len(train_iter.dataset)

def validate(net, test_iter, loss):
    test_l = 0
    with torch.no_grad():
        for X, trues in test_iter:
            preds = net(X)
            test_l += loss(preds, trues).sum()

    return test_l / len(test_iter.dataset)

torch.manual_seed(42)

def y_transform(y):
    return y / 1000000

batch_size = 512
train_dataset = HousePriceDataset(train, target_transform=y_transform)
train_iter = data.DataLoader(train_dataset, batch_size, shuffle=True)
test_dataset = HousePriceDataset(test, target_transform=y_transform)
test_iter = data.DataLoader(test_dataset, batch_size, shuffle=False)

net = HousePriceNet()
loss = nn.MSELoss()
lr = 0.008
updater = torch.optim.SGD(net.parameters(), lr)

epoch = 5
for e in range(epoch):
    train_loss = train_epoch(net, train_iter, loss, updater)
    test_loss = validate(net, test_iter, loss)
    print(f"epoch {e}, train loss {train_loss:8f}, test loss {test_loss:8f}")

batch 9, train loss 0.012846
batch 18, train loss 0.009470
batch 27, train loss 0.008276
batch 36, train loss 0.007402
batch 45, train loss 0.007292
batch 54, train loss 0.006570
batch 63, train loss 0.006298
batch 72, train loss 0.006165
epoch 0, train loss 0.006140, test loss 0.003319
batch 9, train loss 0.003058
batch 18, train loss 0.003171
batch 27, train loss 0.003149
batch 36, train loss 0.002650
batch 45, train loss 0.002591
batch 54, train loss 0.002302
batch 63, train loss 0.002587
batch 72, train loss 0.003451
epoch 1, train loss 0.003462, test loss 0.003123
batch 9, train loss 0.004141
batch 18, train loss 0.004804
batch 27, train loss 0.003968
batch 36, train loss 0.003429
batch 45, train loss 0.003014
batch 54, train loss 0.003289
batch 63, train loss 0.003038
batch 72, train loss 0.002942
epoch 2, train loss 0.003018, test loss 0.001992
batch 9, train loss 0.001162
batch 18, train loss 0.001608
batch 27, train loss 0.002416
batch 36, train loss 0.002639
batch 45, train l

In [10]:
# Save model in case kernel crashes
model_path = os.path.join(data_path, 'v1.model')
torch.save(net.state_dict(), model_path)

In [11]:
# Prepared test.csv for submission
sub_data_path = os.path.join(data_path, 'test.csv')
sub_data = pd.read_csv(sub_data_path)

sub_df_specials = sub_data.loc[:, case]
sub_df_specials[label] = 0
# This step transforms string data to one-hot encoding
# Some string value in submission set does not exist in
# training set i.e. this example does not have a feature
# We need to take training features, fill up those the
# test example has, and the rest zero.
sub_df_features = transform(sub_data.loc[:, features])

def prepare(input, cols):
    output = pd.DataFrame(columns=cols)
    for c in output.columns:
        if c in input.columns:
            output[c] = input[c]
        else:
            output[c] = 0
    return output

available_features = train_features.columns
sub_df_features = prepare(sub_df_features, available_features)

sub_df = pd.concat([sub_df_specials, sub_df_features], axis=1)

In [15]:
# Load model for predicting
model_path = os.path.join(data_path, 'v1.model')
net = HousePriceNet()
net.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [12]:
# Do actual precition
sub_batch_size = 1000
sub_dataset = HousePriceDataset(sub_df)
sub_iter = data.DataLoader(sub_dataset, sub_batch_size, shuffle=False)

def predict(net, sub_iter):
    net.eval()
    results = torch.Tensor([])
    for X, _ in sub_iter:
        preds = net(X) * 1000000
        results = torch.cat((results, preds), dim=0)
    return results

preds = predict(net, sub_iter)
preds_df = pd.DataFrame(data=preds.detach().numpy(), columns=['Sold Price'])

submission = pd.concat([sub_df['Id'], preds_df], axis=1)

In [13]:
sub_results_path = os.path.join(data_path, 'v1_results.csv')
submission.to_csv(sub_results_path, index=False)

In [14]:
model_info = '''
Features used.
    'Total interior livable area',
    'Tax assessed value',
    'Listed Price',
    'City',
    'State',
Train data set.
    0.1 frac train.csv used in training.
    0.8 frac training data used for training, 0.2 for validation
    1.0 frac test.csv used in submission.
Loss.
    batch_size 512
    learning rate 0.008
    epochs 5
    epoch 4, train loss 0.002728, test loss 0.001908
Public score. 0.54338
Private score. 0.53771
'''
model_info_path = os.path.join(data_path, 'v1.info')
with open(model_info_path, 'w') as fd:
    fd.write(model_info)