## Prapare dataset 
1. normalize all the numerical features
2. scatter all the non-numerical features, dummpy the None

In [84]:
%matplotlib inline
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [145]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [146]:
print(train_data.head())
print(test_data.head())
print(train_data.shape)
print(test_data.shape)

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008     

In [121]:
all_dataset = pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]),axis=0)
print(all_dataset.shape)
numeraic_features = all_dataset.dtypes[all_dataset.dtypes != "object"].index
all_dataset[numeraic_features] = all_dataset[numeraic_features].apply(func = lambda x : (x - x.mean()) / x.std())
all_dataset = all_dataset.fillna(all_dataset.mean())
print(all_dataset.head())

all_dataset = pd.get_dummies(all_dataset, dummy_na=True)
print(all_dataset.shape)

(2919, 79)
   MSSubClass MSZoning  LotFrontage   LotArea Street Alley LotShape  \
0    0.067320       RL    -0.184443 -0.217841   Pave   NaN      Reg   
1   -0.873466       RL     0.458096 -0.072032   Pave   NaN      Reg   
2    0.067320       RL    -0.055935  0.137173   Pave   NaN      IR1   
3    0.302516       RL    -0.398622 -0.078371   Pave   NaN      IR1   
4    0.067320       RL     0.629439  0.518814   Pave   NaN      IR1   

  LandContour Utilities LotConfig      ...       ScreenPorch  PoolArea PoolQC  \
0         Lvl    AllPub    Inside      ...         -0.285886 -0.063139    NaN   
1         Lvl    AllPub       FR2      ...         -0.285886 -0.063139    NaN   
2         Lvl    AllPub    Inside      ...         -0.285886 -0.063139    NaN   
3         Lvl    AllPub    Corner      ...         -0.285886 -0.063139    NaN   
4         Lvl    AllPub       FR2      ...         -0.285886 -0.063139    NaN   

  Fence MiscFeature   MiscVal    MoSold    YrSold  SaleType  SaleCondition 

In [126]:
n_train = train_data.shape[0]
train_features = nd.array(all_dataset[:n_train].values)
test_features = nd.array(all_dataset[n_train:].values)
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))

## Train model
1. Linear Regression
2. R Square Loss

In [177]:
loss = gloss.L2Loss()

def get_net():
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    return net

## The Metric of competition

$$\sqrt{\frac{1}{n}\sum_{i=1}^{n}(log(y_{i}) - log(\hat{y_{i}}))^{2}}$$

In [165]:
def log_rmse(net,train_features,train_labels):
    clipped_preds = nd.clip(net(train_features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), train_labels.log()).mean())
    return rmse.asscalar()

In [166]:
def train(net, train_features, train_labels, \
          test_features, test_labels, num_epochs, \
          learning_rate, weight_decay, batch_size):
    
    train_ls, test_ls = [0], [0]
    train_iter = gdata.DataLoader(gdata.ArrayDataset(train_features, train_labels),batch_size,shuffle=True)
    trainer = gluon.Trainer(net.collect_params(),'adam',{
'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X,y in train_iter:
            with autograd.record():
                l = loss(net(X),y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))   
    return train_ls[-1], test_ls[-1]

In [167]:
def k_fold(k,net,X_train,y_train,num_epochs, learning_rate, 
           weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    kf = KFold(n_splits=k)
    index = 0
    for train_index, test_index in kf.split(X_train):
        train_dataset, test_dataset = X_train[train_index], X_train[test_index]
        train_labels, test_labels = y_train[train_index], y_train[test_index]        
        train_ls, test_ls = train(net, train_dataset, train_labels, test_dataset, test_labels,\
                                  num_epochs,learning_rate, weight_decay, batch_size)
        print("fold {}, train rmse: {}, valid rmse: {}".format(index,train_ls,test_ls))
        train_l_sum += train_ls
        valid_l_sum += test_ls
        index += 1

    return train_l_sum / k, valid_l_sum/k      

In [170]:
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
verbose_epoch = num_epochs - 2
net = get_net()
    
train_l,valid_l = k_fold(k,net,train_features,train_labels,num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f' % (k, train_l, valid_l))

fold 0, train rmse: 0.5056030750274658, valid rmse: 0.1413453072309494
fold 1, train rmse: 0.12107884883880615, valid rmse: 0.16791312396526337
fold 2, train rmse: 0.0627867802977562, valid rmse: 0.14256292581558228
fold 3, train rmse: 0.08219567686319351, valid rmse: 0.12313961237668991
fold 4, train rmse: 0.06748398393392563, valid rmse: 0.11682124435901642
5-fold validation: avg train rmse: 0.167830, avg valid rmse: 0.138356


In [181]:
num_epochs, lr, weight_decay, batch_size = 200, 5, 0, 64

def train_and_pred(train_features, test_features, train_labels,\
                   test_data, num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                                 num_epochs, lr, weight_decay, batch_size)
    print('train rmse %f' % train_ls)
    preds = net(test_features).asnumpy()
    test_data['SalePrice'] = pd.Series(preds.reshape((1,-1))[0])
    
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)

train_and_pred(train_features, test_features, train_labels, test_data,
                        num_epochs, lr, weight_decay, batch_size)

train rmse 0.136127
