In [12]:
import pandas as pd
from mxnet import nd ,autograd
from mxnet.gluon import nn ,data as gdata,loss as gloss
from mxnet import gluon
import gluonbook as gb

## Dataset

In [13]:
train_data=pd.read_csv('../data/train.csv')
test_data=pd.read_csv('../data/test.csv')

In [14]:
all_features=pd.concat([train_data.iloc[:,1:-1],test_data.iloc[:,1:]],axis=0)

In [15]:
numberic_features=all_features.dtypes[all_features.dtypes!='object'].index
all_features[numberic_features]=all_features[numberic_features].apply(lambda x :(x-x.mean())/x.std())
all_features=all_features.fillna(all_features[numberic_features].mean())
all_features=pd.get_dummies(all_features,dummy_na=True)

In [16]:
n_train=train_data.shape[0]
train_labels=nd.array(train_data['SalePrice'].values)
train_features=nd.array(all_features[:n_train].values)
test_features=nd.array(all_features[n_train:].values)

In [17]:
def get_tain_valid_data(k,i,X,y):
    fold_size=y.shape[0]//k
    train_features,train_labels=None,None
    for j in range(k):
        index=slice(j*fold_size,(j+1)*fold_size)
        feature_part,label_part=X[index,:],y[index]
        if j==i:
            valid_features,valid_labels=feature_part,label_part
        elif train_labels is None:
            train_features,train_labels=feature_part,label_part
        else:
            train_features=nd.concat(train_features,feature_part,dim=0)
            train_labels=nd.concat(train_labels,label_part,dim=0)
    return train_features,train_labels,valid_features,valid_labels

## model

In [18]:
def get_net():
    net=nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize()
    return net

## train & eval (cross validation)

In [19]:
def log_rmse(y_pred,y):
    rmse=nd.sqrt(loss(y_pred.log(),y.log()).mean())
    return rmse 

In [20]:
def train(net,train_features,train_labels,test_features,test_labels,num_epochs,batch_size,lr,weight_dacay):
    train_ls,test_ls=[],[]
    train_iter=gdata.DataLoader(gdata.ArrayDataset(train_features,train_labels),batch_size,shuffle=True)
    trainer=gluon.Trainer(net.collect_params(),'adam',{'learning_rate':lr,'wd':weight_dacay})
    for epoch in range(num_epochs):
        for X,y in train_iter:
            with autograd.record():
                l=loss(net(X),y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net(train_features),train_labels).asscalar())
        if test_labels is not None:
            test_ls.append(log_rmse(net(test_features),test_labels).asscalar())
    return train_ls,test_ls

In [21]:
def cross_validation(k,train_features,train_labels,num_epochs,batch_size,lr,weight_dacay):
    train_ls_sum,valid_ls_sum=0,0
    for i in range(k):
        data=get_tain_valid_data(k,i,train_features,train_labels)
        net=get_net()
        train_ls,valid_ls=train(net,*data,num_epochs,batch_size,lr,weight_dacay)
        train_ls_sum+=train_ls[-1]
        valid_ls_sum+=valid_ls[-1]
        print('fold %d train rmse: %f valid rmse :%f'%(i,train_ls[-1],valid_ls[-1]))
    return train_ls_sum,valid_ls_sum

In [23]:
k,num_epochs,batch_size,lr,weight_decay=5,100,64,5,0
loss=gloss.L2Loss()
train_ls_sum,valid_ls_sum=cross_validation(k,train_features,train_labels,num_epochs,batch_size,lr,weight_decay)
train_l,valid_l=train_ls_sum/k,valid_ls_sum/k
print('avg train rmse: %f avg valid rmse :%f'%(train_l,valid_l))

fold 0 train rmse: 0.120007 valid rmse :0.110888
fold 1 train rmse: 0.114759 valid rmse :0.133980
fold 2 train rmse: 0.115814 valid rmse :0.118801
fold 3 train rmse: 0.118628 valid rmse :0.109448
fold 4 train rmse: 0.115191 valid rmse :0.129350
avg train rmse: 0.116880 avg valid rmse :0.120493


In [87]:
def prediction(train_features,train_labels,test_features,num_epochs,batch_size,lr,weight_decay):
    net=get_net()
    train_ls,_=train(net,train_features,train_labels,None,None,num_epochs,batch_size,lr,weight_decay)
    print('train rmse is',t                                                              1])
    preds=net(test_features).asnumpy()
    print(preds.shape)
    test_data['SalePrice']=pd.Series(preds[:,0])
    submission=pd.concat([test_data['Id'],test_data['SalePrice']],axis=1)
    submission.to_csv('submission.csv',index=False)
    

In [89]:
prediction(train_features,train_labels,test_features,num_epochs,batch_size,lr,weight_decay)

train rmse is 0.11516944
(1459, 1)
