In [2]:
%matplotlib inline
import d2lzh as d2l
from mxnet import autograd,gluon,nd,init
from mxnet.gluon import data as gdata,loss as gloss,nn
import numpy as np
import pandas as pd

In [3]:
# Read the training and test tables from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Display the (rows, columns) shape of each raw table.
(train_data.shape, test_data.shape)

((1460, 81), (1459, 80))

In [5]:
# Preview the first four rows, showing the four leading and three trailing columns.
train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


In [6]:
# Remove rows with GrLivArea above 4000 whose SalePrice is nevertheless below 200000.
large_but_cheap = (train_data['GrLivArea'] > 4000) & (train_data['SalePrice'] < 200000)
train_data.drop(train_data[large_but_cheap].index, inplace=True)

In [7]:
# Remove rows whose LotArea exceeds 100000, then display the remaining shape.
huge_lot = train_data['LotArea'] > 100000
train_data.drop(train_data.loc[huge_lot].index, inplace=True)
train_data.shape

(1454, 81)

In [8]:
# Remove, in order, every training row whose value in the given column exceeds
# the given cap.  Note: the 5000 cap on BsmtFinSF1 is subsumed by the 1400 cap
# that follows it; both are kept so the sequence of drops is unchanged.
outlier_caps = [
    ('BsmtFinSF1', 5000),
    ('BsmtFinSF1', 1400),
    ('MasVnrArea', 1200),
    ('TotalBsmtSF', 5000),
    ('1stFlrSF', 4000),
    ('EnclosedPorch', 500),
]
for column_name, cap in outlier_caps:
    above_cap = train_data[column_name] > cap
    train_data.drop(train_data[above_cap].index, inplace=True)

In [9]:
# Stack the train and test feature columns into one frame.  The first column is
# skipped in both tables, and the last training column (the SalePrice label) is
# left out as well.
train_part = train_data.iloc[:, 1:-1]
test_part = test_data.iloc[:, 1:]
all_features = pd.concat((train_part, test_part))

In [10]:
# Standardize every non-object column to zero mean and unit standard deviation,
# then fill missing values with 0 (the column mean after standardization).
is_numeric = all_features.dtypes != 'object'
numeric_features = all_features.dtypes[is_numeric].index


def _standardize(column):
    """Return ``column`` shifted by its mean and scaled by its standard deviation."""
    return (column - column.mean()) / (column.std())


all_features[numeric_features] = all_features[numeric_features].apply(_standardize)
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [11]:
# One-hot encode the remaining non-numeric columns; dummy_na=True also adds an
# indicator column for missing values.  Then display the resulting shape.
all_features = pd.get_dummies(data=all_features, dummy_na=True)
all_features.shape

(2874, 328)

In [12]:
# Split the combined feature matrix back into its training and test rows and
# convert everything to NDArrays; labels become a single-column matrix.
n_train = len(train_data)
feature_matrix = all_features.values
train_features = nd.array(feature_matrix[:n_train])
test_features = nd.array(feature_matrix[n_train:])
train_labels = nd.array(train_data['SalePrice'].values).reshape((-1, 1))
train_features.shape, test_features.shape, train_labels.shape

((1415, 328), (1459, 328), (1415, 1))

In [13]:
# Squared-error loss object shared by the training loop (train) and the
# evaluation metric (log_rmse) defined in this notebook.
loss = gloss.L2Loss()

def get_net():
    """Build and initialize a fresh regression network.

    Returns:
        An ``nn.Sequential`` made of a 300-unit ``Dense`` layer with ReLU
        activation followed by a single-output ``Dense`` layer, on which
        ``initialize()`` has been called with no arguments.
    """
    model = nn.Sequential()
    hidden_layer = nn.Dense(300, activation='relu')
    output_layer = nn.Dense(1)
    model.add(hidden_layer, output_layer)
    model.initialize()
    return model

In [19]:
def log_rmse(net,features,labels):
    """Return the root-mean-square error between log-predictions and log-labels.

    Args:
        net: Callable model; ``net(features)`` yields the predictions.
        features: Input passed to ``net``.
        labels: Target values; their logarithm is compared with the predictions'.

    Returns:
        The metric as a scalar (via ``asscalar()``).
    """
    predictions = net(features)
    # Clip to [1, inf) so the logarithm is defined and non-negative.
    safe_predictions = nd.clip(predictions, 1, float('inf'))
    per_sample_loss = loss(safe_predictions.log(), labels.log())
    # The factor 2 offsets the 1/2 that Gluon's L2Loss applies to the squared error.
    mean_squared_log_error = 2 * per_sample_loss.mean()
    return nd.sqrt(mean_squared_log_error).asscalar()

def test_rmse(y_hat,y):
    """Return the root-mean-square error between ``y_hat`` and ``y``.

    Args:
        y_hat: NDArray of predictions.
        y: NDArray of targets, same shape as ``y_hat``.

    Returns:
        ``sqrt(mean((y_hat - y) ** 2))`` as a scalar (via ``asscalar()``).
    """
    # The previous form, nd.sqrt(y_hat, y), never used ``y`` as an operand: the
    # second positional argument of the operator is its output buffer, so the
    # result was mean(sqrt(y_hat)) and ``y`` was overwritten.
    squared_error = (y_hat - y) ** 2
    rmse = nd.sqrt(squared_error.mean())
    return rmse.asscalar()
a = nd.array([1,2,3])
b = nd.array([2,3,4])
# Every element differs by exactly 1, so the RMSE is 1.
print(test_rmse(a,b))

1.0


In [15]:
def train(net,train_features,train_labels,test_features,test_lables,num_epochs,learning_rate,weight_decay,batch_size):
    """Fit ``net`` with Adam on shuffled mini-batches, tracking log-RMSE per epoch.

    Args:
        net: Model to train in place.
        train_features: Training inputs.
        train_labels: Training targets.
        test_features: Held-out inputs; only used when ``test_lables`` is given.
        test_lables: Held-out targets, or ``None`` to skip held-out evaluation.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Value passed to the trainer as ``wd``.
        batch_size: Mini-batch size, also passed to ``trainer.step``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch ``log_rmse`` values on the training
        data and on the held-out data (empty list when ``test_lables`` is None).
    """
    dataset = gdata.ArrayDataset(train_features, train_labels)
    batches = gdata.DataLoader(dataset, batch_size, shuffle=True)
    optimizer_args = {'learning_rate': learning_rate, 'wd': weight_decay}
    trainer = gluon.Trainer(net.collect_params(), 'adam', optimizer_args)
    evaluate_held_out = test_lables is not None
    train_history, test_history = [], []
    for _ in range(num_epochs):
        for batch_features, batch_labels in batches:
            with autograd.record():
                batch_loss = loss(net(batch_features), batch_labels)
            batch_loss.backward()
            trainer.step(batch_size)
        # Evaluate once per epoch, after all mini-batch updates.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_held_out:
            test_history.append(log_rmse(net, test_features, test_lables))
    return train_history, test_history

In [16]:
def get_k_fold_data(k,i,X,y):
    """Return the training and validation split for fold ``i`` of ``k``.

    The rows of ``X``/``y`` are cut into ``k`` contiguous folds of
    ``X.shape[0] // k`` rows each.  The last fold additionally receives the
    ``X.shape[0] % k`` trailing rows, so no sample is silently discarded when
    the row count is not a multiple of ``k``.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the fold used for validation; must satisfy ``0 <= i < k``.
        X: Feature matrix, sliced along its first axis.
        y: Labels, sliced along the first axis with the same row ranges.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)`` where the training parts are
        the concatenation (along axis 0) of every fold except fold ``i``.

    Raises:
        AssertionError: if ``k <= 1``, ``i`` is outside ``range(k)``, or there
            are fewer rows than folds.
    """
    assert k>1
    # Without this check an out-of-range i would leave X_valid unbound.
    assert 0 <= i < k, 'fold index i must satisfy 0 <= i < k'
    n_rows = X.shape[0]
    fold_size = n_rows // k
    assert fold_size > 0, 'need at least k rows to build k folds'
    X_train ,y_train = None,None
    for j in range(k):
        start = j * fold_size
        # The final fold runs to the end so leftover rows are not dropped.
        stop = n_rows if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        X_part,y_part = X[idx,:],y[idx]
        if j==i:
            X_valid,y_valid = X_part,y_part
        elif X_train is None:
            X_train ,y_train = X_part,y_part
        else:
            X_train = nd.concat(X_train,X_part,dim = 0)
            y_train = nd.concat(y_train,y_part,dim=0)
    return X_train,y_train,X_valid,y_valid

In [17]:
def k_fold(k,X_train,y_train,num_epochs,learning_rate,weight_decay,batch_size):
    """Run ``k``-fold cross-validation and return the averaged final metrics.

    For each fold a new network from ``get_net()`` is trained with ``train`` on
    the split produced by ``get_k_fold_data``.  The per-epoch curves are passed
    to ``d2l.semilogy`` and the last-epoch values are printed.

    Args:
        k: Number of folds.
        X_train: Full training feature matrix.
        y_train: Full training labels.
        num_epochs: Epochs per fold.
        learning_rate: Learning rate forwarded to ``train``.
        weight_decay: Weight decay forwarded to ``train``.
        batch_size: Mini-batch size forwarded to ``train``.

    Returns:
        ``(avg_train, avg_valid)``: the last-epoch training and validation
        metric, each averaged over the ``k`` folds.
    """
    train_l_sum,valid_l_sum = 0,0
    epochs = range(1,num_epochs+1)
    for i in range(k):
        data = get_k_fold_data(k,i,X_train,y_train)
        # A fresh network per fold keeps the folds independent of each other.
        net = get_net()
        train_ls,valid_ls = train(net,*data,num_epochs,learning_rate,weight_decay,batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        d2l.semilogy(epochs,train_ls,'epochs','rmse',epochs,valid_ls,['train','valid'])
        print('fold % d ,train rmse %f,valid rmse %f' % (i,train_ls[-1],valid_ls[-1]))
    return train_l_sum/k,valid_l_sum/k

In [18]:
# Hyperparameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 1
weight_decay = 0
batch_size = 80

train_l, valid_l = k_fold(
    k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size
)
print('%d-fold validation: avg train rmse %f,avg valid rmse %f' % (k, train_l, valid_l))

KeyboardInterrupt: 

In [None]:
def train_and_pred(train_featrues,test_features,train_labels,test_data,num_epochs,lr,weight_decay,batch_size):
    """Train on all training rows, predict the test rows, write ``submission.csv``.

    Args:
        train_featrues: Training feature matrix.  The misspelled parameter name
            is kept so that existing callers are not broken.
        test_features: Test feature matrix fed to the trained network.
        train_labels: Training targets.
        test_data: Test DataFrame; must contain an ``Id`` column.
        num_epochs: Number of training epochs.
        lr: Learning rate forwarded to ``train``.
        weight_decay: Weight decay forwarded to ``train``.
        batch_size: Mini-batch size forwarded to ``train``.

    Side effects:
        Adds (or overwrites) a ``SalePrice`` column on ``test_data`` and writes
        the ``Id``/``SalePrice`` columns to ``submission.csv``.
    """
    net = get_net()
    # Train on the argument that was passed in.  The body previously read the
    # notebook-global ``train_features`` and silently ignored this parameter.
    train(net,train_featrues,train_labels,None,None,num_epochs,lr,weight_decay,batch_size)
    preds = net(test_features).asnumpy()
    test_data['SalePrice'] = pd.Series(preds.reshape(1,-1)[0])
    submission = pd.concat([test_data['Id'],test_data['SalePrice']],axis=1)
    submission.to_csv('submission.csv',index=False)
# Fit on the full training set and write submission.csv.
train_and_pred(
    train_features, test_features, train_labels, test_data,
    num_epochs, lr, weight_decay, batch_size,
)