In [1]:
# Author-Vishal Burman

## Accessing and Reading Data Sets

In [2]:
%matplotlib inline
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import numpy as np
import pandas as pd

In [3]:
# Load the training and test splits (in that order) from the working directory.
train_data, test_data = map(pd.read_csv, ("train.csv", "test.csv"))

In [4]:
# Show (rows, columns) of the training frame, then of the test frame, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(1460, 81)
(1459, 80)


In [5]:
# The first 4 and the last 2 features, as well as the label (SalePrice), from the first 4 examples:

In [6]:
# First four rows; columns picked by position: the first four, then the last three.
train_data.head(4).iloc[:, [0, 1, 2, 3, -3, -2, -1]]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


In [7]:
# Remove the Id column from both datasets (and the SalePrice label from the training set) before combining them

In [8]:
# Stack the training inputs on top of the test inputs so that both splits
# go through exactly the same preprocessing.
all_features = pd.concat(
    [
        train_data.iloc[:, 1:-1],  # skip the first column (Id) and the last (SalePrice)
        test_data.iloc[:, 1:],  # skip the first column (Id)
    ]
)

## Data Preprocessing

In [10]:
# We begin by replacing the missing values with the mean
# Then we rescale the values to a common scale (zero mean and unit variance)

\begin{equation}
x \leftarrow \frac{x - \mu}{\sigma}
\end{equation}

In [11]:
# Every column whose dtype is not 'object' is treated as a numeric feature.
numeric_features = all_features.dtypes.loc[lambda kinds: kinds != 'object'].index

# Standardise each numeric column to zero mean and unit variance. After that
# step every column mean is 0, so filling the missing entries with 0 is the
# same as imputing them with the mean.
all_features[numeric_features] = (
    all_features[numeric_features]
    .apply(lambda col: (col - col.mean()) / col.std())
    .fillna(0)
)

In [12]:
# Next we deal with discrete values 
# We replace them with one-hot encoding

In [13]:
# One-hot encode the discrete columns.
# dummy_na=True treats a missing value as a category of its own and
# creates an indicator feature for it.
all_features = pd.get_dummies(data=all_features, dummy_na=True)
all_features.shape

(2919, 331)

In [14]:
# Via the values attribute we can extract the underlying NumPy array from the pandas DataFrame
# We can then convert it to MXNet's native NDArray representation for training

In [15]:
# all_features was built by stacking the training rows on top of the test
# rows, so the first n_train rows belong to the training set and every row
# after that belongs to the test set.
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
# Fix: slice from n_train onwards. The previous `[:n_train]` slice silently
# reused the training rows as test features.
test_features = nd.array(all_features[n_train:].values)
# Labels as a column vector with one SalePrice per training example.
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))

## Training

In [16]:
# We define a simple model trained with squared loss
# It won't be the perfect criterion, but it provides a baseline model

In [17]:
# Squared (L2) loss object used as the training criterion.
loss = gloss.L2Loss()

def get_net():
    """Build the baseline model: a single Dense layer with one output.

    Returns:
        A freshly constructed ``nn.Sequential`` containing one ``nn.Dense(1)``
        layer, with ``initialize()`` already called using its default
        arguments.
    """
    model = nn.Sequential()
    model.add(nn.Dense(1))
    model.initialize()
    return model