source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.cross_validation import cross_val_score


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

In [18]:
def scale_features(X): 
    ''' scale the features columnwise to 0 mean and normalize by range
    '''
    for i in range(len(X[1,:])):
        X[:,i] = (X[:,i] - X[:,i].mean())/(X[:,i].max()-X[:,i].min())
    return X

def fill_nan(X):
    '''replace NaNs with mean for each column
    ''' 
    for i in range(len(X[1,:])):
        mean = np.nanmean(X[:,i])
        mask = np.isnan(X[:,i])
        X[mask,i] = mean
    return X


def strings_to_num(df):
    '''Input: df (Pandas dataframe)
       Ouput: numpy array with categorical (string) columns transformed into numerical
    '''
    #transform dataframe categories to numbers
    return df.apply(lambda x: pd.factorize(x)[0]).values

def df_to_numpy_array(df):
    ''' Seperate categorical and numerical columns of dataframe
        Input df: Pandas dataframe
        Output: numpy array
    '''
    #get names of numerical columns
    num_columns = list(df.select_dtypes(include=['float64', 'int64']).columns.values)
    
    #get numerical values into NumPy array
    num_values = df[num_columns].values
    
    #fill NaN in numerical features
    num_values = fill_nan(num_values)
    
    #scale numerical features
    num_values = scale_features(num_values)   
    
    #get categorical columns 
    cat_columns = list(df.select_dtypes(include=['object']).columns.values)

    #transform categorical columns into numpy array
    cat_values = strings_to_num(df[cat_columns])
    
    return np.concatenate((cat_values,num_values),axis=1)


feature_names = train_df.drop(["SalePrice","Id"],axis=1).columns.tolist()
num_features = len(feature_names)    
    
X = df_to_numpy_array(train_df.drop(["SalePrice","Id"],axis=1))     #training data
Xtest = df_to_numpy_array(test_df.drop(["Id"],axis=1)) #test data
y = train_df['SalePrice'].values #target
test_ids = test_df["Id"].values

print("train_df:\n%r" % (train_df[:10]))

train_df:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   
5   6          50       RL         85.0    14115   Pave   NaN      IR1   
6   7          20       RL         75.0    10084   Pave   NaN      Reg   
7   8          60       RL          NaN    10382   Pave   NaN      IR1   
8   9          50       RM         51.0     6120   Pave   NaN      Reg   
9  10         190       RL         50.0     7420   Pave   NaN      Reg   

  LandContour Utilities    ...     PoolArea PoolQC  Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN    NaN         NaN       0   
1         Lvl

In [6]:
# define base mode
def model():
    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=num_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='msle', optimizer='adam')
    return model

In [10]:
# evaluate model with standardized dataset
kr = KerasRegressor(build_fn=model, nb_epoch=100, batch_size=5, verbose=0)

scores = cross_val_score(kr, X, y, cv=4)
print("msle = %4.2f std = %4.2f" % (scores.mean(),scores.std()))

msle = -51.62 std = 1.54


In [22]:
kr.fit(X, y, epochs=100, batch_size=5)
result = kr.predict(X)

In [23]:
print("result.shape:%r" % (result.shape))

result.shape:1460


In [24]:
print("%r" % (result[:10]))
print("%r" % (y[:10]))

array([211225.4 , 170431.81, 230630.6 , 170455.58, 291373.4 , 146089.17,
       277059.84, 225649.31, 143037.62,  99358.1 ], dtype=float32)
array([208500, 181500, 223500, 140000, 250000, 143000, 307000, 200000,
       129900, 118000])
