In [1]:
from __future__ import print_function

import glob
import math
import os

from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer 
import tensorflow as tf

import keras
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, Lambda, MaxPooling2D
from keras import backend as K
from keras import layers
from keras.utils.np_utils import to_categorical
from keras import Model
from keras.preprocessing.text import one_hot
from keras import regularizers

from tensorflow.keras.optimizers import RMSprop
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

Using TensorFlow backend.


In [2]:
train_dataframe = pd.read_csv("/home/tblain/Documents/projet_perso/kaggle_house_price_regression/train.csv")
train_dataframe.describe()
train_dataframe = train_dataframe.reindex(
    np.random.permutation(train_dataframe.index))

In [45]:
def preprocess_features(dataframe):
    """Prepares input features from Ames housing data set.

    Args:
    dataframe: A Pandas DataFrame expected to contain data
      from the Ames housing data set.
    Returns:
    A DataFrame that contains the features to be used for the model, including
    synthetic features.
    """
    processed_features = dataframe.copy()
    
    processed_features['MSZoning'] = encode_and_bind(processed_features, 'MSZoning')
    processed_features['LotFrontage'] = encode_and_bind(processed_features, 'LotFrontage')
    processed_features['Street'] = encode_and_bind(processed_features, 'Street')
    processed_features['Alley'] = encode_and_bind(processed_features, 'Alley')
    processed_features['LotShape'] = encode_and_bind(processed_features, 'LotShape')
    processed_features['LandContour'] = encode_and_bind(processed_features, 'LandContour')
    processed_features['Utilities'] = encode_and_bind(processed_features, 'Utilities')
    processed_features['LotConfig'] = encode_and_bind(processed_features, 'LotConfig')
    processed_features['LandSlope'] = encode_and_bind(processed_features, 'LandSlope')
    processed_features['Neighborhood'] = encode_and_bind(processed_features, 'Neighborhood')
    processed_features['Condition1'] = encode_and_bind(processed_features, 'Condition1')
    processed_features['Condition2'] = encode_and_bind(processed_features, 'Condition2')
    processed_features['BldgType'] = encode_and_bind(processed_features, 'BldgType')
    processed_features['HouseStyle'] = encode_and_bind(processed_features, 'HouseStyle')
    processed_features['RoofStyle'] = encode_and_bind(processed_features, 'RoofStyle')
    processed_features['RoofMatl'] = encode_and_bind(processed_features, 'RoofMatl')
    processed_features['Exterior1st'] = encode_and_bind(processed_features, 'Exterior1st')
    processed_features['Exterior2nd'] = encode_and_bind(processed_features, 'Exterior2nd')
    processed_features['MasVnrType'] = encode_and_bind(processed_features, 'MasVnrType')
    processed_features['ExterQual'] = encode_and_bind(processed_features, 'ExterQual')
    processed_features['ExterCond'] = encode_and_bind(processed_features, 'ExterCond')
    processed_features['Foundation'] = encode_and_bind(processed_features, 'Foundation')
    processed_features['BsmtQual'] = encode_and_bind(processed_features, 'BsmtQual')
    processed_features['BsmtCond'] = encode_and_bind(processed_features, 'BsmtCond')
    processed_features['BsmtExposure'] = encode_and_bind(processed_features, 'BsmtExposure')
    processed_features['BsmtFinType1'] = encode_and_bind(processed_features, 'BsmtFinType1')
    processed_features['BsmtFinType2'] = encode_and_bind(processed_features, 'BsmtFinType2')
    processed_features['Heating'] = encode_and_bind(processed_features, 'Heating')
    processed_features['HeatingQC'] = encode_and_bind(processed_features, 'HeatingQC')
    processed_features['CentralAir'] = encode_and_bind(processed_features, 'CentralAir')
    processed_features['Electrical'] = encode_and_bind(processed_features, 'Electrical')
    processed_features['KitchenQual'] = encode_and_bind(processed_features, 'KitchenQual')
    processed_features['Functional'] = encode_and_bind(processed_features, 'Functional')
    processed_features['FireplaceQu'] = encode_and_bind(processed_features, 'FireplaceQu')
    processed_features['GarageType'] = encode_and_bind(processed_features, 'GarageType')
    processed_features['GarageYrBlt'] = encode_and_bind(processed_features, 'GarageYrBlt')
    processed_features['GarageFinish'] = encode_and_bind(processed_features, 'GarageFinish')
    processed_features['GarageQual'] = encode_and_bind(processed_features, 'GarageQual')
    processed_features['GarageCond'] = encode_and_bind(processed_features, 'GarageCond')
    processed_features['PavedDrive'] = encode_and_bind(processed_features, 'PavedDrive')
    processed_features['PoolQC'] = encode_and_bind(processed_features, 'PoolQC')
    processed_features['Fence'] = encode_and_bind(processed_features, 'Fence')
    processed_features['MiscFeature'] = encode_and_bind(processed_features, 'MiscFeature')
    processed_features['SaleType'] = encode_and_bind(processed_features, 'SaleType')
    processed_features['SaleCondition'] = encode_and_bind(processed_features, 'SaleCondition')
    selected_features = processed_features#[['MSSubClass', 'MSZoning', 'SaleType', 'SaleCondition', 'LotArea', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
                                          # 'Condition1', 'Condition2','BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
                                          #  'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
                                          #  'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
                                          #  'PoolQC', 'Fence', 'MiscFeature']]
    selected_features = selected_features.drop('SalePrice')
    # Create a synthetic feature.
#     processed_features["rooms_per_person"] = (
#      dataframe["total_rooms"] /
#      dataframe["population"])
    return selected_features

def preprocess_targets(dataframe):
    """Prepares target features (i.e., labels) from Ames housing data set.

    Args:
    dataframe: A Pandas DataFrame expected to contain data
    from the Ames housing data set.
    Returns:
    A DataFrame that contains the target feature.
    """
    output_targets = pd.DataFrame()
    # Scale the target to be in units of thousands of dollars.
    output_targets["SalePrice"] = (
      dataframe["SalePrice"] / 1000.0)
    return output_targets

In [46]:
def my_one_hot(df, cols):
    """
    @param df pandas DataFrame
    @param cols a list of columns to encode 
    @return a DataFrame with one-hot encoding
    """
    for each in cols:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)
    return df

def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

In [47]:
# Choose the first 12000 (out of 17000) examples for training.
x_train = preprocess_features(train_dataframe.head(1200))
y_train = preprocess_targets(train_dataframe.head(1200))

# Choose the last 5000 (out of 17000) examples for validation.
x_validation = preprocess_features(train_dataframe.tail(260))
y_validation = preprocess_targets(train_dataframe.tail(260))
print(x_train.shape)

x_train.describe()

KeyError: "['SalePrice'] not found in axis"

In [None]:
input = layers.Input(shape=(44,))

x = layers.BatchNormalization()(input)
x = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(input)
x = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(x)
x = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(x)
x = layers.Dropout(0.2)(x)
# Create output layer with a single node and sigmoid activation
output = layers.Dense(1, kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01))(x)
model = Model(input, output)

In [40]:
model.compile(loss='mean_squared_error',
              optimizer='adam')

In [41]:
batch_size = 64
history = model.fit(x=x_train, y=y_train, steps_per_epoch=1000, epochs=10, \
                              validation_data=(x_validation, y_validation), validation_steps=100)

ValueError: Error when checking input: expected input_7 to have shape (44,) but got array with shape (81,)

In [19]:
X_test = pd.read_csv("/home/tblain/Documents/projet_perso/kaggle_house_price_regression/test.csv")
X_test.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.4,68.6,9819.2,6.1,5.6,1971.4,1983.7,100.7,439.2,...,472.8,93.2,48.3,24.2,1.8,17.1,1.7,58.2,6.1,2007.8
std,421.3,42.7,22.4,4955.5,1.4,1.1,30.4,21.1,177.6,455.3,...,217.0,127.7,68.9,67.2,20.2,56.6,30.5,630.8,2.7,1.3
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


In [20]:
X_test = preprocess_features(X_test)
X_test.describe()

Unnamed: 0,MSSubClass,LotArea
count,1459.0,1459.0
mean,57.4,9819.2
std,42.7,4955.5
min,20.0,1470.0
25%,20.0,7391.0
50%,50.0,9399.0
75%,70.0,11517.5
max,190.0,56600.0


In [21]:
y_test = model.predict(X_test, verbose=1) * 1000



In [22]:
result = pd.DataFrame(y_test, index=range(1461, len(y_test)+1461), columns=['SalePrice'])

In [23]:
result.to_csv("submission.csv", index_label='Id')

In [167]:
# predict results
results = model.predict(X_test)

# select the indix with the maximum probability
results = np.argmax(results,axis = 1)

results = pd.Series(results,name="Label")

In [18]:
y_test

array([[198249.45],
       [242728.08],
       [230810.27],
       ...,
       [225346.69],
       [162418.9 ],
       [155422.25]], dtype=float32)