# House Price Prediction

Dataset: https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview

In [52]:
import numpy as np
import pandas as pd

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

In [65]:
# Directory holding the competition CSV files; kept in one place so the
# location only has to be changed once.
DATA_DIR = '../input/house-prices-advanced-regression-techniques'

# Labelled rows (contain the SalePrice target) used for training.
df = pd.read_csv(f'{DATA_DIR}/train.csv')
# Unlabelled rows whose SalePrice is predicted for the submission file.
preddf = pd.read_csv(f'{DATA_DIR}/test.csv')
df.head()

## Data Exploration

In [54]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [55]:
# Pairwise scatter plots (KDE on the diagonal) for the target and a handful
# of numeric features.
pairplot_columns = ['SalePrice', 'YearBuilt', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 'KitchenAbvGr']
sns.pairplot(df[pairplot_columns], diag_kind='kde')

In [66]:
# Percentage of missing values for every column that contains at least one
# null, followed by the dtype of each of those columns.
tmpdf = df
null_columns = tmpdf.columns[tmpdf.isnull().any()]
null_counts = tmpdf[null_columns].isnull().sum()
print((null_counts * 100 / tmpdf.shape[0]).to_string())
print()
print(tmpdf[null_columns].dtypes)

In [57]:
# Frequency of each MasVnrType value (value_counts leaves out NaN by default).
df['MasVnrType'].value_counts()

In [58]:
# Names of all non-numeric columns in the training frame.
df.select_dtypes(exclude=np.number).columns

## Preprocessing

In [81]:
def preprocess(df):
    df = df.drop(['Id', 'MiscFeature', 'GarageYrBlt', 'Alley', 'Fence', #'PoolQC',
                 #'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 
                #'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', #'FireplaceQu'
                 ], axis=1)
    
    # some numerical values are enums, map them to string
    df['MSSubClass'] = df['MSSubClass'].astype("object")
    
    # fill N/A values
#     for col in df[df.columns[df.isnull().any()]].isnull().columns:
#         if pd.api.types.is_string_dtype(df[col]):
#             # categorical
#             df[col] = df[col].fillna(df[col].mode()[0])
#         elif pd.api.types.is_numeric_dtype(df[col]):
#             # numerical
#             df[col] = df[col].fillna(df[col].mean())
    
    numerical = df.select_dtypes(exclude=['object']).columns
    categorical = df.select_dtypes(include=['object']).columns
    
    df[numerical] = df[numerical].fillna(0)
    df[categorical] = df[categorical].fillna('NA')
    
    # some categorical data can be directly represented as numbers
    for col in ['ExterQual', 'ExterCond', 'KitchenQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
        df[col] = df[col].map({'Ex': 5, 'Gd': 4., 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0})
        df[col] = df[col].astype(np.float32)

#     df['ExterQual'] = df['ExterQual'].map({'Ex': 1, 'Gd': 0.75, 'TA': 0.5, 'Fa': 0.25, 'Po': 0})
#     df['ExterQual'] = df['ExterQual'].astype(np.float32)
#     df['ExterCond'] = df['ExterCond'].map({'Ex': 1, 'Gd': 0.75, 'TA': 0.5, 'Fa': 0.25, 'Po': 0})
#     df['ExterCond'] = df['ExterCond'].astype(np.float32)
#     df['KitchenQual'] = df['KitchenQual'].map({'Ex': 1, 'Gd': 0.75, 'TA': 0.5, 'Fa': 0.25, 'Po': 0})
#     df['KitchenQual'] = df['KitchenQual'].astype(np.float32)
    df['CentralAir'] = df['CentralAir'].map({'N': 0, 'Y': 1})
    df['CentralAir'] = df['CentralAir'].astype(np.uint8)
    
    # categorical values to numerical
    df = pd.get_dummies(df)
    
    # float64 to float32
    for col in df.columns:
        if df[col].dtypes == np.float64:
            df[col] = df[col].astype(np.float32)
    
    #df = df.iloc[:, :30]
    
    return df

In [82]:
# Concatenate the training features and the prediction rows before
# preprocessing, so one-hot encoding yields the same columns for both (the
# test file contains category values that never occur in train), then split
# them apart again by position.
n_train = df.shape[0]
tmp = preprocess(pd.concat([df.drop(["SalePrice"], axis=1), preddf]))
X = tmp.iloc[:n_train]
X_pred = tmp.iloc[n_train:]
y = df["SalePrice"]

print(X.shape, X_pred.shape, y.shape)

In [None]:
# Lift the column display limit so every engineered feature is visible.
pd.options.display.max_columns = None
X.head(n=10)

In [None]:
# Hold out 25% of the labelled rows. The model is fitted on the remaining 75%
# only, so the evaluation cells below score rows it has never seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Generate Model and Evaluate

In [None]:
# Fully connected regression network: an input-width layer, a funnel of
# progressively narrower ReLU layers, and a single linear output unit.
n_features = X_train.shape[1]
hidden_widths = (1024, 512, 256, 128, 64, 32, 8)

model = tf.keras.models.Sequential(
    [tf.keras.layers.Dense(n_features, activation='relu', input_dim=n_features)]
    + [tf.keras.layers.Dense(width, activation='relu') for width in hidden_widths]
    + [tf.keras.layers.Dense(1, activation='linear')]
)

# Optimise mean squared error; mean absolute error is tracked as a metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae'],
)

In [None]:
# Train for 50 epochs with one sample per gradient step; Keras sets aside the
# last 15% of the rows passed in as a validation set.
model.fit(
    X_train,
    y_train,
    batch_size=1,
    epochs=50,
    validation_split=0.15,
)

In [None]:
# Loss (mean squared error) and the MAE metric configured in compile(),
# computed on X_test / y_test.
model.evaluate(X_test, y_test)

In [61]:
# Side-by-side table for the first 20 test rows:
# predicted price | actual price | prediction error.
pred = model.predict(X_test)
exp = y_test.to_numpy().reshape(-1, 1)
#pred = np.exp(pred)
#y_train = np.exp(y_train)
comparison = np.hstack([pred, exp, pred - exp]).astype(int)
print(comparison[:20])

In [62]:
predictions = pd.DataFrame(pred)
reality = pd.DataFrame(y_test)

# Style and tick-label sizes are set *before* the figure is created;
# settings applied afterwards only affect figures created later, which made
# the first run of this cell look different from a re-run.
plt.style.use('ggplot')
matplotlib.rc('xtick', labelsize=30)
matplotlib.rc('ytick', labelsize=30)

fig, ax = plt.subplots(figsize=(50, 40))

# One red dot per test row: x = predicted price, y = actual price.
ax.plot(predictions.values, reality.values, 'ro')
ax.set_xlabel('Predictions', fontsize=30)
ax.set_ylabel('Reality', fontsize=30)
ax.set_title('Predictions x Reality on dataset Test', fontsize=30)

# Dashed identity line: a perfect model would put every dot on it.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
plt.show()

In [63]:
# Save the trained model under saved_model/my_model.
# `mkdir -p` creates the parent directory and does not fail if it already exists.
!mkdir -p saved_model
model.save('saved_model/my_model')


#import pickle
#pickle.dump(model, open('finalized_model.pkl', 'wb'))
# Load with
#loaded_model = pickle.load(open('finalized_model.pkl', 'rb'))
#result = loaded_model.score(X_test, y_test)

In [80]:
# Preview the prediction features; only the first rows are shown so the
# notebook output stays small.
X_pred.head()

In [89]:
# Build the submission file: one predicted SalePrice per Id of the test file.
# The network has a single output unit, so predict() returns an (n, 1)
# array; it is flattened to 1-D before being stored as a column.
submission = pd.DataFrame({
    "Id": preddf["Id"],
    "SalePrice": model.predict(X_pred).ravel(),
})
submission.to_csv("./submission.csv", index=False)
submission.head(n=10)