In [1]:
import tensorflow as tf

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline

In [3]:
import pandas as pd
import numpy as np

In [4]:
df_train = pd.read_csv(r"C:\Users\luis_\Documents\GitHub\data-science-projects\house-price-prediction\train.csv")
df_test = pd.read_csv(r"C:\Users\luis_\Documents\GitHub\data-science-projects\house-price-prediction\test.csv")

## Model Improvements

- 

## Data Wrangling

### Setting Correct Datatypes

#### 1) MSSubClass int > str
The "MSSubClass" category uses integers to specify types of dwelling, but these integers simply refer to categories.
We will change the datatype of this column from int to str.

<div style="float: left">

| Value  | Type of Dwelling |
|---|---|
| 20  | 1-STORY 1946 & NEWER ALL STYLES |
|  30 | 1-STORY 1945 & OLDER |
|  40 | 1-STORY W/FINISHED ATTIC ALL AGES |
etc.

</div>

In [5]:
# Transformation 1: int > str
df_train["MSSubClass"] = df_train["MSSubClass"].astype("str")

# Apply same to df_test
df_test["MSSubClass"] = df_test["MSSubClass"].astype("str")

In [6]:
# Check
df_train["MSSubClass"]

0       60
1       20
2       60
3       70
4       60
        ..
1455    60
1456    20
1457    70
1458    20
1459    20
Name: MSSubClass, Length: 1460, dtype: object

#### 2) "Quality" columns str > int

As per the tables below, there are some columns with ordinal category values. However, this ordinality is not captured by the str datatype. We will convert these categories to ints using the mapping shown.

Columns: ExterQual\*, ExterCond\*, BsmtQual, BsmtCond, HeatingQC\*, KitchenQual\*, FireplaceQu, GarageQual, GarageCond, PoolQC <br />
\*Does not contain NA value

<div style="float: left">

| Category  | Ordinal Value  |
|---|---|
| Ex  | 5 |
| Gd  | 4 |
| TA  | 3 |
| Fa  | 2 |
| Po  | 1 |
| NA  | 0 |

Columns: BsmtFinType1, BsmtFinType2

| Category  | Ordinal Value  |
|---|---|
| GLQ  | 6 |
| ALQ  | 5 |
| BLQ  | 4 |
| Rec  | 3 |
| LwQ  | 2 |
| Unf  | 1 |
| NA  | 0 |


Columns: BsmtExposure

| Category  | Ordinal Value  |
|---|---|
| Gd  | 4 |
| Av  | 3 |
| Mn  | 2 |
| No  | 1 |
| NA  | 0 |
</div>

In [7]:
# Transformation 2: str > int
# Needs to have mapping from str values to int values

# Set 1
df_train[["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]] = df_train[["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]].replace(["NA","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
# Set 2
df_train[["BsmtFinType1","BsmtFinType2"]] = df_train[["BsmtFinType1","BsmtFinType2"]].replace(["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
# Set 3
df_train["BsmtExposure"] = df_train["BsmtExposure"].replace(["Gd","Av","Mn","No","NA"],[4,3,2,1,0])

# Apply same to df_test

# Set 1
df_test[["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]] = df_test[["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","FireplaceQu","GarageQual","GarageCond","PoolQC"]].replace(["NA","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
# Set 2
df_test[["BsmtFinType1","BsmtFinType2"]] = df_test[["BsmtFinType1","BsmtFinType2"]].replace(["NA","Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
# Set 3
df_test["BsmtExposure"] = df_test["BsmtExposure"].replace(["Gd","Av","Mn","No","NA"],[4,3,2,1,0])

#### 3) "Year" columns to "YearsAgo"

This dataset contains columns pertaining to the years in which a property was built and modded. 
Keeping these years as is would not capture much of the relationship between the year column and the dependent variable. Consider the following example:

| Year  | Sale Price  |
|---|---|
| 1950  | 50,000 |
| 2000  | 200,000 |

To a human, we can intuit how a lot changed in these 50 years that would influence the sale price of a property. A model doesn't have this intuition, it just reads some incremental change in the variable. 2000 is just 50 more than 1950.

Note to self: I've changed my mind on this. The line of best fit would just have a y-intercept that's higher. We can test this.

In [8]:
# Transform MoSold into "Fraction of Years Ago Sold"
df_train["MoSold"] = df_train["MoSold"].replace([int(x) for x in np.linspace(1,12,12)],np.linspace(0,1,12))

# Mix columns to include both Years and Months Ago Sold
df_train["YrMoSold"] = df_train["YrSold"] + df_train["MoSold"]

# Apply same to df_test
df_test["MoSold"] = df_test["MoSold"].replace([int(x) for x in np.linspace(1,12,12)],np.linspace(0,1,12))
df_test["YrMoSold"] = df_test["YrSold"] + df_train["MoSold"]

In [9]:
# Check
df_train["YrMoSold"]

0       2008.090909
1       2007.363636
2       2008.727273
3       2006.090909
4       2009.000000
           ...     
1455    2007.636364
1456    2010.090909
1457    2010.363636
1458    2010.272727
1459    2008.454545
Name: YrMoSold, Length: 1460, dtype: float64

In [10]:
# Drop YrSold, but keep MoSold to account for seasonality in sale prices
df_train = df_train.drop("YrSold",axis=1)

# Apply same to df_test
df_test = df_test.drop("YrSold",axis=1)

## Preprocessing

### Numerical values

- Impute missing values using the median value of the column.
- Standardize the scale of each variable by removing the mean and scaling to unit variance (variance = 1).

In [11]:
# preprocessing for numeric columns
imp_median = SimpleImputer(strategy="median", add_indicator=True)
scaler = StandardScaler()

### Categorical variables

- Impute a "missing_value" string where values are missing.
- One hot encode all categorical variables.

In [12]:
# preprocessing for categorical columns
imp_constant = SimpleImputer(strategy="constant")
ohe = OneHotEncoder(handle_unknown="ignore")

### Create pipeline

Separate numerical and categorical variables, create pipeline to implement steps defined in preprocessing.

In [13]:
# select columns by data type
num_cols = make_column_selector(dtype_include="number")
cat_cols = make_column_selector(dtype_exclude="number")

In [14]:
# all preprocessing
preprocessor = make_column_transformer(
    (make_pipeline(imp_median, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols))

### Define prediction method

In this case, logistic regression.

In [15]:
# create a pipeline
preprocessing = make_pipeline(preprocessor)

In [16]:
# Define X (independent variables) and y (dependent variable)

X = df_train.drop(columns="SalePrice")
y = df_train["SalePrice"]

In [17]:
preprocessing.fit(X,y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000027C4F9AE788>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='constant')),
                                              

In [18]:
preprocessing.fit_transform(X,y)

array([[-1.73086488, -0.22087509, -0.20714171, ...,  0.        ,
         1.        ,  0.        ],
       [-1.7284922 ,  0.46031974, -0.09188637, ...,  0.        ,
         1.        ,  0.        ],
       [-1.72611953, -0.08463612,  0.07347998, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.72611953, -0.1754621 , -0.14781027, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.7284922 , -0.08463612, -0.08016039, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.73086488,  0.23325479, -0.05811155, ...,  0.        ,
         1.        ,  0.        ]])

In [23]:
df_train

ERROR! Session/line number was not unique in database. History logging moved to new session 120


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,SaleType,SaleCondition,SalePrice,YrMoSold
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,0.090909,WD,Normal,208500,2008.090909
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,0.363636,WD,Normal,181500,2007.363636
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,0.727273,WD,Normal,223500,2008.727273
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,0.090909,WD,Abnorml,140000,2006.090909
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,1.000000,WD,Normal,250000,2009.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,0.636364,WD,Normal,175000,2007.636364
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,0.090909,WD,Normal,210000,2010.090909
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,0.363636,WD,Normal,266500,2010.363636
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,0.272727,WD,Normal,142125,2010.272727


In [21]:
# Taken from Deep Learning with Python: Second Edition 4.25 Model Definition

from tensorflow import keras 
from tensorflow.keras import layers

def build_model():
    model = keras.Sequential([
        layers.Dense(64, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1)
    ])
    model.compile(optimizer="rmsprop", loss="mse", metrics=["mae"])
    return model

ERROR! Session/line number was not unique in database. History logging moved to new session 119


In [22]:
# K-fold validation parameters

k = 4
num_val_samples = len(df_train) // k
num_epochs = 100

In [None]:
all_scores = []

for i in range(k):
    print(f"Processing fold #{i}")
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
    val_targets = train_targets[i * num_val_samples: (i + 1) * num_val_samples]
    partial_train_data = np.concatenate(
        [train_data[:i * num_val_samples],
         train_data[(i + 1) * num_val_samples:]],
        axis=0)
    partial_train_targets = np.concatenate(
        [train_targets[:i * num_val_samples],
         train_targets[(i + 1) * num_val_samples:]],
        axis=0)
    model = build_model()
    model.fit(partial_train_data, partial_train_targets,
              epochs=num_epochs, batch_size=16, verbose=0)
    val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0)
    all_scores.append(val_mae)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Processing fold #0
Traceback (most recent call last):
  File "C:\Users\luis_\anaconda3\envs\datascience\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\luis_\AppData\Local\Temp/ipykernel_22356/1597906907.py", line 5, in <module>
    val_data = train_data[i * num_val_samples: (i + 1) * num_val_samples]
NameError: name 'train_data' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\luis_\anaconda3\envs\datascience\lib\site-packages\IPython\core\interactiveshell.py", line 2064, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\luis_\anaconda3\envs\datascience\lib\site-packages\IPython\core\ultratb.py", line 1101

In [21]:
# Define X_test
X_test = df_test

In [22]:
# make predictions based on model
predictions = pipe.predict(X_test)

In [23]:
predictions

array([114368., 129536., 182848., ..., 170432., 110528., 242304.])

In [24]:
len(predictions)

1459

In [25]:
len(df_test)

1459

In [26]:
d = {"Id":df_test["Id"],"SalePrice":predictions}
submission = pd.DataFrame(data=d)
submission

Unnamed: 0,Id,SalePrice
0,1461,114368.0
1,1462,129536.0
2,1463,182848.0
3,1464,185152.0
4,1465,216000.0
...,...,...
1454,2915,75904.0
1455,2916,73024.0
1456,2917,170432.0
1457,2918,110528.0


In [27]:
submission.to_csv("submission.csv",index=False)