Import libraries

In [19]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras import layers, models, optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

Load the dataset

In [20]:
# Folder that holds train.csv / test.csv and receives submission.csv.
# Written as a raw string: the Windows path contains backslash pairs (\F, \S, \A, \L)
# that are invalid escape sequences in a normal string literal. Python currently keeps
# them as-is but emits a warning, and this is slated to become an error.
# The resulting string value is unchanged.
# NOTE(review): machine-specific absolute path - adjust it when running elsewhere.
folder_path = r'D:\FPTUni\SP24\ADY201m\Lab05\ADY201m_Lab05_SE183256'
# Load the training and test sets
train_data = pd.read_csv(os.path.join(folder_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(folder_path, 'test.csv'))
# Destination of the submission file written by a later cell
output_path = os.path.join(folder_path, 'submission.csv')

In [21]:
# Keep the test-set Id column aside before it is dropped; the submission file needs it
test_id = test_data.loc[:, 'Id']

In [22]:
# Remove the Id column from the training frame
train_data = train_data.drop(columns=['Id'])

House Price Distribution

In [23]:
# Set up the correlation threshold used for feature selection

# === IMPORTANT ===


# Tunable value: a column is kept as a feature only when the absolute value of its
# correlation with SalePrice is strictly greater than this threshold
threshold = 0.5


# === IMPORTANT ===

Numerical data distribution

In [24]:
# MSSubClass is stored as int64 but is really a categorical code, so cast it to string
train_data = train_data.astype({'MSSubClass': str})

In [25]:
# Names of all object-dtype columns (treated here as the categorical ones)
categorical_cols = list(train_data.select_dtypes(include='object').columns)
# Keep only the remaining columns
train_data = train_data.drop(columns=categorical_cols)

In [26]:
# Pairwise correlation matrix of the remaining columns
corr_matrix = train_data.corr()
# Correlation of every column with the target
target_corr = corr_matrix['SalePrice']
# Keep the columns whose absolute correlation with SalePrice exceeds the threshold
strong_features = target_corr[target_corr.abs() > threshold].index.tolist()
# Put SalePrice explicitly LAST: later cells slice the scaled matrix with
# [:, :-1] (inputs) and [:, -1] (target), so the target must be the final column
# regardless of where it sits in the original frame.
selected_features = [col for col in strong_features if col != 'SalePrice'] + ['SalePrice']
# Display the selected features with their correlation to SalePrice, strongest first
# (as the last expression of the cell, so it is actually rendered)
target_corr[selected_features].sort_values(ascending=False)

In [27]:
# Two standardisers: `scaler` handles the selected training columns (features plus
# SalePrice); `scaler2` is used by the following cell when scaling the test features
scaler, scaler2 = StandardScaler(), StandardScaler()
# Learn the per-column statistics from the training data and standardise it in one step
train_data_scaled = scaler.fit_transform(train_data[selected_features])

In [28]:
# Bring the test data into the same format as the training inputs
# (the test data does not have a SalePrice column).
# Feature list without the target column
selected_features2 = selected_features.copy()
selected_features2.remove('SalePrice')
# Drop Id from the test data if it is still present
if 'Id' in test_data.columns:
    test_data = test_data.drop('Id', axis=1)
# Fit the feature scaler on the TRAINING features, not on the test set. The model is
# trained on inputs standardised with training statistics and its predictions are
# un-scaled with `scaler` (also training statistics); standardising the test inputs
# with test-set statistics would feed the model differently scaled values.
scaler2.fit(train_data[selected_features2])
test_data_scaled = scaler2.transform(test_data[selected_features2])
# StandardScaler passes missing values through as NaN, which would become NaN
# predictions. Replace any NaN with 0, i.e. the training mean in standardised space.
test_data_scaled = np.where(np.isnan(test_data_scaled), 0.0, test_data_scaled)

In [29]:
# Separate inputs from target: the last column of the scaled matrix is taken as the
# target (this assumes SalePrice is the final entry of selected_features)
x_train, y_train = train_data_scaled[:, :-1], train_data_scaled[:, -1]

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are repeatable on Restart & Run All
tf.keras.utils.set_random_seed(42)

# Create the model: a fully connected regressor with one linear output unit.
# The input width is the number of selected features minus the SalePrice target.
model01 = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(len(selected_features) - 1,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear')
])

# Compile the model: mean squared error as the loss, mean absolute error as a metric
model01.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Train the model, validating on the held-out split after every epoch.
# The History object is kept in a variable so the per-epoch metrics can be inspected
# later and its repr does not become the cell output.
history = model01.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x1a90e0ce550>

In [31]:
# Save model to file
# model01.save('model01.house_price_prediction')

In [32]:
# Scaled SalePrice predictions for the test inputs
result_scaled = model01.predict(test_data_scaled)
# `scaler` was fitted on features plus SalePrice, so append the predictions as the
# final column to give it a matrix of the width it expects
combined_data_scaled = np.hstack((test_data_scaled, result_scaled))
# Undo the scaling and keep only the last column (the predicted SalePrice)
result = scaler.inverse_transform(combined_data_scaled)[:, -1]



In [33]:
# Build the two-column submission table: the saved test Ids and the predicted prices
submission = test_id.to_frame(name='Id').assign(SalePrice=result)
# Write it to disk without the index column
submission.to_csv(path_or_buf=output_path, index=False)

In [34]:
# Sanity check: predict on the training split and compare with the known prices.
# NOTE: this overwrites `result_scaled` and `result`, which previously held the
# test-set predictions - re-run the test prediction cell before re-creating the
# submission.
result_scaled = model01.predict(x_train)
# `scaler` expects features plus target, so append the column to un-scale
# (predictions, then the true targets) and keep only the last column of each result
result = scaler.inverse_transform(np.concatenate((x_train, result_scaled), axis=1))[:, -1]
y_train_unscaled = scaler.inverse_transform(np.concatenate((x_train, y_train.reshape(-1, 1)), axis=1))[:, -1]
# Display the first 10 results (fewer if the split has fewer than 10 rows,
# instead of raising an IndexError)
for predicted, actual in zip(result[:10], y_train_unscaled[:10]):
    print(f'Predicted: {predicted}, Actual: {actual}')

Predicted: 136833.8934590456, Actual: 145000.0
Predicted: 179954.39197955275, Actual: 178000.0
Predicted: 85088.87235357052, Actual: 85000.0
Predicted: 174461.05843442984, Actual: 175000.0
Predicted: 126077.05386564496, Actual: 127000.0
Predicted: 131981.56183785124, Actual: 149900.0
Predicted: 173659.86752830193, Actual: 174000.0
Predicted: 126698.56507612474, Actual: 125500.0
Predicted: 174695.64282329395, Actual: 175500.0
Predicted: 260852.71602927742, Actual: 225000.0
