Import libraries

In [1]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras import layers, models, optimizers
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

Load the dataset

In [2]:
# Folder that holds train.csv / test.csv and where submission.csv is written.
# Raw string: the Windows backslashes must be taken literally; in a normal
# string literal "\F", "\S", "\A" and "\L" are invalid escape sequences.
folder_path = r'D:\FPTUni\SP24\ADY201m\Lab05\ADY201m_Lab05_SE183256'
# Import the dataset
train_data = pd.read_csv(os.path.join(folder_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(folder_path, 'test.csv'))
# Destination of the final predictions file
output_path = os.path.join(folder_path, 'submission.csv')

In [3]:
# Keep the test-set Id column aside; it is needed again when the submission is built
test_id = test_data.loc[:, 'Id']

In [4]:
# Drop the Id column from the training data; it is not used as a feature.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError once Id is already gone.
train_data = train_data.drop(columns=['Id'], errors='ignore')

House Price Distribution

In [5]:
# Setup the threshold used for feature selection

# === IMPORTANT ===


# Minimum absolute correlation with SalePrice that a feature must exceed to be
# kept (it is compared against |corr| in the feature-selection cell below)
threshold = 0.5


# === IMPORTANT ===

Numerical data distribution

In [6]:
# MSSubClass is stored as int64 but is really a categorical code, so cast it to
# string; that way it is treated like the other categorical columns
train_data = train_data.astype({'MSSubClass': str})

In [7]:
# Names of every object-dtype (categorical) column
categorical_cols = list(train_data.select_dtypes(include='object').columns)
# Keep only the remaining columns by dropping all categorical ones
train_data = train_data.drop(columns=categorical_cols)

In [8]:
# Pairwise correlation matrix of the remaining columns
corr_matrix = train_data.corr()
# Correlation of every column with the target
saleprice_corr = corr_matrix['SalePrice']
# Keep the features whose ABSOLUTE correlation with SalePrice exceeds the threshold.
# SalePrice itself is part of the list (self-correlation is 1, for any threshold < 1).
selected_features = saleprice_corr[saleprice_corr.abs() > threshold].index.tolist()
# Display the selected features (last expression of the cell is rendered)
selected_features

In [9]:
# Two scaler instances: `scaler` handles the selected training columns
# (SalePrice included); `scaler2` is used by a later cell
scaler, scaler2 = StandardScaler(), StandardScaler()
# Fit on the selected training columns, then standardise those same columns
train_selected = train_data[selected_features]
train_data_scaled = scaler.fit(train_selected).transform(train_selected)

In [10]:
# Bring the test data into the same format as the training features so the model
# can predict on it. The test data does not have a SalePrice column, so build a
# second feature list without it.
selected_features2 = [col for col in selected_features if col != 'SalePrice']
# Drop Id from the test data only if it is still present (keeps the cell re-runnable)
if 'Id' in test_data.columns:
    test_data = test_data.drop('Id', axis=1)
# Fit the feature scaler on the TRAINING features, not on the test set: the model is
# trained on data standardised with the training mean/std, so the test features must
# be standardised with exactly the same statistics to be on the same scale.
scaler2.fit(train_data[selected_features2])
test_data_scaled = scaler2.transform(test_data[selected_features2])

In [34]:
# Last scaled column is used as the target, everything before it as features
# (assumes SalePrice is the last entry of selected_features — TODO confirm)
x_train, y_train = train_data_scaled[:, :-1], train_data_scaled[:, -1]

# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

In [44]:
# Random forest with 100 trees, fitted on the scaled training split
model_rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(x_train, y_train)
# Evaluate on the held-out split. y_test is still standardised here, so this
# RMSE is expressed in scaled units, not in SalePrice units.
predictions_rf = model_rf.predict(x_test)
mse_rf = mean_squared_error(y_test, predictions_rf)
print("Root Mean Squared Error (Random Forest):", np.sqrt(mse_rf))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x22ecba3c2d0>

In [38]:
# Save model to file
# NOTE(review): `model01` is not defined in any cell visible in this notebook —
# presumably it is the Keras model whose training log appears above. Confirm the
# cell that builds and fits it is present, otherwise Restart & Run All fails here.
model01.save('model01.house_price_prediction')

INFO:tensorflow:Assets written to: model01.house_price_prediction\assets


INFO:tensorflow:Assets written to: model01.house_price_prediction\assets


In [45]:
# Predict (still scaled) sale prices for the test set with the trained model
result_scaled = model01.predict(test_data_scaled)

# `scaler` was fitted on the features plus the target, so place the predictions
# next to the scaled features as the last column before undoing the scaling
combined_data_scaled = np.concatenate([test_data_scaled, result_scaled], axis=1)
# Only the last column (the prediction in original units) is kept
result = scaler.inverse_transform(combined_data_scaled)[:, -1]

# Check whether any prediction is negative or missing (NaN); replace those with 0
invalid_prediction = (result < 0) | np.isnan(result)
result = np.where(invalid_prediction, 0, result)





In [46]:
# Assemble the submission table: one row per test Id with its predicted price
submission_columns = {'Id': test_id, 'SalePrice': result}
submission = pd.DataFrame(submission_columns)
# Write it to disk without the DataFrame index
submission.to_csv(path_or_buf=output_path, index=False)

In [47]:
# Predict on the held-out split using the trained model
result_scaled = model01.predict(x_test)
# `scaler` was fitted on features + target together, so append the (scaled) target
# column to the features, undo the scaling, and keep only the last column
result = scaler.inverse_transform(np.concatenate((x_test, result_scaled), axis=1))[:, -1]
y_test_unscaled = scaler.inverse_transform(np.concatenate((x_test, y_test.reshape(-1, 1)), axis=1))[:, -1]
# Show the first 10 predictions next to the actual values; slicing (instead of
# indexing 0..9) also works when the held-out split has fewer than 10 rows
for predicted, actual in zip(result[:10], y_test_unscaled[:10]):
    print(f'Predicted: {predicted}, Actual: {actual}')

# Evaluate the model performance using Mean Squared Error (in SalePrice units)
mse = mean_squared_error(y_test_unscaled, result)

# Print the Root Mean Squared Error
print("Root Mean Squared Error:", np.sqrt(mse))

Predicted: 147307.8910161949, Actual: 154500.0
Predicted: 323625.4334910995, Actual: 325000.0
Predicted: 88155.21837856318, Actual: 115000.0
Predicted: 111177.17226127947, Actual: 159000.0
Predicted: 309399.4252983587, Actual: 315500.0
Predicted: 76938.28061563754, Actual: 75500.0
Predicted: 222791.40736765804, Actual: 311500.0
Predicted: 150296.49370439455, Actual: 146000.0
Predicted: 76041.92228463, Actual: 84500.0
Predicted: 117970.02944820898, Actual: 135500.0
Root Mean Squared Error: 32791.48388277687
