Import libraries

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras import layers, models, optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

Load the dataset

In [None]:
# Folder path 
folder_path = 'D:\FPTUni\SP24\ADY201m\Lab05\ADY201m_Lab05_SE183256'
# Import the dataset
train_data = pd.read_csv(os.path.join(folder_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(folder_path, 'test.csv'))
output_path = os.path.join(folder_path, 'submission.csv')

In [None]:
# Save ID of test data
test_id = test_data['Id']

In [None]:
# Drop unnecessary columns
train_data = train_data.drop('Id', axis=1)

House Price Distribution

In [None]:
# Setup threshold for missing values

# === IMPORTANT ===


# Here we will put the variable to choose the threshold
threshold = 0.5


# === IMPORTANT ===

Numerical data distribution

In [None]:
# Because MSSubClass is int64 but it is a categorical variable, so we need to convert it to string
train_data['MSSubClass'] = train_data['MSSubClass'].astype(str)

In [None]:
# Get the categorical columns
categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()
# Delete all categorical columns
train_data = train_data.drop(categorical_cols, axis=1)

In [None]:
# Compute the correlation matrix with SalePrice
corr_matrix = train_data.corr()
corr_matrix['SalePrice'].sort_values(ascending=False)
# ABS of correlation coefficient
# Get the features that have the correlation coefficient with SalePrice greater than the threshold
selected_features = corr_matrix['SalePrice'][corr_matrix['SalePrice'].abs() > threshold].index.tolist()
# Display the selected features

In [None]:
# Create the scaler
scaler = StandardScaler()
scaler2 = StandardScaler()
# Fit the scaler to the train data
scaler.fit(train_data[selected_features])
# Transform the train data
train_data_scaled = scaler.transform(train_data[selected_features])

In [None]:
# Convert test data to same format with train data to predict, ( test data doest not have SalePrice column)
# remove saleprice column from selected_features and make new selected_features2
selected_features2 = selected_features.copy()
selected_features2.remove('SalePrice')
# if Id have in test data, we need to remove it, if not, we don't need to remove it
if 'Id' in test_data.columns:
    test_data = test_data.drop('Id', axis=1)
scaler2.fit(test_data[selected_features2])
test_data_scaled = scaler2.transform(test_data[selected_features2])

In [None]:
# Prepare the data
y_train = train_data_scaled[:, -1]
x_train = train_data_scaled[:, :-1]

# Split the train data into train and test data
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

In [None]:
# Create the model
model01 = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(len(selected_features) - 1,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear')
])

# Compile the model
model01.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Train the model
model01.fit(x_train, y_train, epochs=40, batch_size=32, validation_data=(x_test, y_test))

In [None]:
# Save model to file
model01.save('model01.house_price_prediction')

In [None]:
# Predict using the trained model
result_scaled = model01.predict(test_data_scaled)

# Combine scaled test data with scaled result
combined_data_scaled = np.concatenate((test_data_scaled, result_scaled), axis=1)
# Unscale the combined data to get the final result
result = scaler.inverse_transform(combined_data_scaled)[:, -1]

# Kiểm tra result có cái nào trống hoặc là số âm không, nếu có thì thay nó bằng 0
result = np.where(result < 0, 0, result)
result = np.where(np.isnan(result), 0, result)

In [None]:
# Create the submission file
submission = pd.DataFrame({'Id': test_id, 'SalePrice': result})
# Save the submission file
submission.to_csv(output_path, index=False)

In [35]:
# Predict using the trained model
result_scaled = model01.predict(x_test)
# Unscale the combined data to get the final result
result = scaler.inverse_transform(np.concatenate((x_test, result_scaled), axis=1))[:, -1]
y_test_unscaled = scaler.inverse_transform(np.concatenate((x_test, y_test.reshape(-1, 1)), axis=1))[:, -1]
# Display random 100 results
for i in range(10):
    print(f'Predicted: {result[i]}, Actual: {y_test_unscaled[i]}')

# Evaluate the model performance using Mean Squared Error
mse = mean_squared_error(y_test_unscaled, result)

# Print the Root Mean Squared Error
print("Root Mean Squared Error:", np.sqrt(mse))

Predicted: 153582.50820421384, Actual: 154500.0
Predicted: 333776.0984796712, Actual: 325000.0
Predicted: 106496.74789546207, Actual: 115000.0
Predicted: 142865.95086824795, Actual: 159000.0
Predicted: 338658.0430036284, Actual: 315500.0
Predicted: 86679.00381538676, Actual: 75500.0
Predicted: 196463.51086481693, Actual: 311500.0
Predicted: 153160.2971308707, Actual: 146000.0
Predicted: 84971.55678738217, Actual: 84500.0
Predicted: 132927.6647332985, Actual: 135500.0
Root Mean Squared Error: 36382.903801297856


In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error

# Convert data to PyTorch tensors and move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_train_tensor = torch.tensor(x_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1).to(device)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32).to(device)

# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.fc4 = nn.Linear(hidden_size3, hidden_size4)
        self.fc5 = nn.Linear(hidden_size4, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))  # Hidden layer 1 with ReLU activation
        x = self.relu(self.fc2(x))  # Hidden layer 2 with ReLU activation
        x = self.relu(self.fc3(x))  # Hidden layer 3 with ReLU activation
        x = self.relu(self.fc4(x))  # Hidden layer 4 with ReLU activation
        x = self.sigmoid(self.fc5(x))  # Output layer with Sigmoid activation
        return x

# Define model hyperparameters
input_size = x_train_tensor.shape[1]
hidden_size1 = 64
hidden_size2 = 32
hidden_size3 = 16
hidden_size4 = 8
output_size = 1

# Create the model instance and move it to GPU
model = MLP(input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, output_size).to(device)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
model.train()
for epoch in range(40):
    optimizer.zero_grad()
    outputs = model(x_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/40], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(x_test_tensor)
    test_loss = criterion(test_outputs, torch.tensor(y_test, dtype=torch.float32).unsqueeze(1).to(device))
    print(f'Test Loss: {test_loss.item():.4f}')

# Unscale the outputs to get the final result
outputs_unscaled = scaler.inverse_transform(test_outputs.cpu().numpy()).squeeze()

# Evaluate the model performance using Mean Squared Error
mse = mean_squared_error(y_test_unscaled, outputs_unscaled)

# Print the Root Mean Squared Error
print("Root Mean Squared Error:", np.sqrt(mse))


Epoch [1/40], Loss: 1.1603
Epoch [2/40], Loss: 1.1575
Epoch [3/40], Loss: 1.1546
Epoch [4/40], Loss: 1.1513
Epoch [5/40], Loss: 1.1475
Epoch [6/40], Loss: 1.1431
Epoch [7/40], Loss: 1.1381
Epoch [8/40], Loss: 1.1326
Epoch [9/40], Loss: 1.1265
Epoch [10/40], Loss: 1.1201
Epoch [11/40], Loss: 1.1132
Epoch [12/40], Loss: 1.1060
Epoch [13/40], Loss: 1.0983
Epoch [14/40], Loss: 1.0902
Epoch [15/40], Loss: 1.0816
Epoch [16/40], Loss: 1.0726
Epoch [17/40], Loss: 1.0630
Epoch [18/40], Loss: 1.0529
Epoch [19/40], Loss: 1.0422
Epoch [20/40], Loss: 1.0310
Epoch [21/40], Loss: 1.0192
Epoch [22/40], Loss: 1.0069
Epoch [23/40], Loss: 0.9940
Epoch [24/40], Loss: 0.9808
Epoch [25/40], Loss: 0.9671
Epoch [26/40], Loss: 0.9531
Epoch [27/40], Loss: 0.9389
Epoch [28/40], Loss: 0.9246
Epoch [29/40], Loss: 0.9102
Epoch [30/40], Loss: 0.8959
Epoch [31/40], Loss: 0.8819
Epoch [32/40], Loss: 0.8682
Epoch [33/40], Loss: 0.8549
Epoch [34/40], Loss: 0.8421
Epoch [35/40], Loss: 0.8300
Epoch [36/40], Loss: 0.8185
E

ValueError: non-broadcastable output operand with shape (146,1) doesn't match the broadcast shape (146,11)