Import libraries

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Report the interpreter and library versions so a run can be reproduced.
# `sys` and `matplotlib` are imported directly instead of being reached through
# `os.sys` / `plt.matplotlib`, which are incidental attributes and not public API.
import sys
import matplotlib

print(f'Python version: {sys.version}')
print(f'Pandas version: {pd.__version__}')
print(f'Numpy version: {np.__version__}')
print(f'Seaborn version: {sns.__version__}')
print(f'Matplotlib version: {matplotlib.__version__}')
# The model below is built with TensorFlow, so its version is reported as well
print(f'TensorFlow version: {tf.__version__}')

Load the dataset

In [None]:
# Describe folder path / You should change the folder path when you submit the code

# Print the full path of every CSV file found under the project folder.
# The path is a raw string: in a normal string literal, sequences such as \F or \S
# are unrecognized escapes, which Python reports with a warning and plans to reject.
for dirname, _, filenames in os.walk(r'D:\FPTUni\SP24\ADY201m\Lab05\ADY201m_Lab05_SE183256'):
    for filename in filenames:
        if filename.endswith('.csv'):
            print(os.path.join(dirname, filename))

In [None]:
# Folder path (raw string so the backslashes are kept literally and are never
# parsed as escape sequences)
folder_path = r'D:\FPTUni\SP24\ADY201m\Lab05\ADY201m_Lab05_SE183256'
# The submission file is written into the same folder; deriving it from
# folder_path means the location only has to be changed in one place
output_path = os.path.join(folder_path, 'submission.csv')
# Import the dataset
train_data = pd.read_csv(os.path.join(folder_path, 'train.csv'))
test_data = pd.read_csv(os.path.join(folder_path, 'test.csv'))

In [None]:
# Keep the identifiers of the test rows: select the 'Id' column by label
# across all rows
test_id = test_data.loc[:, 'Id']

In [None]:
# Drop unnecessary columns.
# errors='ignore' keeps this cell safe to re-run: once 'Id' has been removed,
# a plain drop would raise KeyError on the second execution.
train_data = train_data.drop(columns='Id', errors='ignore')

House Price Distribution

In [None]:
# Set the correlation threshold used for feature selection

# === IMPORTANT ===


# Tunable value: a feature is kept only when the absolute value of its
# correlation with SalePrice is greater than this threshold
threshold = 0.5


# === IMPORTANT ===

In [None]:
# Display summary statistics of SalePrice in the train dataset
print(train_data['SalePrice'].describe())
# And use the seaborn library to plot the SalePrice distribution.
# plt.figure (lowercase) creates a new current figure managed by pyplot; plt.Figure
# is the class itself, and instantiating it yields a detached figure that the plot
# below is never drawn on, so the requested size was silently ignored.
plt.figure(figsize=(10, 8))
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; it is kept here because the seaborn version is not pinned.
sns.distplot(train_data['SalePrice'], color='r', bins=100, hist_kws={'alpha': 0.4})

Numerical data distribution

In [None]:
# Tally how many columns of each dtype the train dataset contains
dtype_counts_before = train_data.dtypes.value_counts()
print(dtype_counts_before)

# MSSubClass holds integer codes but represents a category, so cast it to string
train_data['MSSubClass'] = train_data['MSSubClass'].astype(str)

# Tally the dtypes again to confirm the effect of the conversion
print("After change the type of MSSubClass:")
dtype_counts_after = train_data.dtypes.value_counts()
print(dtype_counts_after)

In [None]:
# Collect the names of every object-dtype (categorical) column
object_columns = train_data.select_dtypes(include=['object']).columns
categorical_cols = list(object_columns)
# Remove all of those columns from the train dataset
train_data = train_data.drop(columns=categorical_cols)

In [None]:
# Compute the correlation matrix and take the column of correlations with SalePrice
corr_matrix = train_data.corr()
saleprice_corr = corr_matrix['SalePrice']
# Absolute correlation coefficients, strongest first
print(saleprice_corr.abs().sort_values(ascending=False))
# Keep the features whose absolute correlation with SalePrice exceeds the threshold.
# SalePrice itself is appended explicitly as the LAST entry: later cells read the
# target from the last scaled column and remove 'SalePrice' from this list, so its
# presence and position must not depend on the CSV column order or on the threshold.
strong_mask = saleprice_corr.abs() > threshold
selected_features = [name for name in saleprice_corr[strong_mask].index if name != 'SalePrice']
selected_features.append('SalePrice')
# Display the selected features
print(selected_features)

In [None]:
# Instantiate two independent standardizers; only `scaler` is fitted in this cell,
# `scaler2` is created here and left unfitted
scaler, scaler2 = StandardScaler(), StandardScaler()
# Learn the statistics of the selected train columns and standardize them in one step
train_data_scaled = scaler.fit_transform(train_data[selected_features])

In [None]:
# Bring the test data into the same format as the train data so the model can
# predict on it (the test data does not have a SalePrice column).
# selected_features2 is selected_features without the target column
selected_features2 = [name for name in selected_features if name != 'SalePrice']
# Remove the Id column from the test data if it is still present
if 'Id' in test_data.columns:
    test_data = test_data.drop('Id', axis=1)
# Fit the feature scaler on the TRAINING data, not on the test data: the model is
# trained on inputs standardized with training statistics, so the test inputs must
# be standardized with the same mean/std or they land on a different scale.
scaler2.fit(train_data[selected_features2])
# Fill any missing test values with the training median so no NaN reaches the model
test_features = test_data[selected_features2].fillna(train_data[selected_features2].median())
test_data_scaled = scaler2.transform(test_features)

In [None]:
# Separate the scaled matrix into inputs and target: the last column is the
# target, every column before it is an input feature
scaled_features, scaled_target = train_data_scaled[:, :-1], train_data_scaled[:, -1]

# Hold out 20% of the rows with a fixed random state
x_train, x_test, y_train, y_test = train_test_split(
    scaled_features,
    scaled_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed NumPy and TensorFlow before building the model so that runs are as
# repeatable as possible (same seed value as the train/validation split)
np.random.seed(42)
tf.random.set_seed(42)

# Create the model: a stack of fully connected ReLU layers ending in a single
# linear output unit; the input width is the number of selected features
# without the target column
model01 = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(len(selected_features) - 1,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='linear')
])

# Compile the model: mean squared error loss, mean absolute error as extra metric
model01.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Train the model, evaluating on the held-out split after every epoch
model01.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))

In [None]:
# Run the trained network on the scaled test features
result_scaled = model01.predict(test_data_scaled)
# Append the predictions as an extra column after the feature columns, so the
# matrix has one column per entry of selected_features
combined_data_scaled = np.hstack((test_data_scaled, result_scaled))
# Undo the scaling on the whole matrix and keep only its last column
unscaled_matrix = scaler.inverse_transform(combined_data_scaled)
result = unscaled_matrix[:, -1]

In [None]:
# Assemble the submission table: test identifiers alongside the predicted prices
submission_columns = {'Id': test_id, 'SalePrice': result}
submission = pd.DataFrame(data=submission_columns)
# Write the table to the output path without the index column
submission.to_csv(output_path, index=False)

In [None]:
# Sanity check: predict on the training inputs and compare with the known targets.
# NOTE: this reuses the names result_scaled / result, replacing the test-set
# predictions; re-run the test prediction cell before writing a submission again.
result_scaled = model01.predict(x_train)
# Rebuild a matrix with the features followed by one extra column, undo the
# scaling with the train scaler and keep only the last column
result = scaler.inverse_transform(np.concatenate((x_train, result_scaled), axis=1))[:, -1]
y_train_unscaled = scaler.inverse_transform(np.concatenate((x_train, y_train.reshape(-1, 1)), axis=1))[:, -1]
# Display the first 10 results (fewer if the training split has fewer rows).
# Slicing with zip cannot run past the end of the arrays, unlike indexing over range(10).
for predicted, actual in zip(result[:10], y_train_unscaled[:10]):
    print(f'Predicted: {predicted}, Actual: {actual}')