## Import Packages

In [95]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from scipy import stats

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Data Load and Augmentation

In [96]:
# Load the dataset
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
data = pd.read_csv(file_path)

# Identify categorical columns for One-Hot Encoding
categorical_features = ['HEAT', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'USECODE']

# Apply One-Hot Encoding
onehot_encoder = ColumnTransformer([("onehot", OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=int), categorical_features)], remainder='passthrough')
data_encoded = onehot_encoder.fit_transform(data)

# Update column names for one-hot encoded features
encoded_feature_names = onehot_encoder.named_transformers_['onehot'].get_feature_names_out(categorical_features)
data_encoded_df = pd.DataFrame(data_encoded, columns=list(encoded_feature_names) + list(data.columns.drop(categorical_features)))

# Handle zero values in certain columns to prevent division errors in interaction features
data_encoded_df = data_encoded_df.replace({'LANDAREA': {0: np.nan}, 'ROOMS': {0: np.nan}})

# Create interaction features
data_encoded_df['Rooms_Bathrooms'] = data_encoded_df['ROOMS'] * data_encoded_df['BATHRM']
data_encoded_df['Building_Density'] = data_encoded_df['GBA'] / data_encoded_df['LANDAREA']
data_encoded_df['Bedroom_Room_Ratio'] = data_encoded_df['BEDRM'] / data_encoded_df['ROOMS']

# Fill NaN values after division
data_encoded_df = data_encoded_df.fillna({'Building_Density': 0, 'Bedroom_Room_Ratio': 0})

# Handle invalid values
data_encoded_df = data_encoded_df[
    (data_encoded_df['YR_RMDL'] >= 1000) & (data_encoded_df['YR_RMDL'] <= 2023) &
    (data_encoded_df['AYB'] >= 1000) & (data_encoded_df['AYB'] <= 2023) &
    (data_encoded_df['EYB'] >= 1000) & (data_encoded_df['EYB'] <= 2023) &
    (data_encoded_df['GBA'] > 0) & (data_encoded_df['LANDAREA'] > 0) 
]


# Convert SALEDATE to datetime format and derive date-based features
data_encoded_df['SALEDATE'] = pd.to_datetime(data_encoded_df['SALEDATE'])
data_encoded_df['Property_Age'] = 2023 - data_encoded_df['AYB']
data_encoded_df['Years_Since_Remodel'] = 2023 - data_encoded_df['YR_RMDL']
data_encoded_df['Years_Between_Built_and_Remodel'] = data_encoded_df['YR_RMDL'] - data_encoded_df['AYB']
data_encoded_df['Sale_Year'] = data_encoded_df['SALEDATE'].dt.year
data_encoded_df['Sale_Month'] = data_encoded_df['SALEDATE'].dt.month

# Define features for clustering (using numeric features only for simplicity)
features_for_clustering = ['ROOMS', 'BATHRM', 'LANDAREA', 'GBA', 'PRICE']
data_cluster = data_encoded_df[features_for_clustering].dropna()

# Preserve original index for later merge
data_cluster = data_cluster.reset_index()  # This adds the original index as a column

# Normalize features for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_cluster[features_for_clustering])

# Apply K-Means clustering
n_clusters = 5  # Set an appropriate number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
data_cluster['Cluster'] = kmeans.fit_predict(data_scaled)

# Merge cluster labels back to original data
data_encoded_df = data_encoded_df.merge(data_cluster[['index', 'Cluster']], left_index=True, right_on='index', how='left')
data_encoded_df.drop(columns=['index'], inplace=True)

# Remove rows with NaN in Cluster column for SMOTE application
data_with_clusters = data_encoded_df.dropna(subset=['Cluster'])

# Placeholder for augmented data
augmented_data = pd.DataFrame()

# Apply random oversampling within each cluster
for cluster in data_with_clusters['Cluster'].unique():
    # Select data for the current cluster
    cluster_data = data_with_clusters[data_with_clusters['Cluster'] == cluster]
    
    # Set the desired size for oversampling (e.g., double the original size of each cluster)
    target_size = len(cluster_data) * 2
    
    # Perform random oversampling
    cluster_augmented = resample(cluster_data, replace=True, n_samples=target_size, random_state=42)
    
    # Append to augmented data
    augmented_data = pd.concat([augmented_data, cluster_augmented])

# Combine original data with augmented data
final_data = pd.concat([data_encoded_df, augmented_data], ignore_index=True)

# Check final dataset shape and display a sample
print("Original Data Shape (before encoding):", data.shape)
print("Data Encoded Shape:", data_encoded_df.shape)
print("Augmented Data Shape:", augmented_data.shape)
print("Final Data Shape:", final_data.shape)
final_data.head()
final_data.describe()

Original Data Shape (before encoding): (109034, 39)
Data Encoded Shape: (56567, 176)
Augmented Data Shape: (105890, 176)
Final Data Shape: (162457, 176)


Unnamed: 0,ROOMS,LANDAREA,Building_Density,Bedroom_Room_Ratio,Sale_Year,Sale_Month,Cluster
count,162319.0,162457.0,162457.0,162457.0,162457.0,162457.0,158835.0
mean,7.695291,3382.282943,0.789196,0.480144,2009.942532,6.383172,0.966796
std,2.402255,6489.465822,0.461382,0.142796,24.37918,3.437669,0.94802
min,1.0,216.0,0.017211,0.0,1900.0,1.0,0.0
25%,6.0,1520.0,0.42171,0.4,2009.0,4.0,0.0
50%,7.0,2217.0,0.703422,0.5,2017.0,6.0,1.0
75%,9.0,4263.0,1.05905,0.5,2021.0,9.0,2.0
max,48.0,942632.0,4.988943,8.0,2024.0,12.0,4.0


## Data Preprocessing

In [97]:
data = final_data.copy()

# Step 1: Handle missing values
# Fill missing values in numeric columns with the median
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Fill missing values in categorical columns with the mode
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Check that missing values have been handled
print("Missing values after processing:", data.isnull().sum().sum())

# Step 2: Standardize numeric features
# Standardize numeric features using StandardScaler
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Step 3: Detect and handle outliers
# # Use Z-score method to detect outliers; replace values with NaN if Z-score > 3
# z_scores = np.abs(stats.zscore(data[numeric_columns]))
# data[numeric_columns] = np.where(z_scores > 3, np.nan, data[numeric_columns])
# # Fill the NaN values resulting from outliers with the median
# data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Step 4: Split the dataset
# Assuming 'PRICE' is the target variable
X = data.drop(columns=['PRICE'])
y = data['PRICE']

# Ensure that only numeric columns are included in X for model training
X_numeric = X.select_dtypes(include=[np.number])

# Split the data into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Check the shape of training and test sets
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Missing values after processing: 0
Training set shape: (129965, 162)
Test set shape: (32492, 162)


## 處理有問題的數據

In [98]:
# 檢查每個特徵的數據類型
# print("Data Types for Each Feature:\n", data.dtypes)

# 計算每個數值特徵的描述統計量（包括 Q1, Q2 (中位數), Q3）
numeric_data = data.drop(columns=categorical_columns)
numeric_summary = numeric_data.describe(percentiles=[0.25, 0.5, 0.75]).T  # 使用轉置方便閱讀
numeric_summary['IQR'] = numeric_summary['75%'] - numeric_summary['25%']  # 計算 IQR

# 顯示每個特徵的四分位數信息
print("\nNumeric Features Summary (including Q1, Q2, Q3, IQR):\n", numeric_summary)

# 檢查極端值（異常值）百分比
# 計算 Z-score 超過閾值的異常值比例
outliers_percentage = {}

for col in numeric_data.columns:
    z_scores = stats.zscore(data[col].dropna())  # 排除缺失值，計算 Z-score
    outliers_percentage[col] = (np.abs(z_scores) > 3).mean() * 100  # 計算 Z-score > 3 的異常值比例

outliers_percentage_df = pd.DataFrame.from_dict(outliers_percentage, orient='index', columns=['Outliers (%)'])
print("\nPercentage of Outliers in Each Numeric Feature:\n", outliers_percentage_df)



Numeric Features Summary (including Q1, Q2, Q3, IQR):
                        count          mean       std       min       25%  \
ROOMS               162457.0  3.918860e-17  1.000003 -2.787934 -0.705739   
LANDAREA            162457.0 -2.099389e-17  1.000003 -0.487913 -0.286971   
Building_Density    162457.0 -3.359023e-17  1.000003 -1.673208 -0.796493   
Bedroom_Room_Ratio  162457.0 -6.613077e-17  1.000003 -3.362458 -0.561252   
Sale_Year           162457.0  8.908409e-16  1.000003 -4.509703 -0.038661   
Sale_Month          162457.0  6.210694e-17  1.000003 -1.565942 -0.693254   
Cluster             162457.0  1.119674e-17  1.000003 -1.032146 -1.032146   

                         50%       75%         max       IQR  
ROOMS              -0.289300  0.543578   16.784698  1.249317  
LANDAREA           -0.179566  0.135715  144.734965  0.422686  
Building_Density   -0.185907  0.584884    9.102572  1.381378  
Bedroom_Room_Ratio  0.139049  0.139049   52.661645  0.700301  
Sale_Year           

In [99]:
# 檢查 One-Hot Encoding 特徵的唯一值
one_hot_columns = [col for col in data.columns if 'HEAT_' in col or 'STYLE_' in col or 'STRUCT_' in col]  # 替換為您使用 One-Hot Encoding 的特徵前綴
for col in one_hot_columns:
    print(f"{col} unique values:", data[col].unique())


HEAT_0.0 unique values: [0 1]
HEAT_1.0 unique values: [0 1]
HEAT_2.0 unique values: [0 1]
HEAT_3.0 unique values: [0 1]
HEAT_4.0 unique values: [0 1]
HEAT_5.0 unique values: [0 1]
HEAT_6.0 unique values: [0 1]
HEAT_7.0 unique values: [0 1]
HEAT_8.0 unique values: [1 0]
HEAT_9.0 unique values: [0 1]
HEAT_10.0 unique values: [0 1]
HEAT_11.0 unique values: [0 1]
HEAT_12.0 unique values: [0 1]
HEAT_13.0 unique values: [0 1]
HEAT_nan unique values: [0 1]
STYLE_0.0 unique values: [0 1]
STYLE_1.0 unique values: [0 1]
STYLE_2.0 unique values: [0 1]
STYLE_3.0 unique values: [0 1]
STYLE_4.0 unique values: [0 1]
STYLE_5.0 unique values: [0 1]
STYLE_6.0 unique values: [0 1]
STYLE_7.0 unique values: [0 1]
STYLE_8.0 unique values: [0 1]
STYLE_9.0 unique values: [0 1]
STYLE_10.0 unique values: [1 0]
STYLE_11.0 unique values: [0 1]
STYLE_12.0 unique values: [0 1]
STYLE_13.0 unique values: [0 1]
STYLE_14.0 unique values: [0 1]
STYLE_15.0 unique values: [0 1]
STYLE_94.0 unique values: [0]
STYLE_99.0 uni

In [100]:
data['Years_Since_Remodel'] = data['Years_Since_Remodel'].apply(lambda x: x if x >= 0 else np.nan)
data['Years_Between_Built_and_Remodel'] = data['Years_Between_Built_and_Remodel'].apply(lambda x: x if x >= 0 else np.nan)

# 用中位數填補 NaN
data['Years_Since_Remodel'].fillna(data['Years_Since_Remodel'].median(), inplace=True)
data['Years_Between_Built_and_Remodel'].fillna(data['Years_Between_Built_and_Remodel'].median(), inplace=True)


In [101]:
print("Unique values in Sale_Month:", data['Sale_Month'].unique())
print("Unique values in Cluster:", data['Cluster'].unique())


Unique values in Sale_Month: [ 0.47032823  0.17943255  1.05211959 -0.69325449  1.63391095 -0.11146313
 -1.27504585 -1.56594153 -0.40235881 -0.98415017  1.34301527  0.76122391]
Unique values in Cluster: [ 2.16818699  0.03463184  1.10140941 -1.03214573  3.23496456]


In [102]:
from sklearn.preprocessing import StandardScaler

# 恢復原始數據
data = final_data.copy()  # 使用未經標準化的原始數據進行重新處理

# 填充缺失值
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# 確認不需要標準化的特徵（例如 Sale_Month 和 Cluster）
exclude_columns = ['Sale_Month', 'Cluster']
numeric_columns_for_scaling = [col for col in numeric_columns if col not in exclude_columns]

# 對需要標準化的數值特徵進行標準化
scaler = StandardScaler()
data[numeric_columns_for_scaling] = scaler.fit_transform(data[numeric_columns_for_scaling])

# 確認 Sale_Month 和 Cluster 列保持原始值
print("Unique values in Sale_Month after processing:", data['Sale_Month'].unique())
print("Unique values in Cluster after processing:", data['Cluster'].unique())


Unique values in Sale_Month after processing: [ 8  7 10  4 12  6  2  1  5  3 11  9]
Unique values in Cluster after processing: [3. 1. 2. 0. 4.]


In [103]:
# 將 Cluster 列轉換為整數型
data['Cluster'] = data['Cluster'].astype(int)

# 檢查轉換後的值
print("Unique values in Cluster after conversion:", data['Cluster'].unique())

Unique values in Cluster after conversion: [3 1 2 0 4]


## 去除極端值

In [104]:
def remove_outliers_iqr(df, columns, iqr_multiplier=1.5):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR
        df[column] = np.where((df[column] < lower_bound) | (df[column] > upper_bound), np.nan, df[column])
    df = df.dropna()
    return df

# 使用更大的 IQR 邊界（例如 2 或 2.5）去除極端值
data_cleaned = remove_outliers_iqr(data, numeric_columns, iqr_multiplier=1.5)
print("Data shape after adjusting IQR threshold:", data_cleaned.shape)


Data shape after adjusting IQR threshold: (126775, 176)


In [105]:
from scipy import stats
import numpy as np

def remove_outliers_zscore(df, columns, z_threshold=3):
    for column in columns:
        # 計算 Z-score，並保持原始數據的形狀
        z_scores = np.abs(stats.zscore(df[column].fillna(df[column].median())))  # 填充缺失值以便計算 Z-score
        # 將 Z-score 超過閾值的數值標記為 NaN
        df[column] = np.where(z_scores > z_threshold, np.nan, df[column])
    # 刪除包含 NaN 的行
    df = df.dropna()
    return df

# 使用 Z-score 方法去除異常值
data_cleaned = remove_outliers_zscore(data, numeric_columns, z_threshold=3)
print("Data shape after removing outliers with Z-score:", data_cleaned.shape)


Data shape after removing outliers with Z-score: (125429, 176)


In [106]:
# 針對特定特徵使用更嚴格的異常值閾值
important_columns = ['PRICE', 'ROOMS', 'GBA']  # 替換為您認為重要的特徵
data_important_cleaned = remove_outliers_iqr(data, important_columns, iqr_multiplier=1.5)
data_important_cleaned = remove_outliers_zscore(data_important_cleaned, important_columns, z_threshold=2.5)
print("Data shape after stricter outlier removal for key features:", data_important_cleaned.shape)


Data shape after stricter outlier removal for key features: (113135, 176)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = np.where(z_scores > z_threshold, np.nan, df[column])


In [107]:
def quantile_clipping(df, columns, lower_quantile=0.025, upper_quantile=0.975):
    for column in columns:
        lower_bound = df[column].quantile(lower_quantile)
        upper_bound = df[column].quantile(upper_quantile)
        df[column] = np.clip(df[column], lower_bound, upper_bound)
    return df

# 使用分位數截斷法去除異常值
data_clipped = quantile_clipping(data, numeric_columns)
print("Data shape after quantile clipping:", data_clipped.shape)


Data shape after quantile clipping: (162457, 176)


## 處理缺失值（NaN 值）

In [108]:
from sklearn.impute import SimpleImputer

# Step 1: Initial NaN handling for numeric and categorical columns
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# 確認缺失值已處理
print("Missing values after initial processing:", data.isnull().sum().sum())

Missing values after initial processing: 0


## 標準化與異常值處理

In [109]:
from scipy import stats
import numpy as np

# Step 2: Standardize numeric features
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Detect and handle outliers using Z-score with an adjusted threshold
z_scores = np.abs(stats.zscore(data[numeric_columns]))
data[numeric_columns] = np.where(z_scores > 2.5, np.nan, data[numeric_columns])
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

## 分割數據集並進行最終的 NaN 填補

In [110]:
from sklearn.model_selection import train_test_split

# Transform target variable and split dataset
y = np.log1p(data['PRICE'])  # Apply log transformation to target variable
X = data.drop(columns=['PRICE'])

# 確保只有數值列被用於 PCA
X_numeric = X.select_dtypes(include=[np.number])
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Fill any remaining NaN values with median in both training and test sets
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

  result = getattr(ufunc, method)(*inputs, **kwargs)


## 使用 PCA 降維

In [111]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print("Original feature count:", X_train.shape[1])
print("Reduced feature count with PCA:", X_train_pca.shape[1])

Original feature count: 162
Reduced feature count with PCA: 30


## Check NaN

In [112]:
print("NaN values in y_train:", y_train.isnull().sum())
print("NaN values in y_test:", np.isnan(y_test).sum())
print("NaN values in y_pred_lr:", np.isnan(y_pred_lr).sum())


NaN values in y_train: 36416
NaN values in y_test: 9023
NaN values in y_pred_lr: 0


## Processing NAN

In [113]:
# 處理 y_train 和 y_test 中的 NaN 值
y_train.fillna(y_train.median(), inplace=True)
y_test = y_test.fillna(y_test.median())
print("NaN values in y_train after processing:", y_train.isnull().sum())
print("NaN values in y_test after processing:", np.isnan(y_test).sum())

# 處理 y_pred_lr 中的 NaN 值
y_pred_lr = np.nan_to_num(y_pred_lr, nan=np.mean(y_pred_lr))
print("NaN values in y_pred_lr after processing:", np.isnan(y_pred_lr).sum())

NaN values in y_train after processing: 0
NaN values in y_test after processing: 0
NaN values in y_pred_lr after processing: 0


## 訓練和評估模型

In [114]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train and evaluate Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train_pca, y_train)
y_pred_lr = np.expm1(linear_model.predict(X_test_pca))

# Train and evaluate Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_pca, y_train)
y_pred_rf = np.expm1(rf_model.predict(X_test_pca))

# Calculate RMSE and R^2 for both models
rmse_lr = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred_lr))
r2_lr = r2_score(np.expm1(y_test), y_pred_lr)
rmse_rf = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred_rf))
r2_rf = r2_score(np.expm1(y_test), y_pred_rf)

print("Linear Regression RMSE:", rmse_lr, "R^2:", r2_lr)
print("Random Forest RMSE:", rmse_rf, "R^2:", r2_rf)

Linear Regression RMSE: 0.5433593670207059 R^2: 0.330446982360443
Random Forest RMSE: 0.2031341384585862 R^2: 0.9064214119450209
