## Data Load and Augmentation

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
# from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from sklearn.utils import resample

# Load the dataset
file_path = 'Computer_Assisted_Mass_Appraisal_-_Residential.csv'
data = pd.read_csv(file_path)

# Identify categorical columns for One-Hot Encoding
categorical_features = ['HEAT', 'STYLE', 'STRUCT', 'GRADE', 'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'USECODE']

# Apply One-Hot Encoding; all remaining columns are passed through unchanged
# NOTE(review): newer scikit-learn releases renamed `sparse=` to `sparse_output=` - confirm against the installed version
onehot_encoder = ColumnTransformer([("onehot", OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features)], remainder='passthrough')
data_encoded = onehot_encoder.fit_transform(data)

# Rebuild a DataFrame: one-hot columns first, then the passthrough columns in their original order
encoded_feature_names = onehot_encoder.named_transformers_['onehot'].get_feature_names_out(categorical_features)
data_encoded_df = pd.DataFrame(data_encoded, columns=list(encoded_feature_names) + list(data.columns.drop(categorical_features)))
# NOTE(review): if text columns are passed through, the columns here are presumably all object dtype - verify with .dtypes

# Turn zeros into NaN so the ratio features below never divide by zero.
# Assign the result back instead of using inplace=True on a column selection,
# which is not guaranteed to update the parent frame.
data_encoded_df['LANDAREA'] = data_encoded_df['LANDAREA'].replace(0, np.nan)
data_encoded_df['ROOMS'] = data_encoded_df['ROOMS'].replace(0, np.nan)

# Create interaction features
data_encoded_df['Rooms_Bathrooms'] = data_encoded_df['ROOMS'] * data_encoded_df['BATHRM']
data_encoded_df['Building_Density'] = data_encoded_df['GBA'] / data_encoded_df['LANDAREA']
data_encoded_df['Bedroom_Room_Ratio'] = data_encoded_df['BEDRM'] / data_encoded_df['ROOMS']

# Ratios whose numerator or denominator was missing become 0
data_encoded_df['Building_Density'] = data_encoded_df['Building_Density'].fillna(0)
data_encoded_df['Bedroom_Room_Ratio'] = data_encoded_df['Bedroom_Room_Ratio'].fillna(0)

# Convert SALEDATE to datetime format and derive date-based features
# (ages are measured against the fixed reference year 2023)
data_encoded_df['SALEDATE'] = pd.to_datetime(data_encoded_df['SALEDATE'])
data_encoded_df['Property_Age'] = 2023 - data_encoded_df['AYB']
data_encoded_df['Years_Since_Remodel'] = 2023 - data_encoded_df['YR_RMDL']
data_encoded_df['Years_Between_Built_and_Remodel'] = data_encoded_df['YR_RMDL'] - data_encoded_df['AYB']
data_encoded_df['Sale_Year'] = data_encoded_df['SALEDATE'].dt.year
data_encoded_df['Sale_Month'] = data_encoded_df['SALEDATE'].dt.month

# Define features for clustering; only rows complete in all of them can be clustered
features_for_clustering = ['ROOMS', 'BATHRM', 'LANDAREA', 'GBA', 'PRICE']
data_cluster = data_encoded_df[features_for_clustering].dropna()

# Normalize features for clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_cluster)

# Apply K-Means clustering
n_clusters = 5  # Set an appropriate number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
data_cluster['Cluster'] = kmeans.fit_predict(data_scaled)

# Attach the labels by index alignment: rows that were excluded from clustering get NaN.
# This replaces an index/column merge and keeps data_encoded_df's own index intact.
data_encoded_df['Cluster'] = data_cluster['Cluster']

# Keep only rows that received a cluster label for the per-cluster oversampling below
data_with_clusters = data_encoded_df.dropna(subset=['Cluster'])

# Randomly oversample (with replacement) within each cluster
augmented_parts = []
for cluster in data_with_clusters['Cluster'].unique():
    # Select data for the current cluster
    cluster_data = data_with_clusters[data_with_clusters['Cluster'] == cluster]

    # Draw twice as many rows as the cluster originally holds
    target_size = len(cluster_data) * 2

    # Collect the resampled rows; concatenating once after the loop avoids re-copying on every iteration
    augmented_parts.append(resample(cluster_data, replace=True, n_samples=target_size, random_state=42))

augmented_data = pd.concat(augmented_parts) if augmented_parts else pd.DataFrame()

# Combine original data with augmented data
# NOTE(review): the resampled rows are copies of original rows, so a later random
# train/test split can place the same record in both sets - confirm this is intended
final_data = pd.concat([data_encoded_df, augmented_data], ignore_index=True)

# Check final dataset shape and display a sample
print("Original Data Shape (before encoding):", data.shape)
print("Data Encoded Shape:", data_encoded_df.shape)
print("Augmented Data Shape:", augmented_data.shape)
print("Final Data Shape:", final_data.shape)
final_data.head()
final_data.describe()

## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Work on a copy so the augmented dataset (final_data) is left untouched
data = final_data.copy()

# Step 1: Handle missing values
# Fill missing values in numeric columns with each column's median
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Fill missing values in the non-numeric columns with each column's first mode
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Check that missing values have been handled
print("Missing values after processing:", data.isnull().sum().sum())

# Step 2: Standardize numeric features
# Standardize numeric features using StandardScaler
# NOTE(review): the scaler is fitted on all rows before the split below, and on every
# numeric column (PRICE included if it is numeric) - confirm this is intended
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Step 3: Detect and handle outliers
# Use Z-score method to detect outliers; replace values with NaN if |Z-score| > 3
z_scores = np.abs(stats.zscore(data[numeric_columns]))
data[numeric_columns] = np.where(z_scores > 3, np.nan, data[numeric_columns])
# Fill the NaN values resulting from outliers with the median
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Step 4: Split the dataset
# 'PRICE' is used as the target variable; everything else is a candidate feature
X = data.drop(columns=['PRICE'])
y = data['PRICE']

# Ensure that only numeric columns are included in X for model training
X_numeric = X.select_dtypes(include=[np.number])

# Split the data into training and test sets (80% training, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

# Check the shape of training and test sets
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


## 處理有問題的數據

In [None]:
import pandas as pd

# Check the data type of each feature
print("Data Types for Each Feature:\n", data.dtypes)

# Compute descriptive statistics for each numeric feature (including Q1, Q2 (median), Q3)
numeric_summary = data.describe(percentiles=[0.25, 0.5, 0.75]).T  # transposed for readability
numeric_summary['IQR'] = numeric_summary['75%'] - numeric_summary['25%']  # interquartile range

# Show the quartile information for each feature
print("\nNumeric Features Summary (including Q1, Q2, Q3, IQR):\n", numeric_summary)

# Check the percentage of extreme values (outliers):
# the share of values whose |Z-score| exceeds the threshold
from scipy import stats
outliers_percentage = {}

for col in data.select_dtypes(include=[np.number]).columns:
    z_scores = stats.zscore(data[col].dropna())  # compute Z-scores with missing values excluded
    outliers_percentage[col] = (np.abs(z_scores) > 3).mean() * 100  # percentage of values with |Z-score| > 3

outliers_percentage_df = pd.DataFrame.from_dict(outliers_percentage, orient='index', columns=['Outliers (%)'])
print("\nPercentage of Outliers in Each Numeric Feature:\n", outliers_percentage_df)


In [None]:
# Check the unique values of the one-hot encoded features
# NOTE(review): this is a substring test, so any column whose name merely contains
# one of these prefixes is listed too
one_hot_columns = [col for col in data.columns if 'HEAT_' in col or 'STYLE_' in col or 'STRUCT_' in col]  # replace with the prefixes of the features you one-hot encoded
for col in one_hot_columns:
    print(f"{col} unique values:", data[col].unique())


In [19]:
# Negative durations are treated as invalid: blank them out, then impute each column
# with the median of its remaining values.
# The result is assigned back to the column rather than calling fillna(inplace=True)
# on a column selection, which is not guaranteed to update `data`.
for duration_col in ['Years_Since_Remodel', 'Years_Between_Built_and_Remodel']:
    valid_durations = data[duration_col].where(data[duration_col] >= 0)  # negatives (and NaN) -> NaN
    data[duration_col] = valid_durations.fillna(valid_durations.median())


In [None]:
# Show which distinct values Sale_Month and Cluster currently hold
print("Unique values in Sale_Month:", data['Sale_Month'].unique())
print("Unique values in Cluster:", data['Cluster'].unique())


In [None]:
from sklearn.preprocessing import StandardScaler

# Restore the original data
data = final_data.copy()  # reprocess starting from the unstandardized data

# Fill missing values: medians for numeric columns, first mode for the others
numeric_columns = data.select_dtypes(include=[np.number]).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())
categorical_columns = data.select_dtypes(exclude=[np.number]).columns
data[categorical_columns] = data[categorical_columns].fillna(data[categorical_columns].mode().iloc[0])

# Features that should not be standardized (e.g. Sale_Month and Cluster)
exclude_columns = ['Sale_Month', 'Cluster']
numeric_columns_for_scaling = [col for col in numeric_columns if col not in exclude_columns]

# Standardize the numeric features that do need scaling
# NOTE(review): PRICE is not excluded, so it is scaled too if it is numeric - confirm intended
scaler = StandardScaler()
data[numeric_columns_for_scaling] = scaler.fit_transform(data[numeric_columns_for_scaling])

# Confirm that the Sale_Month and Cluster columns kept their original values
print("Unique values in Sale_Month after processing:", data['Sale_Month'].unique())
print("Unique values in Cluster after processing:", data['Cluster'].unique())


In [None]:
# Convert the Cluster column to integer type
data['Cluster'] = data['Cluster'].astype(int)

# Check the values after conversion
print("Unique values in Cluster after conversion:", data['Cluster'].unique())

## 去除極端值

In [None]:
def remove_outliers_iqr(df, columns, iqr_multiplier=1.5):
    """Return a copy of ``df`` without the rows that hold IQR outliers.

    For every column in ``columns`` a value counts as an outlier when it lies
    below ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``.
    Outliers are replaced by NaN and then every row containing a NaN in any
    column (including NaNs that were already present) is dropped.

    The input frame is not modified: the work happens on a copy, so the
    function can be called repeatedly with different settings on the same data.

    Args:
        df: DataFrame to clean.
        columns: Iterable of column names to check.
        iqr_multiplier: Width of the accepted band in units of the IQR.

    Returns:
        A new DataFrame containing only the rows without NaN after masking.
    """
    cleaned = df.copy()  # never write into the caller's frame
    for column in columns:
        q1 = cleaned[column].quantile(0.25)
        q3 = cleaned[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr_multiplier * iqr
        upper_bound = q3 + iqr_multiplier * iqr
        is_outlier = (cleaned[column] < lower_bound) | (cleaned[column] > upper_bound)
        cleaned[column] = np.where(is_outlier, np.nan, cleaned[column])
    return cleaned.dropna()

# Remove extreme values with the IQR rule
# NOTE(review): the original comment suggested a wider IQR bound (e.g. 2 or 2.5), but 1.5 is what is passed
data_cleaned = remove_outliers_iqr(data, numeric_columns, iqr_multiplier=1.5)
print("Data shape after adjusting IQR threshold:", data_cleaned.shape)


In [None]:
from scipy import stats
import numpy as np

def remove_outliers_zscore(df, columns, z_threshold=3):
    """Return a copy of ``df`` without the rows that hold Z-score outliers.

    For every column in ``columns`` the Z-scores are computed on the column
    with its missing values temporarily filled by the median, so the score
    array keeps the column's length. Values whose absolute Z-score exceeds
    ``z_threshold`` are replaced by NaN, and finally every row containing a
    NaN in any column (including NaNs that were already present) is dropped.

    The input frame is not modified: the work happens on a copy.

    Args:
        df: DataFrame to clean.
        columns: Iterable of column names to check.
        z_threshold: Absolute Z-score above which a value is an outlier.

    Returns:
        A new DataFrame containing only the rows without NaN after masking.
    """
    cleaned = df.copy()  # never write into the caller's frame
    for column in columns:
        # Fill gaps only for the score computation; the column itself keeps its NaNs
        filled = cleaned[column].fillna(cleaned[column].median())
        z_scores = np.abs(stats.zscore(filled))
        cleaned[column] = np.where(z_scores > z_threshold, np.nan, cleaned[column])
    return cleaned.dropna()

# Remove outliers with the Z-score method
data_cleaned = remove_outliers_zscore(data, numeric_columns, z_threshold=3)
print("Data shape after removing outliers with Z-score:", data_cleaned.shape)


In [None]:
# Apply stricter outlier thresholds to selected features:
# IQR filtering first, then Z-score filtering with a 2.5 threshold on the result
important_columns = ['PRICE', 'ROOMS', 'GBA']  # replace with the features you consider important
data_important_cleaned = remove_outliers_iqr(data, important_columns, iqr_multiplier=1.5)
data_important_cleaned = remove_outliers_zscore(data_important_cleaned, important_columns, z_threshold=2.5)
print("Data shape after stricter outlier removal for key features:", data_important_cleaned.shape)


In [None]:
def quantile_clipping(df, columns, lower_quantile=0.025, upper_quantile=0.975):
    """Return a copy of ``df`` with the given columns clipped to a quantile band.

    For every column in ``columns`` the values below the ``lower_quantile``
    quantile are raised to it and the values above the ``upper_quantile``
    quantile are lowered to it. No rows are removed, so the shape is unchanged.

    The input frame is not modified: the work happens on a copy.

    Args:
        df: DataFrame to clip.
        columns: Iterable of column names to clip.
        lower_quantile: Quantile (0-1) used as the lower bound.
        upper_quantile: Quantile (0-1) used as the upper bound.

    Returns:
        A new DataFrame with the selected columns clipped.
    """
    clipped = df.copy()  # never write into the caller's frame
    for column in columns:
        lower_bound = clipped[column].quantile(lower_quantile)
        upper_bound = clipped[column].quantile(upper_quantile)
        clipped[column] = np.clip(clipped[column], lower_bound, upper_bound)
    return clipped

# Limit extreme values with quantile clipping (values are capped, rows are kept)
data_clipped = quantile_clipping(data, numeric_columns)
print("Data shape after quantile clipping:", data_clipped.shape)


## 將數據分割為訓練集和測試集

In [None]:
# Check that data actually holds rows
print("Data shape:", data.shape)

# The target column is required for everything below. Stop here with a clear message
# instead of printing a warning and then failing inside `drop` a few lines later.
if 'PRICE' not in data.columns:
    raise KeyError("'PRICE' column not found in data.")
print("PRICE column found with shape:", data['PRICE'].shape)

# Separate features and target, and report their shapes
X = data.drop(columns=['PRICE'])
y = data['PRICE']
print("X shape before splitting:", X.shape)
print("y shape before splitting:", y.shape)

# Split the dataset (only once X and y are confirmed to contain data)
if X.shape[0] > 0 and y.shape[0] > 0:
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Training set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)
else:
    print("X or y has no data, please check your preprocessing steps.")


In [None]:
# Convert every feature column to a numeric type; values that cannot be converted become NaN.
# Converting whole frames returns new objects, so the frames produced by the split
# are never written to column by column.
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Report which columns contain NaN after the conversion
nan_counts_train = X_train.isnull().sum()
nan_counts_test = X_test.isnull().sum()

print("Number of NaNs in each column of X_train after conversion:\n", nan_counts_train[nan_counts_train > 0])
print("Number of NaNs in each column of X_test after conversion:\n", nan_counts_test[nan_counts_test > 0])

# Fill NaNs with medians learned from the training set only, and reuse them for the
# test set so no test-set statistics enter the preprocessing
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [112]:
# Drop columns that contain many NaN values (if they are not needed)
columns_to_drop = ['SSL', 'HEAT_D', 'AC', 'STYLE_D', 'STRUCT_D', 'GRADE_D', 'CNDTN_D', 'EXTWALL_D', 'ROOF_D', 
                   'INTWALL_D', 'GIS_LAST_MOD_DTTM', 'Sale_Year', 'Cluster',  'QUALIFIED']  # adjust the list as needed
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)


In [113]:
# Fill NaN values with the training-set medians; the same medians are applied to the
# test set so it is imputed with statistics learned from training data only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
# Confirm that no NaN values remain (each printed number is the total over the whole frame)
print("Number of NaNs in each column of X_train after filling:\n", X_train.isnull().sum().sum())
print("Number of NaNs in each column of X_test after filling:\n", X_test.isnull().sum().sum())


In [None]:
# Show the remaining NaN count per column (the 20 columns with the most NaNs)
print("Remaining NaN values in each column of X_train:\n", X_train.isnull().sum().sort_values(ascending=False).head(20))
print("Remaining NaN values in each column of X_test:\n", X_test.isnull().sum().sort_values(ascending=False).head(20))


In [None]:
import numpy as np

# Report whether the feature sets contain NaN or infinite values
print("Are there any NaN values in X_train?", np.isnan(X_train).any())
print("Are there any NaN values in X_test?", np.isnan(X_test).any())
print("Are there any Infinity values in X_train?", np.isinf(X_train).any())
print("Are there any Infinity values in X_test?", np.isinf(X_test).any())


In [118]:
# Replace every Infinity value with NaN
# NOTE(review): np.where presumably returns plain arrays here, so column labels are no longer carried - verify
X_train = np.where(np.isinf(X_train), np.nan, X_train)
X_test = np.where(np.isinf(X_test), np.nan, X_test)

# Replace NaN values with the median
from sklearn.impute import SimpleImputer

# The imputer is fitted on the training set and then applied unchanged to the test set
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


In [None]:
# Print the overall value range of both feature sets, ignoring NaNs
print("Maximum value in X_train:", np.nanmax(X_train))
print("Minimum value in X_train:", np.nanmin(X_train))
print("Maximum value in X_test:", np.nanmax(X_test))
print("Minimum value in X_test:", np.nanmin(X_test))


In [120]:
# Restrict the values to a reasonable range (every value is clipped to [-1e6, 1e6])
X_train = np.clip(X_train, -1e6, 1e6)
X_test = np.clip(X_test, -1e6, 1e6)


In [123]:
import numpy as np

# Check for NaN and Infinity column by column
# (positional [:, column] indexing: X_train / X_test are expected to be 2-D arrays here)
for column in range(X_train.shape[1]):
    if np.isnan(X_train[:, column]).any():
        print(f"NaN found in column {column} of X_train")
    if np.isinf(X_train[:, column]).any():
        print(f"Infinity found in column {column} of X_train")

for column in range(X_test.shape[1]):
    if np.isnan(X_test[:, column]).any():
        print(f"NaN found in column {column} of X_test")
    if np.isinf(X_test[:, column]).any():
        print(f"Infinity found in column {column} of X_test")


In [125]:
# Final safety net: replace any remaining NaN or +/-Infinity with 0.0
X_train = np.nan_to_num(X_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)


In [None]:
# Print the overall value range of both feature sets after the cleanup
print("Maximum value in X_train:", np.max(X_train))
print("Minimum value in X_train:", np.min(X_train))
print("Maximum value in X_test:", np.max(X_test))
print("Minimum value in X_test:", np.min(X_test))


In [None]:
import pandas as pd

# Convert X_train and X_test back to DataFrame format
# NOTE(review): no column names are passed, so if the inputs are plain arrays the
# columns presumably get default integer labels - verify
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Check the data types
print("Data types in X_train:\n", X_train.dtypes)
print("Data types in X_test:\n", X_test.dtypes)


In [129]:
# Check whether any column contains string (non-numeric) values
for col in X_train.columns:
    if X_train[col].apply(lambda x: isinstance(x, str)).any():
        print(f"Column {col} contains non-numeric values in X_train")

for col in X_test.columns:
    if X_test[col].apply(lambda x: isinstance(x, str)).any():
        print(f"Column {col} contains non-numeric values in X_test")


In [130]:
# Force conversion to numeric; values that cannot be converted become NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill NaN values once more, using the training-set medians for both splits so the
# test set is imputed with statistics learned from training data only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Performance:")
print("Mean Squared Error (MSE):", mse_lr)
print("Root Mean Squared Error (RMSE):", rmse_lr)
print("R-squared (R²):", r2_lr)

# Train the random forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Performance:")
print("Mean Squared Error (MSE):", mse_rf)
print("Root Mean Squared Error (RMSE):", rmse_rf)
print("R-squared (R²):", r2_rf)


In [74]:
import numpy as np

# Step 1: replace Infinity values with NaN (this form applies to numpy.ndarray inputs)
X_train[np.isinf(X_train)] = np.nan
X_test[np.isinf(X_test)] = np.nan

# Step 2: fill NaN values with per-column medians.
# The arrays are numpy.ndarray, so the medians are computed explicitly. They are taken
# from the training set only and reused for the test set, so the test set is imputed
# with statistics learned from training data.
col_medians_train = np.nanmedian(X_train, axis=0)

# (row, column) positions of every NaN; the column index selects the median to insert
inds_train = np.where(np.isnan(X_train))
inds_test = np.where(np.isnan(X_test))

X_train[inds_train] = np.take(col_medians_train, inds_train[1])
X_test[inds_test] = np.take(col_medians_train, inds_test[1])


## 再度確認有沒有空值

In [None]:
import numpy as np

# Check for NaN and Infinity values
print("Are there any NaN values in X_train?", np.isnan(X_train).any())
print("Are there any NaN values in X_test?", np.isnan(X_test).any())
print("Are there any Infinity values in X_train?", np.isinf(X_train).any())
print("Are there any Infinity values in X_test?", np.isinf(X_test).any())


In [None]:
# Check the maximum and minimum values in X_train and X_test (NaNs ignored)
print("Maximum value in X_train:", np.nanmax(X_train))
print("Minimum value in X_train:", np.nanmin(X_train))
print("Maximum value in X_test:", np.nanmax(X_test))
print("Minimum value in X_test:", np.nanmin(X_test))

In [None]:
# Check the data type of every feature
print("Data Types for Each Feature in X:\n", X.dtypes)

# Find the non-numeric columns
non_numeric_columns = X.select_dtypes(exclude=[np.number]).columns.tolist()
print("Non-numeric columns:", non_numeric_columns)


In [81]:
# Drop the columns that cannot be converted to numbers (e.g. unneeded string columns)
X = X.drop(columns=non_numeric_columns)


In [82]:
# Force every column to a numeric type; values that cannot be converted become NaN
X = X.apply(pd.to_numeric, errors='coerce')

# Fill any NaN values again, using each column's median
X = X.fillna(X.median())


In [None]:
# Confirm the data type of every column before standardization
print("Data types in X_train before standardization:\n", pd.DataFrame(X_train).dtypes)
print("Data types in X_test before standardization:\n", pd.DataFrame(X_test).dtypes)


In [94]:
import pandas as pd

# Re-attach column names, taking them from X
# NOTE(review): assumes X_train / X_test have exactly X's columns in the same order;
# an earlier cell drops `columns_to_drop` from the splits only - verify they still line up
X_train = pd.DataFrame(X_train, columns= X.columns)
X_test = pd.DataFrame(X_test, columns= X.columns)


In [None]:
# Force every column to a numeric type; values that cannot be converted become NaN
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Check whether NaN values are present
print("Number of NaNs in X_train after conversion:\n", X_train.isnull().sum().sum())
print("Number of NaNs in X_test after conversion:\n", X_test.isnull().sum().sum())

# Fill NaN values with the training-set medians; the same medians are applied to the
# test set so it is imputed with statistics learned from training data only
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


In [None]:
# Confirm the data type of every column
print("Data types in X_train after conversion:\n", X_train.dtypes)
print("Data types in X_test after conversion:\n", X_test.dtypes)


In [97]:
# Count str/bytes values per training column and report every column that has any
for col in X_train.columns:
    non_numeric_count = X_train[col].apply(lambda x: isinstance(x, (str, bytes))).sum()
    if non_numeric_count > 0:
        print(f"Column {col} has {non_numeric_count} non-numeric values")


In [98]:
# Cast both feature sets to float64
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')


In [None]:
# Print the total number of NaN values in each feature set
print("Number of NaNs in X_train after conversion:\n", X_train.isnull().sum().sum())
print("Number of NaNs in X_test after conversion:\n", X_test.isnull().sum().sum())


In [100]:
from sklearn.preprocessing import StandardScaler

# Standardize the data: the scaler is fitted on the training set and then applied to the test set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Initialize the linear regression model
lr_model = LinearRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate model performance
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Performance:")
print("Mean Squared Error (MSE):", mse_lr)
print("Root Mean Squared Error (RMSE):", rmse_lr)
print("R-squared (R²):", r2_lr)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the random forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Compute the evaluation metrics for the random forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest Performance:")
print("Mean Squared Error (MSE):", mse_rf)
print("R-squared (R²):", r2_rf)
