In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                      GridSearchCV, StratifiedKFold)
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, auc)
from sklearn.model_selection import learning_curve
import warnings
# Notebook-wide display configuration.
# All warnings are silenced for the rest of the session; remove this line while
# debugging, since it also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Seaborn theme: white grid background with the "husl" colour cycle.
sns.set_style("whitegrid")
sns.set_palette("husl")

# Matplotlib defaults applied to every figure created afterwards.
plt.rcParams.update({
    'figure.figsize': (18, 14),
    'font.size': 10,
})

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the cleaned HCM house-price table. Only the read itself is guarded, so a
# missing file stops the cell right here instead of surfacing further down as
# an unrelated NameError (or silently reusing a stale `price` object that is
# still alive in the kernel from an earlier run).
try:
    df_raw = pd.read_csv('../data/cleaned/hcmhouseprice_filtered.csv')
except FileNotFoundError:
    print("CSV not found — kiểm tra lại tên file và đường dẫn!")
    raise

# ===== EDA: mean price per address group (the original note calls these districts) =====
# NOTE(review): `location_value` is the mean of the *target* column per address,
# computed over every row. Any feature derived from it (see `area_location`
# below) therefore carries target information, including from rows that are
# later held out for testing. Consider computing this encoding on the training
# split only — confirm with the modelling plan.
location_value = df_raw.groupby('address')['price'].mean()
df_raw['location_value'] = df_raw['address'].map(location_value)

# ===== Create new feature =====
# Interaction term: floor area weighted by the address-level mean price.
df_raw['area_location'] = df_raw['area'] * df_raw['location_value']

# Integer-encode the address strings.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; the
# integer codes also impose an arbitrary ordering on addresses when used as a
# numeric input to a linear model. Verify this is intended.
le = LabelEncoder()
df_raw['address_code'] = le.fit_transform(df_raw['address'])

# ===== Select features and target =====
feature_cols = ['area', 'bedrooms', 'bathrooms', 'area_location', 'address_code']
X = df_raw[feature_cols].values
y = df_raw['price'].values


# ===== Build an Iris-like container =====
class HcmHousePriceData:
    """Container exposing the dataset through `data`, `target`,
    `feature_names` and `target_name` attributes.

    The attributes are class-level and bound to the `X`, `y` and
    `feature_cols` objects that exist when this cell runs.
    """
    data = X                      # feature matrix, one column per entry of feature_cols
    target = y                    # values of the 'price' column
    feature_names = feature_cols  # column names matching `data`
    target_name = "price"


price = HcmHousePriceData()

# ===== Build the preview DataFrame =====
# Features first, then the target appended as the last column.
df = pd.DataFrame(price.data, columns=price.feature_names)
df["price"] = price.target


Loaded from CSV file
Dataset loaded successfully!
  Total samples: 1433
  Total features: 5
  Target column: price

Dataset Preview:
   area  bedrooms  bathrooms  area_location  address_code  price
0  64.0       1.0        2.0     257.895168          14.0    3.2
1  79.0       2.0        2.0     898.885122           5.0   11.8
2  77.0       2.0        2.0     876.128537           5.0   13.5
3  66.0       2.0        2.0     750.967317           5.0   14.8
4  63.5       2.0        2.0     670.906105           7.0    7.0
5  74.2       2.0        2.0     783.956426           7.0   10.5
6  80.0       2.0        2.0     845.236038           7.0    8.0
7  80.0       2.0        2.0     845.236038           7.0    8.4
8  72.0       2.0        2.0     760.712435           7.0    8.8
9  79.0       2.0        2.0     834.670588           7.0   11.0

Dataset Information:
  Shape: (1433, 6)
  Dtypes:
area             float64
bedrooms         float64
bathrooms        float64
area_location    float64
a

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ===== Feature selection =====
# NOTE(review): `area_location` was built upstream from per-address mean prices
# over all rows, so it still carries target information into both splits; the
# change below only removes the leakage introduced by the scaler.
feature_cols = ['area', 'bedrooms', 'bathrooms', 'area_location', 'address_code']
X = df[feature_cols].values
y = df['price'].values

# ===== Train-Test Split =====
# Split the raw matrix BEFORE computing any scaling statistics so that the
# held-out rows cannot influence preprocessing. The split indices depend only
# on the number of rows and random_state, so the same rows land in each set as
# when the already-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# ===== Feature scaling =====
# Learn mean/std from the training rows only, then apply that same transform
# to the test rows.
scaler_standard = StandardScaler()
X_train = scaler_standard.fit_transform(X_train_raw)
X_test = scaler_standard.transform(X_test_raw)

# Full matrix under the training-fitted scaler; kept so that any later cell
# that refers to `X_scaled` keeps working.
X_scaled = scaler_standard.transform(X)

Missing values in dataset: 0

✓ Features scaled using StandardScaler
  • Feature means: [-0.  0.  0. -0. -0.]
  • Feature stds: [1. 1. 0. 1. 1.]

Train-Test Split:
  • Training set: 1003 samples (70.0%)
  • Testing set: 430 samples (30.0%)

Training set price range: 0.017 - 21.5
Testing set price range: 0.946 - 20.5


In [4]:
# MODEL TRAINING & HYPERPARAMETER TUNING (Linear Regression)


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score


# Define CV strategy: 5 shuffled folds with a fixed seed for reproducibility.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Define parameter grid (2 x 2 = 4 candidate configurations).
lr_params = {
    'fit_intercept': [True, False],
    'positive': [True, False]
}

# Perform GridSearchCV: every candidate is scored by cross-validated R² on the
# training data, using all available cores.
lr_grid = GridSearchCV(
    LinearRegression(),
    lr_params,
    cv=cv,
    scoring='r2',
    n_jobs=-1
)

# Fit model
lr_grid.fit(X_train, y_train)

# Extract best model.
# GridSearchCV is left at its default refit=True, so `best_estimator_` has
# already been refitted on all of (X_train, y_train) with the winning
# parameters; fitting it again here would only repeat that work.
lr_best = lr_grid.best_estimator_

# Display results
print(f"  Best parameters: {lr_grid.best_params_}")
print(f"  Best CV R² Score: {lr_grid.best_score_:.4f}")
print("  Linear Regression model trained successfully")

  Best parameters: {'fit_intercept': True, 'positive': False}
  Best CV R² Score: 0.6106
  Linear Regression model trained successfully


In [5]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def _regression_scores(y_true, y_pred):
    """Return (R², MAE, MSE, RMSE) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
    )


def _print_scores(heading, r2, mae, mse, rmse):
    """Print one performance section under the given heading line."""
    print(heading)
    print(f"  R² Score:   {r2:.4f}")
    print(f"  MAE:        {mae:.2f}")
    print(f"  MSE:        {mse:.2f}")
    print(f"  RMSE:       {rmse:.2f}")


# Predictions of the tuned model on both splits.
y_train_pred = lr_best.predict(X_train)
y_test_pred = lr_best.predict(X_test)

# Score each split once, then unpack into the individual metric names.
train_scores = _regression_scores(y_train, y_train_pred)
test_scores = _regression_scores(y_test, y_test_pred)
train_r2, train_mae, train_mse, train_rmse = train_scores
test_r2, test_mae, test_mse, test_rmse = test_scores

# Text report: training block first, then the testing block after a blank line.
_print_scores("Training Performance:", *train_scores)
_print_scores("\nTesting Performance:", *test_scores)

# Summary table: one row per split, one column per metric.
results = pd.DataFrame({
    'Set': ['Train', 'Test'],
    'R² Score': [train_r2, test_r2],
    'MAE': [train_mae, test_mae],
    'MSE': [train_mse, test_mse],
    'RMSE': [train_rmse, test_rmse],
})
print("\nPerformance Summary Table:")
print(results.round(4))

Training Performance:
  R² Score:   0.6130
  MAE:        1.74
  MSE:        6.94
  RMSE:       2.64

Testing Performance:
  R² Score:   0.5953
  MAE:        1.86
  MSE:        8.00
  RMSE:       2.83

Performance Summary Table:
     Set  R² Score     MAE     MSE    RMSE
0  Train    0.6130  1.7361  6.9448  2.6353
1   Test    0.5953  1.8602  7.9966  2.8278
