In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Location of the preprocessed CSV and the columns that are excluded from modelling.
file_path = 'data_proccessed/chungcu_preprocess.csv'
unused_columns = ['Link', 'Giá tổng', 'Hướng cửa chính', 'Hướng ban công']

# Read the file and discard the excluded columns in a single chained step.
df = pd.read_csv(file_path).drop(columns=unused_columns)

# 'Giá/m²' is the regression target; every remaining column is a feature.
y = df['Giá/m²']
X = df.drop(columns=['Giá/m²'])

In [17]:
# Display the target Series to sanity-check its values, length and dtype.
y

0        50.46
1        33.00
2        42.06
3        33.82
4        41.67
         ...  
2258    104.49
2259     72.30
2260     56.00
2261     85.53
2262     88.66
Name: Giá/m², Length: 2263, dtype: float64

In [5]:
# Partition the feature columns by dtype: 64-bit numeric columns versus
# object (string) columns.
# NOTE(review): columns of any other dtype end up in neither list and are
# presumably dropped by ColumnTransformer's default remainder - confirm.
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numerical branch: fill missing values with the column mean.
numerical_transformer = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', categorical_imputer),
    ('onehot', categorical_encoder),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest regressor with a fixed seed, chained after the preprocessing.
model = RandomForestRegressor(random_state=0)
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit on the training rows, then predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Report the mean squared error on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 279.9711359708676


In [22]:
# Root mean squared error on the held-out rows, in the same units as the target.
# Computed as sqrt(MSE) instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
rmse



16.732338030618063

In [23]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

9.101031383029353

In [25]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.819627660755881

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting xgboost
  Using cached xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 1.4 MB/s eta 0:01:32
   ---------------------------------------- 0.0/124.9 MB 1.4 MB/s eta 0:01:32
   ---------------------------------------- 0.1/124.9 MB 939.4 kB/s eta 0:02:13
   ---------------------------------------- 0.1/124.9 MB 1.1 MB/s eta 0:01:58
   ---------------------------------------- 0.2/124.9 MB 985.7 kB/s eta 0:02:07
   ---------------------------------------- 0.2/124.9 MB 1.0 MB/s eta 0:02:01
   ---------------------------------------- 0.3/124.9 MB 1.0 MB/s eta 0:02:02
   ---------------------------------------- 0.3/124.9 MB 1.0 MB/s eta 0:02:02
   ---------------------------------------- 0.3/124.9 MB 967.8 kB/s eta 0:02:09
   ---------------------------------------- 0.4/124.9 MB 945.2 kB/s eta 

In [23]:
import xgboost as xgb

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Load the preprocessed dataset.
df = pd.read_csv('data_proccessed/chungcu_preprocess.csv')

# Drop the columns that are not used as features.
df = df.drop(columns=['Link', 'Giá tổng', 'Hướng cửa chính', 'Hướng ban công'])

# One-hot encode every categorical (object-dtype) column.
df = pd.get_dummies(df, columns=df.select_dtypes(include=['object']).columns)

# Separate the features (X) from the target (y).
X = df.drop(columns=['Giá/m²'])
y = df['Giá/m²']


def fit_xgb_and_score(X_train, X_test, y_train, y_test):
    """Train an XGBoost regressor and score it on a held-out split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and evaluation feature matrices.
    y_train, y_test : array-like
        Training and evaluation target values.

    Returns
    -------
    tuple
        ``(model, y_pred, rmse)``: the fitted ``XGBRegressor``, its
        predictions for ``X_test`` and the root mean squared error of those
        predictions against ``y_test``.
    """
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # sqrt(MSE) rather than `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return model, y_pred, rmse


# Experiment 1: 80/20 split with random_state=42 and standardized features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model, y_pred, rmse = fit_xgb_and_score(X_train, X_test, y_train, y_test)
print("RMSE:", rmse)

# Experiment 2: 80/20 split with random_state=0 and unscaled features.
# The split, model, predictions and RMSE from this run overwrite those of
# experiment 1, so they are what remains in the namespace after this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model, y_pred, rmse = fit_xgb_and_score(X_train, X_test, y_train, y_test)
print("RMSE:", rmse)



RMSE: 21.82881292657789
RMSE: 16.592714531421834




In [24]:
from sklearn.metrics import mean_absolute_error, r2_score

# Compute all three error metrics for the current predictions first ...
xgb_mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# ... then report them together.
print('MSE:', xgb_mse)
print('MAE:', mae)
print('R-square:', r2)

MSE: 275.3181755212573
MAE: 9.861935642206525
R-square: 0.8226253460629618


In [22]:
# Display the held-out target values; the index holds the original row labels.
y_test

1188    234.78
1294     33.67
963      43.75
98       54.44
1364     40.86
         ...  
475      37.94
1147     46.81
1535     67.96
1129     45.71
2230     77.94
Name: Giá/m², Length: 453, dtype: float64

In [20]:
# Convert the held-out targets to a NumPy array so they can be indexed by
# position: y_test keeps the shuffled original row labels, so y_test[22]
# would be a label lookup rather than the row at position 22.
test = y_test.to_numpy()
test[22]

71.77

In [21]:
# Predicted value at position 22, for a spot check against the actual target.
y_pred[22]

59.691727