In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

# Load the training/test feature tables and target tables from the processed-data folder.
X_train, X_test = (
    pd.read_csv("../data/processed/X_train.csv"),
    pd.read_csv("../data/processed/X_test.csv"),
)
y_train, y_test = (
    pd.read_csv("../data/processed/y_train.csv"),
    pd.read_csv("../data/processed/y_test.csv"),
)

# Report the shape of every table on one line as a quick sanity check.
print(*(
    part
    for label, frame in (
        ("X_train shape:", X_train),
        ("X_test shape:", X_test),
        ("y_train shape:", y_train),
        ("y_test shape:", y_test),
    )
    for part in (label, frame.shape)
))

# Build and train the Random Forest model.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# y_train is read from CSV as a single-column DataFrame (shape (n_samples, 1)).
# Flatten it to a 1-D array so scikit-learn does not emit the
# "column-vector y was passed when a 1d array was expected" DataConversionWarning.
# NOTE: this assumes exactly one target column, as the printed shape shows.
rf_model.fit(X_train, y_train.to_numpy().ravel())
rf_predictions = rf_model.predict(X_test)
# Score the test-set predictions: mean squared error and R^2.
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

# Build and train the XGBoost model.
# random_state is the documented scikit-learn-API keyword for the random seed of
# XGBRegressor (the bare `seed` keyword is a legacy spelling); using it also keeps
# this call consistent with the RandomForestRegressor(random_state=42) call.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
# Score the test-set predictions: mean squared error and R^2.
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

# Compare the results of the two models: one "label value" line per metric.
for metric_label, metric_value in (
    ("Random Forest MSE:", rf_mse),
    ("Random Forest R^2:", rf_r2),
    ("XGBoost MSE:", xgb_mse),
    ("XGBoost R^2:", xgb_r2),
):
    print(metric_label, metric_value)

# Feature importances of both fitted models, most important feature first.
def _importance_table(model, feature_names):
    """Return model.feature_importances_ as a one-column ('importance') DataFrame
    indexed by feature_names and sorted in descending order of importance."""
    table = pd.DataFrame({'importance': model.feature_importances_}, index=feature_names)
    return table.sort_values(by='importance', ascending=False)


rf_importances = _importance_table(rf_model, X_train.columns)
xgb_importances = _importance_table(xgb_model, X_train.columns)

# Print each table under its own heading.
print("\nRandom Forest Feature Importances:")
print(rf_importances)
print("\nXGBoost Feature Importances:")
print(xgb_importances)


X_train shape: (79361, 67) X_test shape: (16229, 67) y_train shape: (79361, 1) y_test shape: (16229, 1)


  return fit_method(estimator, *args, **kwargs)


Random Forest MSE: 39.255515459979044
Random Forest R^2: 0.7215412102678904
XGBoost MSE: 44.98326031327906
XGBoost R^2: 0.6809114826727889

Random Forest Feature Importances:
                    importance
itemid                0.754571
ewm_daily_sales_7d    0.073753
ewm_daily_sales_3d    0.014183
shop_rating           0.008167
daily_sales_lag_7d    0.007646
...                        ...
shopid_175889782      0.000106
shopid_95753434       0.000063
shopid_175753395      0.000051
shopid_37251700       0.000033
shopid_233692311      0.000031

[67 rows x 1 columns]

XGBoost Feature Importances:
                    importance
ewm_daily_sales_7d    0.402871
itemid                0.207258
ewm_daily_sales_3d    0.130873
day_2                 0.013798
is_special_day        0.010788
...                        ...
rating_count          0.001309
shopid_37251700       0.000816
shopid_95753434       0.000108
shopid_175889782      0.000000
shopid_175753395      0.000000

[67 rows x 1 columns]


In [2]:
import pandas as pd
import numpy as np

# Relies on rf_importances and xgb_importances already being defined and holding
# the per-feature importances of the two models.

# Align the two importance tables on their feature index, scale both columns by a
# billion, and rename the columns to reflect the scaling.
combined_importances = (
    pd.merge(
        rf_importances,
        xgb_importances,
        left_index=True,
        right_index=True,
        suffixes=('_rf', '_xgb'),
    )
    .mul(1e9)
    .rename(columns={
        'importance_rf': 'importance_rf_scaled',
        'importance_xgb': 'importance_xgb_scaled',
    })
)

# Print the combined and scaled importances.
print(combined_importances)


                    importance_rf_scaled  importance_xgb_scaled
itemid                      7.545706e+08           2.072581e+08
ewm_daily_sales_7d          7.375280e+07           4.028711e+08
ewm_daily_sales_3d          1.418263e+07           1.308732e+08
shop_rating                 8.166742e+06           4.328912e+06
daily_sales_lag_7d          7.645679e+06           5.922115e+06
...                                  ...                    ...
shopid_175889782            1.064303e+05           0.000000e+00
shopid_95753434             6.291594e+04           1.084726e+05
shopid_175753395            5.068609e+04           0.000000e+00
shopid_37251700             3.328939e+04           8.156880e+05
shopid_233692311            3.051938e+04           1.335024e+06

[67 rows x 2 columns]


In [3]:
# Persist the combined importance table (feature names are written as the index column).
importances_csv_path = '../data/processed/feature_importances.csv'
combined_importances.to_csv(importances_csv_path)