### Import Libs

In [332]:
# Notebook-wide imports and global settings.
import numpy as np
# Print floats in fixed-point notation instead of scientific notation
np.set_printoptions(suppress=True)
import pandas as pd 
from pyproj import CRS
import constant as c
from shapely.wkt import loads
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# NOTE: this silences every warning for the whole notebook, deprecation notices included
import warnings; warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor  # or RandomForestClassifier
# kernel must be msd2
# NOTE(review): pandas, geopandas, constant and matplotlib.pyplot are imported a second time below; the repeats are harmless
import pandas as pd
import geopandas as gpd
# Wildcard import; `Point`, used further down, is exported by shapely.geometry
from shapely.geometry import *
import sys
from shapely import wkt
# Add the parent directory to the module search path before the `util` imports that follow
sys.path.append('../')
# Wildcard imports from project helpers; presumably the source of `coord_tranformation` — TODO confirm
from util.epsg_transform import *
from util.distance import *
import constant as c
import os
import matplotlib.pyplot as plt

In [333]:
# Grid-level table: one row per grid, with pop / od data appended as extra columns
df_grids = pd.read_csv(
    '../asset/preprocess/grid/df_grid_pop_od_drr_ebit.csv',
    index_col=0,
)
# Building-level table: one row per building, with feature data appended as extra columns
gdf_cbds = pd.read_csv(
    "../asset/preprocess/grid/gdf_cbds.csv",
    index_col=0,
)

### Load and preprocess bd tval (transaction value) data

In [334]:
# Load the transaction records and keep only rows whose pnu prefix is a CBD code.
df_tval = pd.read_csv("../asset/verification/df_bd_tval.csv", index_col=0)
df_tval['pnu'] = df_tval['pnu'].astype(str)
# The first five characters of pnu are matched against c.CBD_CDS.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the loaded frame (SettingWithCopyWarning).
df_tval = df_tval.loc[
    df_tval['pnu'].str[:5].isin(c.CBD_CDS),
    ['tyyyymm', 'totalprc', 'pnu', 'x', 'y'],  # transaction date, transaction price, pnu, coordinates
].copy()
# tyyyymm is an integer such as 202112; integer division by 100 leaves the year.
df_tval['tyear'] = df_tval['tyyyymm'] // 100
# Convert the x / y columns from EPSG:4326 to EPSG:5179 with the project helper
# (presumably returns two equal-length sequences — TODO confirm).
x_5179, y_5179 = coord_tranformation(list(df_tval.x), list(df_tval.y), "epsg:4326", "epsg:5179")
df_tval['x_5179'] = x_5179
df_tval['y_5179'] = y_5179
# Build the point geometries in one vectorised call rather than a row-wise
# apply; this also works when the filtered frame is empty.
df_tval = gpd.GeoDataFrame(
    df_tval,
    geometry=gpd.points_from_xy(df_tval['x_5179'], df_tval['y_5179']),
    crs='EPSG:5179',
)
# Rescale the price by 1e6 — presumably a unit conversion; TODO confirm the source unit
df_tval['totalprc'] = df_tval['totalprc'] / 1e6
df_tval.head(2)

Unnamed: 0,tyyyymm,totalprc,pnu,x,y,tyear,x_5179,y_5179,geometry
0,202112,112.706778,1114011800103950000,126.975447,37.555493,2021,953668.907172,1950812.0,POINT (953668.907 1950812.131)
1,202007,93.933417,1114011800108310000,126.974415,37.557838,2020,953579.196053,1951073.0,POINT (953579.196 1951072.894)


In [335]:
# Load the grid polygons and tag every transaction with the grid that contains it.
df_grid = pd.read_csv('../asset/preprocess/grid/df_grid.csv', index_col=0)
df_grid = df_grid[['idx', 'ADM_SECT_C', 'geometry']].rename(
    columns={'idx': 'grid_idx', 'geometry': 'geometry_grids'}
)
# Geometries are stored as WKT text in the CSV; parse them into shapely objects.
df_grid['geometry_grids'] = df_grid['geometry_grids'].apply(wkt.loads)
# The CRS is declared at construction, so a separate to_crs() call is unnecessary.
df_grid = gpd.GeoDataFrame(df_grid, crs='EPSG:5179', geometry='geometry_grids')
df_grid['grid_idx'] = df_grid['grid_idx'].astype(str)
# `predicate` is the current name of the keyword geopandas used to call `op`
# (deprecated in geopandas 0.10).
# Left join: transactions that fall inside no grid keep NaN in the grid columns ...
df_tval_grid = gpd.sjoin(df_tval, df_grid, predicate='within', how='left')
# ... and are dropped here, after which the grid id can be cast to int.
df_tval_grid = df_tval_grid.dropna(subset=['grid_idx'])
df_tval_grid['grid_idx'] = df_tval_grid['grid_idx'].astype(int)

### Experiment Settings (scbd)

In [377]:
# Experiment settings: CBD_NM is interpolated into the cbdi_{CBD_NM}_{YEAR}.csv
# file names read by the cells below, i.e. it selects which CBD's index files are used.
CBD_NM = "scbd"

### Prediction on Transaction Value

In [378]:
crs = 'EPSG:5179'  # coordinate reference system used for the grid geometries
# Join each year's transactions to that year's CBD-index table. The yearly
# results are collected in a list and concatenated once, rather than growing a
# DataFrame with pd.concat on every iteration.
yearly_frames = []
for YEAR in ['2020', '2021', '2022']:
    fpath = f"../asset/experiment/cbdindex/cbdi_{CBD_NM}_{YEAR}.csv"
    cbdi_df = pd.read_csv(fpath)
    # geometry_grids is stored as WKT text; parse it back into shapely objects
    cbdi_df['geometry_grids'] = cbdi_df['geometry_grids'].apply(loads)
    cbdi_df = gpd.GeoDataFrame(cbdi_df, geometry=cbdi_df['geometry_grids'], crs=crs)
    # Transactions of this year only (tyear is an int, YEAR a string)
    tval = df_tval_grid[df_tval_grid['tyear'] == int(YEAR)]
    concat_df = pd.merge(tval[['grid_idx', 'totalprc']], cbdi_df, on='grid_idx', suffixes=('', ''), how='left')
    # Transactions whose grid has no CBD-index row get NaN in 'cbdi'; drop them
    concat_df = concat_df.dropna(subset=['cbdi'])
    yearly_frames.append(concat_df)
tval_cbd = pd.concat(yearly_frames, ignore_index=True)
tval_cbd = gpd.GeoDataFrame(tval_cbd, geometry=tval_cbd['geometry_grids'], crs=crs)

In [379]:
# Min-max scale the transaction price column in place.
scaler = MinMaxScaler()
# The scaler expects 2-D input, so reshape the column to (n, 1)
totalprc = tval_cbd['totalprc'].to_numpy().reshape(-1, 1)
scaled_totalprc = scaler.fit_transform(totalprc)
# Write the scaled values back over the original column
tval_cbd['totalprc'] = scaled_totalprc


In [380]:
# Column groups defined in the project's constant module
SIM_CAL_COLS, STANDARDIZE_COLS = c.SIM_CAL_COLS, c.STANDARDIZE_COLS
PP_COLS, BS_COLS, BD_COLS = c.PP_COLS, c.BS_COLS, c.BD_COLS

# Regression features X1..X14, in order, grouped by column prefix
VAR_COLS = (
    ['pp_pop', 'pp_od', 'pp_drr']  # X1-X3
    + ['bs_ebit', 'bs_gas', 'bs_elct']  # X4-X6
    + [
        'bd_platarea',  # X7
        'bd_archarea',  # X8
        'bd_totarea',  # X9
        'bd_totflrcnt',  # X10
        'bd_elvtent',  # X11
        'bd_height',  # X12
        'bd_vintage',  # X13
        'bd_ilp',  # X14
    ]
)

# Kept as a one-element list, so selecting it yields a single-column DataFrame
TARGET_COL = ['totalprc']

In [381]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Baseline linear regression: predict TARGET_COL from VAR_COLS only (no 'cbdi').
# NOTE(review): the recorded output shows only 14 rows for 14 features, so the
# model fits its training rows exactly (R2 = 1.0); read the scores with caution.
X, y = tval_cbd[VAR_COLS], tval_cbd[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
# (the gbd cells use random_state=9999 instead)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(14, 14)
(14, 1)
MSE: 1.2040587292050575e-31
RMSE: 3.4699549409251087e-16
R2 Score: 1.0
Train MSE: 9.992950143263772e-32
Train R2 Score: 1.0
Train RMSE: 3.1611627834174834e-16
Test MSE: 0.018039715700114322
Test RMSE: 0.13431200877104893
Test R2 Score: 0.5527729093471243


In [382]:
# Same regression as the baseline, with 'cbdi' added as an extra feature.
# NOTE(review): the recorded output shows only 14 rows for 15 features, so the
# model fits its training rows exactly (R2 = 1.0); read the scores with caution.
X, y = tval_cbd[VAR_COLS + ['cbdi']], tval_cbd[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
# (the gbd cells use random_state=9999 instead)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(14, 15)
(14, 1)
MSE: 2.1166657920938716e-31
RMSE: 4.600723630141102e-16
R2 Score: 1.0
Train MSE: 7.239745745492019e-32
Train R2 Score: 1.0
Train RMSE: 2.690677562528074e-16
Test MSE: 0.015378545120084686
Test RMSE: 0.12401026215634207
Test R2 Score: 0.6187466528374466


### Experiment Settings (gbd)

In [387]:
# Experiment settings: CBD_NM is interpolated into the cbdi_{CBD_NM}_{YEAR}.csv
# file names read by the cells below, i.e. it selects which CBD's index files are used.
CBD_NM = "gbd"

### Prediction on Transaction Value

In [388]:
crs = 'EPSG:5179'  # coordinate reference system used for the grid geometries
# Join each year's transactions to that year's CBD-index table. The yearly
# results are collected in a list and concatenated once, rather than growing a
# DataFrame with pd.concat on every iteration.
yearly_frames = []
for YEAR in ['2020', '2021', '2022']:
    fpath = f"../asset/experiment/cbdindex/cbdi_{CBD_NM}_{YEAR}.csv"
    cbdi_df = pd.read_csv(fpath)
    # geometry_grids is stored as WKT text; parse it back into shapely objects
    cbdi_df['geometry_grids'] = cbdi_df['geometry_grids'].apply(loads)
    cbdi_df = gpd.GeoDataFrame(cbdi_df, geometry=cbdi_df['geometry_grids'], crs=crs)
    # Transactions of this year only (tyear is an int, YEAR a string)
    tval = df_tval_grid[df_tval_grid['tyear'] == int(YEAR)]
    concat_df = pd.merge(tval[['grid_idx', 'totalprc']], cbdi_df, on='grid_idx', suffixes=('', ''), how='left')
    # Transactions whose grid has no CBD-index row get NaN in 'cbdi'; drop them
    concat_df = concat_df.dropna(subset=['cbdi'])
    yearly_frames.append(concat_df)
tval_cbd = pd.concat(yearly_frames, ignore_index=True)
tval_cbd = gpd.GeoDataFrame(tval_cbd, geometry=tval_cbd['geometry_grids'], crs=crs)

In [389]:
# Min-max scale the transaction price column in place.
scaler = MinMaxScaler()
# The scaler expects 2-D input, so reshape the column to (n, 1)
totalprc = tval_cbd['totalprc'].to_numpy().reshape(-1, 1)
scaled_totalprc = scaler.fit_transform(totalprc)
# Write the scaled values back over the original column
tval_cbd['totalprc'] = scaled_totalprc


In [390]:
# Column groups defined in the project's constant module
SIM_CAL_COLS, STANDARDIZE_COLS = c.SIM_CAL_COLS, c.STANDARDIZE_COLS
PP_COLS, BS_COLS, BD_COLS = c.PP_COLS, c.BS_COLS, c.BD_COLS

# Regression features X1..X14, in order, grouped by column prefix
VAR_COLS = (
    ['pp_pop', 'pp_od', 'pp_drr']  # X1-X3
    + ['bs_ebit', 'bs_gas', 'bs_elct']  # X4-X6
    + [
        'bd_platarea',  # X7
        'bd_archarea',  # X8
        'bd_totarea',  # X9
        'bd_totflrcnt',  # X10
        'bd_elvtent',  # X11
        'bd_height',  # X12
        'bd_vintage',  # X13
        'bd_ilp',  # X14
    ]
)

# Kept as a one-element list, so selecting it yields a single-column DataFrame
TARGET_COL = ['totalprc']

In [393]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Baseline linear regression: predict TARGET_COL from VAR_COLS only (no 'cbdi').
X, y = tval_cbd[VAR_COLS], tval_cbd[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
# NOTE(review): gbd uses random_state=9999 while the scbd cells use 42 — confirm
# the seed was not chosen by looking at the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9999
)  # gbd
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(80, 14)
(80, 1)
MSE: 0.007888152556325492
RMSE: 0.08881527208946383
R2 Score: 0.6884032715332026
Train MSE: 0.008320598918948503
Train R2 Score: 0.7055662416185781
Train RMSE: 0.09121731699051723
Test MSE: 0.009030500910678287
Test RMSE: 0.09502894775108418
Test R2 Score: 0.3109926370092616


In [394]:
# Same regression as the baseline, with 'cbdi' added as an extra feature.
X, y = tval_cbd[VAR_COLS + ['cbdi']], tval_cbd[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
# NOTE(review): gbd uses random_state=9999 while the scbd cells use 42 — confirm
# the seed was not chosen by looking at the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=9999
)  # gbd
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(80, 15)
(80, 1)
MSE: 0.007535654757462667
RMSE: 0.086808149142017
R2 Score: 0.7023275915984029
Train MSE: 0.008018383552858735
Train R2 Score: 0.7162604725201287
Train RMSE: 0.08954542731406633
Test MSE: 0.007567585592311113
Test RMSE: 0.0869918708403901
Test R2 Score: 0.42260985910544036


### Prediction on Land Value

In [383]:
# Land-value setup: 'bd_ilp' moves from the feature list to the target, leaving
# features X1..X13, grouped by column prefix.
VAR_COLS = (
    ['pp_pop', 'pp_od', 'pp_drr']  # X1-X3
    + ['bs_ebit', 'bs_gas', 'bs_elct']  # X4-X6
    + [
        'bd_platarea',  # X7
        'bd_archarea',  # X8
        'bd_totarea',  # X9
        'bd_totflrcnt',  # X10
        'bd_elvtent',  # X11
        'bd_height',  # X12
        'bd_vintage',  # X13
    ]
)

# Kept as a one-element list, so selecting it yields a single-column DataFrame
TARGET_COL = ['bd_ilp']

In [384]:
# Stack the yearly CBD-index tables for CBD_NM into one GeoDataFrame (every
# grid row, without the transaction join).
# NOTE(review): the execution counts place this cell (In [384]) between the
# "scbd" cells and the "gbd" cells, so the recorded outputs were presumably
# produced with CBD_NM = "scbd"; run top to bottom it uses whatever CBD_NM is
# current — confirm which one is intended.
crs = 'EPSG:5179'  # same value the transaction-value cells use; set here so this cell does not depend on leftover state
yearly_frames = []
for YEAR in ['2020', '2021', '2022']:
    fpath = f"../asset/experiment/cbdindex/cbdi_{CBD_NM}_{YEAR}.csv"
    cbdi_df = pd.read_csv(fpath)
    # geometry_grids is stored as WKT text; parse it back into shapely objects
    cbdi_df['geometry_grids'] = cbdi_df['geometry_grids'].apply(loads)
    cbdi_df = gpd.GeoDataFrame(cbdi_df, geometry=cbdi_df['geometry_grids'], crs=crs)
    yearly_frames.append(cbdi_df)
# Concatenate once, newest year first (2022, 2021, 2020); the row order matters
# because the regressions on this frame use a seeded train/test split.
concat_df = pd.concat(yearly_frames[::-1], ignore_index=True)
cbdi_df = gpd.GeoDataFrame(concat_df, geometry=concat_df['geometry_grids'], crs=crs)

In [385]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Baseline linear regression on the stacked grid table: predict TARGET_COL
# from VAR_COLS only (no 'cbdi').
X, y = cbdi_df[VAR_COLS], cbdi_df[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(314, 13)
(314, 1)
MSE: 0.478495275093132
RMSE: 0.6917335289641033
R2 Score: 0.5552832426882437
Train MSE: 0.47989610799661325
Train R2 Score: 0.54559233416858
Train RMSE: 0.6927453413748903
Test MSE: 0.5023325121835587
Test RMSE: 0.7087541972951967
Test R2 Score: 0.5609663959056512


In [386]:
# Same regression as the baseline on the stacked grid table, with 'cbdi' added
# as an extra feature.
X, y = cbdi_df[VAR_COLS + ['cbdi']], cbdi_df[TARGET_COL]
print(X.shape)
print(y.shape)

# ---- In-sample fit: train on every row and score on the same rows ----
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)
for _label, _score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(_label, _score)

# ---- Hold-out fit: 80/20 split with a fixed seed ----
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
train_mse, test_mse = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# Train block is printed as MSE, R2, RMSE; test block as MSE, RMSE, R2
for _label, _score in (
    ("Train MSE:", train_mse),
    ("Train R2 Score:", train_r2),
    ("Train RMSE:", train_rmse),
    ("Test MSE:", test_mse),
    ("Test RMSE:", test_rmse),
    ("Test R2 Score:", test_r2),
):
    print(_label, _score)

(314, 14)
(314, 1)
MSE: 0.47849449544810474
RMSE: 0.6917329654195358
R2 Score: 0.5552839672956251
Train MSE: 0.47950989263979477
Train R2 Score: 0.5459580366943471
Train RMSE: 0.6924665281728748
Test MSE: 0.505179452116434
Test RMSE: 0.7107597710312775
Test R2 Score: 0.5584781988070058


In [158]:
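# NOTE: everything in this cell is a disabled experiment, kept for reference only.
# It ranks features with a RandomForestRegressor, keeps the top 3, and refits the
# linear model without and with 'cbdi' (the two runs use different split seeds: 42 and 1234).
# from sklearn.ensemble import RandomForestRegressor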
# from sklearn.model_selection import train_test_split
# X = tval_cbd[VAR_COLS]
# y = tval_cbd[TARGET_COL]

# # Create an instance of the Random Forest Regressor model
# regressor = RandomForestRegressor()

# # Fit the model to the data
# regressor.fit(X, y)

# # Get feature importances
# importances = regressor.feature_importances_
# # Sort feature importances in descending order
# sorted_indices = np.argsort(importances)[::-1]

# # Select top n features
# top_features = X.columns[sorted_indices[:3]]


# ############ w/o cbdi #######################
# X = tval_cbd[list(top_features)]
# print(X.shape)
# y = tval_cbd[TARGET_COL]

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create an instance of the LinearRegression model
# model = LinearRegression()

# # Fit the model to the training data
# model.fit(X_train, y_train)

# # Make predictions on the training and test sets
# y_train_pred = model.predict(X_train)
# y_test_pred = model.predict(X_test)

# # Calculate mean squared error
# train_mse = mean_squared_error(y_train, y_train_pred)
# test_mse = mean_squared_error(y_test, y_test_pred)

# # Calculate root mean squared error
# train_rmse = np.sqrt(train_mse)
# test_rmse = np.sqrt(test_mse)

# # Calculate R-squared score
# train_r2 = r2_score(y_train, y_train_pred)
# test_r2 = r2_score(y_test, y_test_pred)

# # Print the evaluation metrics
# print("Train MSE:", train_mse)
# print("Test MSE:", test_mse)
# print("Train RMSE:", train_rmse)
# print("Test RMSE:", test_rmse)
# print("Train R2 Score:", train_r2)
# print("Test R2 Score:", test_r2)
# print('---------------------')
# ############ w/ cbdi #######################

# X = tval_cbd[list(top_features) + ['cbdi']]
# print(X.shape)
# y = tval_cbd[TARGET_COL]

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# # Create an instance of the LinearRegression model
# model = LinearRegression()

# # Fit the model to the training data
# model.fit(X_train, y_train)

# # Make predictions on the training and test sets
# y_train_pred = model.predict(X_train)
# y_test_pred = model.predict(X_test)

# # Calculate mean squared error
# train_mse = mean_squared_error(y_train, y_train_pred)
# test_mse = mean_squared_error(y_test, y_test_pred)

# # Calculate root mean squared error
# train_rmse = np.sqrt(train_mse)
# test_rmse = np.sqrt(test_mse)

# # Calculate R-squared score
# train_r2 = r2_score(y_train, y_train_pred)
# test_r2 = r2_score(y_test, y_test_pred)

# # Print the evaluation metrics
# print("Train MSE:", train_mse)
# print("Test MSE:", test_mse)
# print("Train RMSE:", train_rmse)
# print("Test RMSE:", test_rmse)
# print("Train R2 Score:", train_r2)
# print("Test R2 Score:", test_r2)
