In [1]:
# For Colab only: uncomment the lines below to mount Google Drive and switch to the project directory.

# from google.colab import drive
# import os

# drive.mount('/content/drive')
# main_dir = '/content/drive/MyDrive/Graduate Project'
# os.chdir(main_dir)
# os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Graduate Project'

In [11]:
import pandas as pd

# Keys of '3. ha_freq_binary.h5': one freq / binary / features / dummies frame
# for every city (CH, NY, LA) and property-type flag (0, 1).
keys_1 = [
    'freq_CH_0', 'binary_CH_0', 'features_CH_0', 'dummies_CH_0',
    'freq_CH_1', 'binary_CH_1', 'features_CH_1', 'dummies_CH_1',
    'freq_NY_0', 'binary_NY_0', 'features_NY_0', 'dummies_NY_0',
    'freq_NY_1', 'binary_NY_1', 'features_NY_1', 'dummies_NY_1',
    'freq_LA_0', 'binary_LA_0', 'features_LA_0', 'dummies_LA_0',
    'freq_LA_1', 'binary_LA_1', 'features_LA_1', 'dummies_LA_1'
]
X_combined_dict = {
    name: pd.read_hdf('../dataset/raw/3. ha_freq_binary.h5', key=name)
    for name in keys_1
}

# Keys of '3. tfidf.h5': one TF-IDF frame per city and SF / CT suffix.
keys_2 = ['CH_SF', 'CH_CT', 'NY_SF', 'NY_CT', 'LA_SF', 'LA_CT']
tfidf_dict = {
    name: pd.read_hdf('../dataset/raw/3. tfidf.h5', key=name)
    for name in keys_2
}

# Keys of '6. llama_extracted.h5': freq and binary variants per city and flag.
keys_3 = [
    'freq_CH_0', 'freq_CH_1', 'freq_NY_0', 'freq_NY_1', 'freq_LA_0', 'freq_LA_1',
    'binary_CH_0', 'binary_CH_1', 'binary_NY_0', 'binary_NY_1', 'binary_LA_0', 'binary_LA_1'
]
llama_dict = {
    name: pd.read_hdf('../dataset/raw/6. llama_extracted.h5', key=name)
    for name in keys_3
}

In [12]:
# Embedding tables and their PCA counterparts, read pairwise from the raw dataset folder.
w2v_emb, w2v_pca = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/raw/4. w2v_embedding.csv', '../dataset/raw/4. w2v_pca.csv')
)
bert_emb, bert_pca = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/raw/4. bert_embedding.csv', '../dataset/raw/4. bert_pca.csv')
)
stf_emb, stf_pca = (
    pd.read_csv(csv_path)
    for csv_path in ('../dataset/raw/5. stf_embedding.csv', '../dataset/raw/5. stf_pca.csv')
)
# Local-file variant of the gpt_* tables, kept disabled as in the original cell:
# gpt_emb = pd.read_csv(os.path.join(main_dir, 'dataset', '5. gpt_embedding.csv'))
# gpt_pca = pd.read_csv(os.path.join(main_dir, 'dataset', '5. gpt_pca.csv'))

In [14]:
import requests
from io import StringIO

# Dropbox direct download link
# NOTE(review): the URL names '5.-stf_embedding.csv' although the result is stored
# as gpt_emb -- confirm the link points at the intended embedding file.
url_gpt_emb = 'https://www.dropbox.com/scl/fi/gcrk2mejy3su7nt9gf30i/5.-stf_embedding.csv?rlkey=d5uy0qm80geh81qxhxbtyms66&st=46sngliz&dl=1'

# Load directly into DataFrame.
# The timeout keeps the cell from hanging forever on a stalled connection, and a
# non-200 response raises so that later cells never run with gpt_emb undefined
# (or left over from an earlier execution).
response = requests.get(url_gpt_emb, timeout=60)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the file: {response.status_code}")
gpt_emb = pd.read_csv(StringIO(response.text))
print("CSV loaded successfully:", gpt_emb.shape)

CSV loaded successfully: (10111, 385)


In [15]:
# Dropbox direct download link
# NOTE(review): the URL names '5.-stf_pca.csv' although the result is stored
# as gpt_pca -- confirm the link points at the intended PCA file.
url_gpt_pca = 'https://www.dropbox.com/scl/fi/6gdiftk79r00a3uecf9zv/5.-stf_pca.csv?rlkey=j8e7e5tt81w2yt6fd3968mdwp&st=5wwf7fib&dl=1'

# Load directly into DataFrame.
# The timeout keeps the cell from hanging forever on a stalled connection, and a
# non-200 response raises so that later cells never run with gpt_pca undefined
# (or left over from an earlier execution).
response = requests.get(url_gpt_pca, timeout=60)
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the file: {response.status_code}")
gpt_pca = pd.read_csv(StringIO(response.text))
print("CSV loaded successfully:", gpt_pca.shape)

CSV loaded successfully: (10111, 33)


# Algorithm

In [18]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


def calculate_metrics(y_true, y_pred, n_features):
    """Compute R-squared, adjusted R-squared and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    n_features : int
        Number of predictors used in the adjusted R-squared correction.

    Returns
    -------
    tuple
        ``(r2, adj_r2, rmse)``. ``adj_r2`` is NaN when
        ``len(y_true) - n_features - 1`` is not positive, because the
        correction is undefined there (it would divide by zero or flip sign).
    """
    n_samples = len(y_true)
    r2 = r2_score(y_true, y_pred)

    # Residual degrees of freedom of the adjusted R-squared formula.
    dof = n_samples - n_features - 1
    if dof > 0:
        adj_r2 = 1 - ((1 - r2) * (n_samples - 1) / dof)
    else:
        # Too few samples for this many features: report "undefined" instead of
        # crashing or returning a misleading number.
        adj_r2 = float('nan')

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return r2, adj_r2, rmse

## OLS

In [5]:
from sklearn.model_selection import KFold
import statsmodels.api as sm


def fit_ols(df, city, single, target_var):
    """Fit an OLS model on the first 80% of rows and print test metrics for the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'zpid' column and ``target_var``; every other column is
        used as a predictor.
    city : str
        City label, used only in the printed header.
    single : int
        0 prints the 'Single Family' header, anything else 'Condo/Townhouse'.
    target_var : str
        Name of the target column.

    Returns
    -------
    None
        Results are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20), positional: rows are NOT shuffled here.
    split_index = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    # Add the intercept column. has_constant='add' forces it unconditionally:
    # the default ('skip') omits the intercept whenever some column happens to be
    # constant within the split (e.g. a dummy that is all ones), which would give
    # the train and test design matrices different columns.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    # Model
    model = sm.OLS(y_train, X_train).fit()

    # Testing metrics; the intercept is not counted as a feature.
    test_predictions = model.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test, test_predictions, X_test.shape[1] - 1)

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## Random Forest

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np


def fit_rf(df, city, single, target_var):
    """Tune a random-forest regressor by grid search and print held-out test metrics.

    The 20% test set is split off BEFORE hyperparameter tuning, so the grid
    search (5-fold CV) only ever sees training rows and the reported metrics are
    not biased by test data leaking into model selection.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Must contain a 'zpid' column and ``target_var``; every other column is
        used as a predictor.
    city : str
        City label, used only in the printed header.
    single : int
        0 prints the 'Single Family' header, anything else 'Condo/Townhouse'.
    target_var : str
        Name of the target column.

    Returns
    -------
    None
        Best hyperparameters and test metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) first, so tuning cannot see the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model; seeded so repeated runs give the same forest.
    rf = RandomForestRegressor(random_state=42)

    # Hyperparameter tuning with GridSearchCV on the training set only
    param_grid = {
        'bootstrap': [True, False],
        'max_depth': [None, 10, 20],
        'max_features': [None, 'sqrt'],
        'n_estimators': [32, 64, 100, 500]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # With the default refit=True, best_estimator_ has already been refit on the
    # whole training set using the best parameters, so no extra fit is needed.
    best_rf = grid_search.best_estimator_
    print(f'\nBest Hyperparameters: {grid_search.best_params_}')

    # Evaluate on the untouched test set
    y_test_pred = best_rf.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## XGBoost

In [8]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np


def fit_xgb(df, city, single, target_var):
    """Tune an XGBoost regressor by grid search and print held-out test metrics.

    The 20% test set is split off BEFORE hyperparameter tuning, so the grid
    search (5-fold CV) only ever sees training rows and the reported metrics are
    not biased by test data leaking into model selection.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Must contain a 'zpid' column and ``target_var``; every other column is
        used as a predictor.
    city : str
        City label, used only in the printed header.
    single : int
        0 prints the 'Single Family' header, anything else 'Condo/Townhouse'.
    target_var : str
        Name of the target column.

    Returns
    -------
    None
        Best hyperparameters and test metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) first, so tuning cannot see the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model
    xgb = XGBRegressor()

    # Hyperparameter tuning with GridSearchCV on the training set only
    param_grid = {
        'n_estimators': [500, 1000],
        'max_depth': [4, 6],
        'learning_rate': [0.01, 0.02],
        'subsample': [0.8, 1.0],
    }
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # With the default refit=True, best_estimator_ has already been refit on the
    # whole training set using the best parameters, so no extra fit is needed.
    best_xgb = grid_search.best_estimator_
    print(f'\nBest Hyperparameters: {grid_search.best_params_}')

    # Evaluate on the untouched test set
    y_test_pred = best_xgb.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## DNN

In [8]:
!pip install scikeras --quiet

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor

def create_dnn_model(input_dim, nodes, activation, dropout_rate, learning_rate,
                     output_activation='linear'):
    """Build and compile a three-hidden-layer feed-forward regression network.

    Architecture: Dense(nodes) -> Dropout(dropout_rate) -> Dense(2 * nodes)
    -> Dropout(dropout_rate / 2) -> Dense(nodes) -> Dense(1).

    Parameters
    ----------
    input_dim : int
        Number of input features.
    nodes : int
        Width of the first and third hidden layers; the second is twice as wide.
    activation : str
        Activation used by all hidden layers.
    dropout_rate : float
        Dropout after the first hidden layer; half of it after the second.
    learning_rate : float
        Learning rate of the Adam optimizer.
    output_activation : str, default 'linear'
        Activation of the single output unit. The default is linear because a
        sigmoid output is confined to (0, 1) and cannot reproduce an unbounded
        regression target such as the ln(TOM) values modelled in this notebook.
        Pass 'sigmoid' explicitly if the target really is scaled to [0, 1].

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with mean-squared-error loss.
    """
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(nodes, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(nodes*2, activation=activation))
    model.add(Dropout(dropout_rate/2))
    model.add(Dense(nodes, activation=activation))
    model.add(Dense(1, activation=output_activation))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

def fit_dnn(df, city, single, target_var):
    """Tune a dense network by grid search and print held-out test metrics.

    The 20% test set is split off BEFORE hyperparameter tuning, so the grid
    search (5-fold CV) only ever sees training rows. The final network is then
    rebuilt with every tuned hyperparameter and trained on the training set.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Must contain a 'zpid' column and ``target_var``; every other column is
        used as a predictor.
    city : str
        City label, used only in the printed header.
    single : int
        0 prints the 'Single Family' header, anything else 'Condo/Townhouse'.
    target_var : str
        Name of the target column.

    Returns
    -------
    None
        Best hyperparameters and test metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) first, so tuning cannot see the test rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model
    model = KerasRegressor(model=create_dnn_model, input_dim=X.shape[1], verbose=0)

    # Hyperparameter tuning with GridSearchCV on the training set only
    param_grid = {
        'model__nodes': [50, 100],
        'model__dropout_rate': [0.1, 0.2],
        'model__activation': ['relu', 'sigmoid'],
        'model__learning_rate': [0.0001, 0.001, 0.01],
        'epochs': [50, 75, 100],
        'batch_size': [10, 20]
    }

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f'\nBest Hyperparameters: {best_params}')

    # Rebuild the network with ALL tuned model parameters. create_dnn_model has
    # no defaults for activation and learning_rate, so omitting them raises a
    # TypeError.
    best_model = create_dnn_model(input_dim=X_train.shape[1],
                                  nodes=best_params['model__nodes'],
                                  activation=best_params['model__activation'],
                                  dropout_rate=best_params['model__dropout_rate'],
                                  learning_rate=best_params['model__learning_rate'])

    best_model.fit(X_train, y_train,
                   epochs=best_params['epochs'],
                   batch_size=best_params['batch_size'],
                   verbose=0)

    # The network returns one column per sample; flatten to a 1-D vector.
    y_test_pred = best_model.predict(X_test).flatten()

    # Compute metrics
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

# Input Feature

In [3]:
from datetime import datetime
import pytz
import tensorflow as tf

In [9]:
# (label, fit function) pairs that the experiment loops iterate over.
models_cpu = list({
    'OLS': fit_ols,
    'RF': fit_rf,
    'XGB': fit_xgb,
}.items())
models_gpu = list({'DNN': fit_dnn}.items())

## House Attribute

### Basic

In [16]:
# Preview of one design matrix (CH, flag 0): feature columns joined with the
# dummy columns on the listing id.
house_attribute_basic = X_combined_dict['features_CH_0'].merge(
    X_combined_dict['dummies_CH_0'], on='zpid', how='inner'
)
house_attribute_basic

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,CH_Northwest,CH_South,CH_Southwest,CH_West
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,0,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,0,1,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,1,0,0,0
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,0,1,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,0,1,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,0,1,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,0,0,0,1
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,0,0,0,0


In [19]:
# House attributes only -- no Llama-extracted columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features joined with dummies on zpid
            df = X_combined_dict[f'features_{city}_{single}'].merge(
                X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner'
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 13:40:16

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0096
Adjusted R-squared: -0.0515
RMSE: 0.6337

############## OLS ##############
Current Time (KST): 2025-04-16 13:40:16

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0516
Adjusted R-squared: 0.0189
RMSE: 0.8087

############## OLS ##############
Current Time (KST): 2025-04-16 13:40:16

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0035
Adjusted R-squared: -0.0778
RMSE: 0.5786

############## OLS ##############
Current Time (KST): 2025-04-16 13:40:16

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0510
Adjusted R-squared: 0.0141
RMSE: 0.7060

############## OLS ##############
Current Time (KST): 2025-04-16 13:40:16

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0997
Adjusted R-squared: 0.0760
RMSE: 1.0986

############## OLS #######

In [20]:
# House attributes plus the Llama-extracted 'freq' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features -> dummies -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 13:59:05

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0287
Adjusted R-squared: -0.1418
RMSE: 0.6389

############## OLS ##############
Current Time (KST): 2025-04-16 13:59:05

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0395
Adjusted R-squared: -0.0475
RMSE: 0.8131

############## OLS ##############
Current Time (KST): 2025-04-16 13:59:05

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0973
Adjusted R-squared: -0.3027
RMSE: 0.6072

############## OLS ##############
Current Time (KST): 2025-04-16 13:59:05

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0425
Adjusted R-squared: -0.0433
RMSE: 0.7083

############## OLS ##############
Current Time (KST): 2025-04-16 13:59:05

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0973
Adjusted R-squared: 0.0387
RMSE: 1.1006

############## OLS #####

In [21]:
# House attributes plus the Llama-extracted 'binary' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features -> dummies -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 14:31:30

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0227
Adjusted R-squared: -0.1351
RMSE: 0.6370

############## OLS ##############
Current Time (KST): 2025-04-16 14:31:30

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0406
Adjusted R-squared: -0.0463
RMSE: 0.8126

############## OLS ##############
Current Time (KST): 2025-04-16 14:31:30

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0639
Adjusted R-squared: -0.2630
RMSE: 0.5979

############## OLS ##############
Current Time (KST): 2025-04-16 14:31:30

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0610
Adjusted R-squared: -0.0232
RMSE: 0.7014

############## OLS ##############
Current Time (KST): 2025-04-16 14:31:30

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.1041
Adjusted R-squared: 0.0458
RMSE: 1.0965

############## OLS #####

## Word Count

### TF-IDF

In [22]:
# Preview: the CH house-attribute frame extended with the 'CH_SF' TF-IDF columns.
word_count_tfidf = house_attribute_basic.merge(
    tfidf_dict['CH_SF'], on='zpid', how='inner'
)
word_count_tfidf

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,park,hardwood,great,dining,perfect,beautiful,updated,window,main,second
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.040011,0.034306,0.044187,0.035634,0.000000,0.000000,0.045741,0.000000,0.041493,0.042880
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.052483,0.089997,0.000000,0.093481,0.107800,0.056245,0.059998,0.051851,0.054426,0.000000
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.040407,0.000000,0.000000,0.035986,0.082997,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.039820,0.000000,0.000000,0.000000,0.122688,0.042675,0.045522,0.000000,0.000000,0.000000
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.131050,0.000000,0.096485,0.000000,0.044863,0.046815,0.000000,0.000000,0.000000,0.093631
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.000000,0.000000,0.000000,0.000000,0.089709,0.046806,0.049928,0.000000,0.045292,0.000000


In [23]:
# House attributes plus TF-IDF columns -- no Llama-extracted columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # TF-IDF tables are keyed by 'SF' / 'CT' rather than 0 / 1
            if single == 0:
                type_ = 'SF'
            else:
                type_ = 'CT'
            # Features -> dummies -> TF-IDF, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 14:56:48

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.1079
Adjusted R-squared: -0.2976
RMSE: 0.6638

############## OLS ##############
Current Time (KST): 2025-04-16 14:56:48

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0320
Adjusted R-squared: -0.1028
RMSE: 0.8170

############## OLS ##############
Current Time (KST): 2025-04-16 14:56:48

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0969
Adjusted R-squared: -0.4183
RMSE: 0.6049

############## OLS ##############
Current Time (KST): 2025-04-16 14:56:48

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0565
Adjusted R-squared: -0.0691
RMSE: 0.7040

############## OLS ##############
Current Time (KST): 2025-04-16 14:56:48

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0818
Adjusted R-squared: -0.0077
RMSE: 1.1095

############## OLS ####

In [24]:
# House attributes plus TF-IDF columns plus the Llama-extracted 'freq' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # TF-IDF tables are keyed by 'SF' / 'CT' rather than 0 / 1
            if single == 0:
                type_ = 'SF'
            else:
                type_ = 'CT'
            # Features -> dummies -> TF-IDF -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
                .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 16:08:54

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.1277
Adjusted R-squared: -0.4178
RMSE: 0.6689

############## OLS ##############
Current Time (KST): 2025-04-16 16:08:54

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0169
Adjusted R-squared: -0.1870
RMSE: 0.8226

############## OLS ##############
Current Time (KST): 2025-04-16 16:08:54

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1570
Adjusted R-squared: -0.6898
RMSE: 0.6235

############## OLS ##############
Current Time (KST): 2025-04-16 16:08:54

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0496
Adjusted R-squared: -0.1342
RMSE: 0.7057

############## OLS ##############
Current Time (KST): 2025-04-16 16:08:54

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0727
Adjusted R-squared: -0.0586
RMSE: 1.1155

############## OLS ####

In [25]:
# House attributes plus TF-IDF columns plus the Llama-extracted 'binary' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # TF-IDF tables are keyed by 'SF' / 'CT' rather than 0 / 1
            if single == 0:
                type_ = 'SF'
            else:
                type_ = 'CT'
            # Features -> dummies -> TF-IDF -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
                .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 17:37:27

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.1109
Adjusted R-squared: -0.3967
RMSE: 0.6639

############## OLS ##############
Current Time (KST): 2025-04-16 17:37:27

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0198
Adjusted R-squared: -0.1835
RMSE: 0.8214

############## OLS ##############
Current Time (KST): 2025-04-16 17:37:27

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1212
Adjusted R-squared: -0.6374
RMSE: 0.6138

############## OLS ##############
Current Time (KST): 2025-04-16 17:37:27

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0693
Adjusted R-squared: -0.1108
RMSE: 0.6983

############## OLS ##############
Current Time (KST): 2025-04-16 17:37:27

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0764
Adjusted R-squared: -0.0543
RMSE: 1.1133

############## OLS ####

### Frequency

In [26]:
# Preview: the CH house-attribute frame extended with the 'freq_CH_0' word-count columns.
word_count_freq = house_attribute_basic.merge(
    X_combined_dict['freq_CH_0'], on='zpid', how='inner'
)
word_count_freq

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,offer,park,perfect,property,room,second,space,spacious,storage,window
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,4,1,0,1,3,1,3,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0,0,0,0,0,0,0,0,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,1,1,2,0,3,0,1,1,2,1
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,1,0,0,2,1,0,1,0,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,3,1,2,2,2,0,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0,0,0,2,0,0,0,0,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,3,1,3,0,1,0,3,2,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0,3,1,0,2,2,2,0,0,0
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,1,0,2,2,2,0,3,1,3,0


In [27]:
# House attributes plus word-frequency columns -- no Llama-extracted columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features -> dummies -> word frequency, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 18:59:45

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0763
Adjusted R-squared: -0.2605
RMSE: 0.6543

############## OLS ##############
Current Time (KST): 2025-04-16 18:59:45

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0259
Adjusted R-squared: -0.1097
RMSE: 0.8195

############## OLS ##############
Current Time (KST): 2025-04-16 18:59:45

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1306
Adjusted R-squared: -0.4619
RMSE: 0.6142

############## OLS ##############
Current Time (KST): 2025-04-16 18:59:45

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0396
Adjusted R-squared: -0.0883
RMSE: 0.7103

############## OLS ##############
Current Time (KST): 2025-04-16 18:59:45

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0955
Adjusted R-squared: 0.0074
RMSE: 1.1012

############## OLS #####

In [28]:
# House attributes plus word-frequency columns plus the Llama-extracted 'freq' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features -> dummies -> word frequency -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
                .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 19:39:14

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0960
Adjusted R-squared: -0.3780
RMSE: 0.6595

############## OLS ##############
Current Time (KST): 2025-04-16 19:39:15

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0115
Adjusted R-squared: -0.1934
RMSE: 0.8249

############## OLS ##############
Current Time (KST): 2025-04-16 19:39:15

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1867
Adjusted R-squared: -0.7330
RMSE: 0.6315

############## OLS ##############
Current Time (KST): 2025-04-16 19:39:15

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0306
Adjusted R-squared: -0.1569
RMSE: 0.7127

############## OLS ##############
Current Time (KST): 2025-04-16 19:39:15

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0874
Adjusted R-squared: -0.0418
RMSE: 1.1067

############## OLS ####

In [29]:
# House attributes plus word-frequency columns plus the Llama-extracted 'binary' columns.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model label and current time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Features -> dummies -> word frequency -> Llama extract, each joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
                .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
                .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
            )
            # Train and report
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 20:31:54

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0794
Adjusted R-squared: -0.3571
RMSE: 0.6545

############## OLS ##############
Current Time (KST): 2025-04-16 20:31:54

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0144
Adjusted R-squared: -0.1900
RMSE: 0.8237

############## OLS ##############
Current Time (KST): 2025-04-16 20:31:54

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1648
Adjusted R-squared: -0.7012
RMSE: 0.6256

############## OLS ##############
Current Time (KST): 2025-04-16 20:31:54

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0492
Adjusted R-squared: -0.1348
RMSE: 0.7058

############## OLS ##############
Current Time (KST): 2025-04-16 20:31:54

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0890
Adjusted R-squared: -0.0400
RMSE: 1.1057

############## OLS ####

### Binary (T/F)

In [30]:
# Preview: baseline house attributes inner-joined on 'zpid' with the
# 'binary_CH_0' word-indicator frame.
word_count_binary = house_attribute_basic.merge(
    X_combined_dict['binary_CH_0'], on='zpid', how='inner'
)
word_count_binary

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,offer,park,perfect,property,room,second,space,spacious,storage,window
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,1,1,0,1,1,1,1,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0,0,0,0,0,0,0,0,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,1,1,1,0,1,0,1,1,1,1
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,1,0,0,1,1,0,1,0,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,1,1,1,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0,0,0,1,0,0,0,0,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,1,1,1,0,1,0,1,1,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0,1,1,0,1,1,1,0,0,0
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,1,0,1,1,1,0,1,1,1,0


In [31]:
# llama: None
# For every (model, city, single) combination: join the house attributes with
# the binary word-indicator columns on 'zpid' and pass the result to the model
# function with 'ln(TOM)' as target. No Llama-extracted columns are added.

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Binary (inner joins on zpid)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 21:18:03

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0470
Adjusted R-squared: -0.2262
RMSE: 0.6453

############## OLS ##############
Current Time (KST): 2025-04-16 21:18:03

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0225
Adjusted R-squared: -0.1136
RMSE: 0.8210

############## OLS ##############
Current Time (KST): 2025-04-16 21:18:03

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0692
Adjusted R-squared: -0.3825
RMSE: 0.5973

############## OLS ##############
Current Time (KST): 2025-04-16 21:18:03

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0478
Adjusted R-squared: -0.0790
RMSE: 0.7072

############## OLS ##############
Current Time (KST): 2025-04-16 21:18:03

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0860
Adjusted R-squared: -0.0030
RMSE: 1.1069

############## OLS ####

In [32]:
# llama: Freq
# For every (model, city, single) combination: join the house attributes with
# the binary word-indicator columns and the Llama-extracted frequency columns
# on 'zpid', then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Binary -> Llama Extract (inner joins on zpid)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 21:50:37

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0605
Adjusted R-squared: -0.3333
RMSE: 0.6487

############## OLS ##############
Current Time (KST): 2025-04-16 21:50:37

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0089
Adjusted R-squared: -0.1966
RMSE: 0.8259

############## OLS ##############
Current Time (KST): 2025-04-16 21:50:37

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1545
Adjusted R-squared: -0.6860
RMSE: 0.6228

############## OLS ##############
Current Time (KST): 2025-04-16 21:50:37

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0433
Adjusted R-squared: -0.1418
RMSE: 0.7080

############## OLS ##############
Current Time (KST): 2025-04-16 21:50:37

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0805
Adjusted R-squared: -0.0497
RMSE: 1.1108

############## OLS ####

In [33]:
# llama: Binary
# For every (model, city, single) combination: join the house attributes with
# the binary word-indicator columns and the Llama-extracted binary columns on
# 'zpid', then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Binary -> Llama Extract (inner joins on zpid)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 22:36:45

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0528
Adjusted R-squared: -0.3236
RMSE: 0.6463

############## OLS ##############
Current Time (KST): 2025-04-16 22:36:45

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0104
Adjusted R-squared: -0.1948
RMSE: 0.8253

############## OLS ##############
Current Time (KST): 2025-04-16 22:36:45

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1149
Adjusted R-squared: -0.6283
RMSE: 0.6121

############## OLS ##############
Current Time (KST): 2025-04-16 22:36:45

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0584
Adjusted R-squared: -0.1237
RMSE: 0.7024

############## OLS ##############
Current Time (KST): 2025-04-16 22:36:45

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0877
Adjusted R-squared: -0.0415
RMSE: 1.1065

############## OLS ####

## Word Embedding

### Word2Vec

In [34]:
# NOTE: this rewrites the 'zpid' column of house_attribute_basic in place as
# int; every later cell that uses house_attribute_basic sees the int key.
# NOTE(review): presumably the CSV embedding frames carry integer zpids - confirm.
house_attribute_basic['zpid'] = house_attribute_basic['zpid'].astype(int)

# Preview: baseline house attributes inner-joined with the Word2Vec embeddings.
word_embedding_w2v = house_attribute_basic.merge(w2v_emb, on='zpid', how='inner')
word_embedding_w2v

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,embed_246,embed_247,embed_248,embed_249,embed_250,embed_251,embed_252,embed_253,embed_254,embed_255
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.223464,-0.007206,-0.162735,-0.102017,-0.255791,-0.041559,-0.251558,0.012404,-0.272424,0.197628
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.183473,-0.027997,0.089133,-0.094464,-0.156036,0.259032,-0.106731,-0.028881,-0.244925,-0.213802
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.213268,-0.020160,-0.143695,-0.115935,-0.208955,-0.126121,-0.210337,0.065599,-0.239116,0.223258
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.089511,-0.163466,0.002611,-0.133744,-0.334398,0.209188,-0.059271,0.102027,-0.206558,0.008846
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.203364,-0.096827,-0.013891,-0.004618,-0.180039,0.130514,-0.172449,0.036378,-0.158418,-0.040934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.131782,-0.112113,0.044428,-0.099765,-0.282524,0.317623,-0.061352,0.023665,-0.231226,-0.136804
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.221391,0.013927,-0.062395,-0.081934,-0.166114,0.029040,-0.193462,0.014361,-0.242746,0.044690
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.177176,-0.060876,-0.138395,-0.142217,-0.302656,-0.023339,-0.187376,0.074042,-0.262774,0.232845
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.162928,-0.063157,-0.047558,-0.115524,-0.230991,0.053731,-0.178668,0.065665,-0.189680,0.092720


In [35]:
# Preview: baseline house attributes inner-joined with the Word2Vec PCA
# components on 'zpid'.
# The key is cast to int on a temporary copy so this cell does not depend on
# house_attribute_basic having been mutated by an earlier cell; when the
# column is already int the cast changes nothing.
# NOTE(review): assumes w2v_pca stores zpid as integers - confirm.
word_embedding_w2v_pca = pd.merge(
    house_attribute_basic.astype({'zpid': int}),
    w2v_pca,
    on='zpid', how='inner'
)
word_embedding_w2v_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.012387,-0.000549,0.004977,0.001819,0.002223,-0.004770,0.007689,0.000074,-0.006769,-0.000810
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.007043,0.006386,-0.021022,-0.010440,-0.007260,0.007682,-0.008644,-0.005621,0.008528,-0.000807
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.017369,0.010176,-0.006706,-0.002855,-0.002011,0.000952,-0.005180,-0.006513,0.001675,0.005536
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.001276,-0.004725,0.017497,0.021998,0.022062,-0.002967,-0.013703,0.017604,-0.001542,-0.004684
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.009105,0.006408,0.003389,-0.000759,-0.003152,0.000812,-0.000406,0.002651,0.004350,0.000274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.009106,-0.003481,0.005066,-0.010062,-0.000403,-0.008653,-0.016338,0.010330,0.004095,-0.012030
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.010199,0.005891,-0.007049,0.005434,0.000235,-0.010645,-0.003536,0.001607,-0.003288,0.000053
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.000281,-0.001413,-0.016239,0.003292,0.001671,0.000450,0.002700,0.005777,-0.005455,-0.002098
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.006139,0.008565,0.007564,0.008352,0.000864,-0.005409,0.006516,0.005086,0.001877,0.002017


In [36]:
# llama: None
# For every (model, city, single) combination: join the house attributes with
# the Word2Vec PCA components on 'zpid' and pass the result to the model
# function with 'ln(TOM)' as target. No Llama-extracted columns are added.

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Word2Vec (zpid cast to int before the join)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(w2v_pca, on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-16 23:16:18

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0859
Adjusted R-squared: -0.2718
RMSE: 0.6572

############## OLS ##############
Current Time (KST): 2025-04-16 23:16:19

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0318
Adjusted R-squared: -0.1030
RMSE: 0.8171

############## OLS ##############
Current Time (KST): 2025-04-16 23:16:19

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0480
Adjusted R-squared: -0.3551
RMSE: 0.5913

############## OLS ##############
Current Time (KST): 2025-04-16 23:16:19

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0256
Adjusted R-squared: -0.1042
RMSE: 0.7154

############## OLS ##############
Current Time (KST): 2025-04-16 23:16:19

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0986
Adjusted R-squared: 0.0108
RMSE: 1.0993

############## OLS #####

In [37]:
# llama: Freq
# For every (model, city, single) combination: join the house attributes with
# the Word2Vec PCA components and the Llama-extracted frequency columns on
# 'zpid', then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Word2Vec (int key) -> Llama Extract (str key)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(w2v_pca, on='zpid', how='inner')
            .astype({'zpid': str})
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 01:09:27

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.1076
Adjusted R-squared: -0.3925
RMSE: 0.6629

############## OLS ##############
Current Time (KST): 2025-04-17 01:09:27

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0115
Adjusted R-squared: -0.1935
RMSE: 0.8249

############## OLS ##############
Current Time (KST): 2025-04-17 01:09:27

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1119
Adjusted R-squared: -0.6239
RMSE: 0.6113

############## OLS ##############
Current Time (KST): 2025-04-17 01:09:27

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0154
Adjusted R-squared: -0.1751
RMSE: 0.7183

############## OLS ##############
Current Time (KST): 2025-04-17 01:09:27

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0960
Adjusted R-squared: -0.0320
RMSE: 1.1015

############## OLS ####

In [38]:
# llama: Binary
# For every (model, city, single) combination: join the house attributes with
# the Word2Vec PCA components and the Llama-extracted binary columns on
# 'zpid', then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Word2Vec (int key) -> Llama Extract (str key)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(w2v_pca, on='zpid', how='inner')
            .astype({'zpid': str})
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 03:15:29

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0887
Adjusted R-squared: -0.3688
RMSE: 0.6573

############## OLS ##############
Current Time (KST): 2025-04-17 03:15:30

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0176
Adjusted R-squared: -0.1861
RMSE: 0.8223

############## OLS ##############
Current Time (KST): 2025-04-17 03:15:30

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0953
Adjusted R-squared: -0.5996
RMSE: 0.6067

############## OLS ##############
Current Time (KST): 2025-04-17 03:15:30

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0302
Adjusted R-squared: -0.1575
RMSE: 0.7129

############## OLS ##############
Current Time (KST): 2025-04-17 03:15:30

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.1039
Adjusted R-squared: -0.0229
RMSE: 1.0966

############## OLS ####

### BERT

In [39]:
# Preview: baseline house attributes inner-joined with the BERT embeddings
# on 'zpid'.
# The key is cast to int on a temporary copy so this cell does not depend on
# house_attribute_basic having been mutated by an earlier cell; when the
# column is already int the cast changes nothing.
# NOTE(review): assumes bert_emb stores zpid as integers - confirm.
word_embedding_bert = pd.merge(
    house_attribute_basic.astype({'zpid': int}),
    bert_emb,
    on='zpid', how='inner'
)
word_embedding_bert

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.002209,-0.066249,0.021836,0.003349,-0.158957,-0.161074,-0.080721,-0.021704,0.088047,-0.194490
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.110557,-0.127086,0.028052,-0.063860,-0.154856,-0.082603,-0.231665,0.029396,-0.021418,-0.165803
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.195316,-0.083266,-0.072137,0.110479,-0.169599,-0.422717,0.066835,0.048301,0.037172,-0.148788
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.294773,-0.068021,0.086495,-0.140743,-0.201618,-0.158005,-0.034060,0.108299,-0.008776,-0.069019
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.095778,-0.153731,0.040213,0.025163,-0.375998,-0.533569,-0.005384,-0.109406,-0.059209,-0.185234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.345192,-0.050782,0.051460,-0.269730,-0.079036,-0.170702,-0.007881,0.125350,-0.039494,-0.024820
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.241844,-0.181451,0.059410,-0.060932,-0.257907,-0.303960,-0.014816,0.020160,0.017476,-0.196169
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.141273,-0.121933,-0.098506,-0.013390,-0.206108,-0.273568,-0.033298,0.070359,0.039624,-0.188428
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.144971,-0.167624,0.057370,-0.098211,-0.287077,-0.235150,0.185234,-0.016989,0.015423,-0.056939


In [40]:
# Preview: baseline house attributes inner-joined with the BERT PCA
# components on 'zpid'.
# The key is cast to int on a temporary copy so this cell does not depend on
# house_attribute_basic having been mutated by an earlier cell; when the
# column is already int the cast changes nothing.
# NOTE(review): assumes bert_pca stores zpid as integers - confirm.
word_embedding_bert_pca = pd.merge(
    house_attribute_basic.astype({'zpid': int}),
    bert_pca,
    on='zpid', how='inner'
)
word_embedding_bert_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.074151,0.302540,-0.297929,-0.219099,-0.074798,-0.013947,0.019658,0.241600,0.000217,0.173076
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,-0.057093,-0.251500,0.072636,-0.003742,0.299997,-0.155731,0.103498,-0.163015,0.224072,0.433368
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.180451,-0.125636,-0.327241,0.090504,-0.092460,0.099371,0.127525,-0.074272,0.203362,0.071731
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,-0.057523,0.155976,0.293204,0.036025,0.807017,-0.016831,0.208986,0.036916,-0.378630,0.345290
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.100179,-0.047546,0.354586,-0.002388,-0.140394,-0.037001,-0.225810,-0.072347,-0.053413,0.135207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.060178,0.171788,0.032892,0.253508,0.482016,-0.016670,0.441367,0.030835,0.032977,0.269307
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.250856,0.156176,0.074924,-0.049251,-0.024537,-0.292212,0.014764,0.048038,0.109006,0.033126
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.220507,0.220416,-0.074999,0.003914,-0.014332,0.142471,-0.117733,-0.120593,-0.038260,0.091872
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.026316,0.042256,0.011237,0.204519,0.303453,-0.038491,-0.028662,0.222102,-0.232738,-0.193949


In [41]:
# llama: None
# For every (model, city, single) combination: join the house attributes with
# the BERT PCA components on 'zpid' and pass the result to the model function
# with 'ln(TOM)' as target. No Llama-extracted columns are added.

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> BERT (PCA) (zpid cast to int before the join)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(bert_pca, on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 05:13:34

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0620
Adjusted R-squared: -0.2438
RMSE: 0.6499

############## OLS ##############
Current Time (KST): 2025-04-17 05:13:34

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0699
Adjusted R-squared: -0.0596
RMSE: 0.8008

############## OLS ##############
Current Time (KST): 2025-04-17 05:13:34

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0526
Adjusted R-squared: -0.3610
RMSE: 0.5926

############## OLS ##############
Current Time (KST): 2025-04-17 05:13:34

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0442
Adjusted R-squared: -0.0830
RMSE: 0.7085

############## OLS ##############
Current Time (KST): 2025-04-17 05:13:34

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.1289
Adjusted R-squared: 0.0441
RMSE: 1.0807

############## OLS #####

In [42]:
# llama: Freq
# For every (model, city, single) combination: join the house attributes with
# the BERT PCA components and the Llama-extracted frequency columns on 'zpid',
# then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> BERT (PCA) (int key) -> Llama Extract (str key)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(bert_pca, on='zpid', how='inner')
            .astype({'zpid': str})
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 07:06:48

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0743
Adjusted R-squared: -0.3506
RMSE: 0.6529

############## OLS ##############
Current Time (KST): 2025-04-17 07:06:48

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0548
Adjusted R-squared: -0.1411
RMSE: 0.8066

############## OLS ##############
Current Time (KST): 2025-04-17 07:06:48

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1378
Adjusted R-squared: -0.6617
RMSE: 0.6183

############## OLS ##############
Current Time (KST): 2025-04-17 07:06:48

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0363
Adjusted R-squared: -0.1502
RMSE: 0.7106

############## OLS ##############
Current Time (KST): 2025-04-17 07:06:48

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.1254
Adjusted R-squared: 0.0016
RMSE: 1.0834

############## OLS #####

In [43]:
# llama: Binary
# For every (model, city, single) combination: join the house attributes with
# the BERT PCA components and the Llama-extracted binary columns on 'zpid',
# then pass the result to the model function ('ln(TOM)' target).

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> BERT (PCA) (int key) -> Llama Extract (str key)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(bert_pca, on='zpid', how='inner')
            .astype({'zpid': str})
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 09:12:37

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0634
Adjusted R-squared: -0.3369
RMSE: 0.6496

############## OLS ##############
Current Time (KST): 2025-04-17 09:12:37

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0527
Adjusted R-squared: -0.1438
RMSE: 0.8075

############## OLS ##############
Current Time (KST): 2025-04-17 09:12:37

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1283
Adjusted R-squared: -0.6478
RMSE: 0.6157

############## OLS ##############
Current Time (KST): 2025-04-17 09:12:37

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0546
Adjusted R-squared: -0.1283
RMSE: 0.7038

############## OLS ##############
Current Time (KST): 2025-04-17 09:12:37

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.1292
Adjusted R-squared: 0.0059
RMSE: 1.0810

############## OLS #####

## Text Embedding

### Sentence TF

In [44]:
# Preview: baseline house attributes inner-joined with the stf_emb text
# embeddings on 'zpid'.
# The key is cast to int on a temporary copy so this cell does not depend on
# house_attribute_basic having been mutated by an earlier cell; when the
# column is already int the cast changes nothing.
# NOTE(review): assumes stf_emb stores zpid as integers - confirm.
text_embedding_stf = pd.merge(
    house_attribute_basic.astype({'zpid': int}),
    stf_emb,
    on='zpid', how='inner'
)
text_embedding_stf

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.014884,0.030813,-0.072282,-0.041126,-0.004638,0.042128,0.069364,-0.022453,-0.150843,0.045339
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.098454,-0.018951,0.023943,0.022205,0.007497,0.083833,0.027238,-0.026806,-0.049773,0.020240
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.067548,-0.046245,-0.066852,0.022917,0.057328,0.096687,0.042444,0.023923,-0.088812,0.072610
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.042285,0.000283,-0.018360,0.064053,0.068314,0.040049,0.006282,-0.032294,-0.030286,0.005316
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.047137,0.013095,-0.091455,0.000339,0.050769,0.035730,-0.025520,0.042480,-0.059526,0.028172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.029160,0.039548,0.012395,-0.019703,-0.020704,-0.014466,-0.054416,0.015904,-0.073100,0.021451
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.061723,0.015087,-0.034785,-0.012646,0.005323,0.052792,0.082454,-0.029282,-0.079274,0.040100
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.050333,0.068426,-0.092724,-0.029884,0.001411,0.128324,0.086248,-0.030514,-0.113628,0.006726
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.044795,-0.010378,-0.025361,0.004221,0.015714,0.018236,0.082321,0.002339,-0.087777,0.018066


In [45]:
# Preview: baseline house attributes inner-joined with the stf_pca
# components on 'zpid'.
# The key is cast to int on a temporary copy so this cell does not depend on
# house_attribute_basic having been mutated by an earlier cell; when the
# column is already int the cast changes nothing.
# NOTE(review): assumes stf_pca stores zpid as integers - confirm.
text_embedding_stf_pca = pd.merge(
    house_attribute_basic.astype({'zpid': int}),
    stf_pca,
    on='zpid', how='inner'
)
text_embedding_stf_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.052005,0.004084,0.094228,-0.126335,-0.057718,-0.014378,0.041700,-0.037116,0.122407,0.014602
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.008417,-0.051992,-0.013960,-0.025140,0.034091,-0.079997,0.026623,0.075764,0.037496,0.048367
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.008463,0.063568,0.010244,-0.048827,0.011803,0.051097,0.167714,0.033750,-0.017540,-0.076804
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.074449,0.018354,-0.040208,0.078236,0.076124,0.030177,0.053090,-0.024083,-0.005804,0.002427
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.081950,-0.062769,0.198024,0.013443,0.006049,-0.035317,0.012005,-0.000749,0.057624,0.063310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.079875,-0.009341,-0.154927,-0.012613,0.074548,-0.061339,0.080912,-0.012805,0.066575,-0.039803
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.000482,0.068557,0.089715,-0.069765,0.009431,-0.159541,-0.088961,-0.052833,0.055982,0.022920
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.035491,0.005164,-0.040089,-0.097735,0.062097,0.035229,0.021420,-0.051228,0.095142,-0.016373
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.022669,-0.105762,-0.063176,0.030789,0.009812,0.016110,-0.105148,-0.089061,-0.048036,0.144044


In [46]:
# llama: None
# For every (model, city, single) combination: join the house attributes with
# the stf_pca components on 'zpid' and pass the result to the model function
# with 'ln(TOM)' as target. No Llama-extracted columns are added.

for model_name, model_function in models_cpu:
    for city, single in ((c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]):
        # Time
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute -> Sentence TF (PCA) (zpid cast to int before the join)
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .astype({'zpid': int})
            .merge(stf_pca, on='zpid', how='inner')
        )
        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 11:11:38

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0664
Adjusted R-squared: -0.2490
RMSE: 0.6513

############## OLS ##############
Current Time (KST): 2025-04-17 11:11:38

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0364
Adjusted R-squared: -0.0978
RMSE: 0.8151

############## OLS ##############
Current Time (KST): 2025-04-17 11:11:38

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0166
Adjusted R-squared: -0.3144
RMSE: 0.5824

############## OLS ##############
Current Time (KST): 2025-04-17 11:11:38

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0434
Adjusted R-squared: -0.0840
RMSE: 0.7088

############## OLS ##############
Current Time (KST): 2025-04-17 11:11:38

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0978
Adjusted R-squared: 0.0100
RMSE: 1.0998

############## OLS #####

In [47]:
# llama: Freq
# House attributes + Sentence-TF PCA components + Llama "freq" features.

# Every (city, single) pair, in the same order the nested loops would visit them.
segments = [(city, single) for city in ['CH', 'NY', 'LA'] for single in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner plus a Seoul-time timestamp so each run can be located in the log
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        stamp = kst.strftime("%Y-%m-%d %H:%M:%S")
        print(f'Current Time (KST): {stamp}')

        df = (
            # House Attribute: numeric features joined with their dummy columns
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            # Sentence TF (PCA): the key is cast to int before this join
            .astype({'zpid': int})
            .merge(stf_pca, on='zpid', how='inner')
            # Llama Extract: the key is cast to str before this join
            # (presumably the Llama tables store zpid as text -- verify)
            .astype({'zpid': str})
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )

        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 13:05:54

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0876
Adjusted R-squared: -0.3675
RMSE: 0.6569

############## OLS ##############
Current Time (KST): 2025-04-17 13:05:54

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0149
Adjusted R-squared: -0.1894
RMSE: 0.8234

############## OLS ##############
Current Time (KST): 2025-04-17 13:05:54

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1119
Adjusted R-squared: -0.6239
RMSE: 0.6113

############## OLS ##############
Current Time (KST): 2025-04-17 13:05:54

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0264
Adjusted R-squared: -0.1619
RMSE: 0.7142

############## OLS ##############
Current Time (KST): 2025-04-17 13:05:54

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0928
Adjusted R-squared: -0.0357
RMSE: 1.1034

############## OLS ####

In [48]:
# llama: Binary
# House attributes + Sentence-TF PCA components + Llama "binary" features.

# Every (city, single) pair, in the same order the nested loops would visit them.
segments = [(city, single) for city in ['CH', 'NY', 'LA'] for single in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner plus a Seoul-time timestamp so each run can be located in the log
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        stamp = kst.strftime("%Y-%m-%d %H:%M:%S")
        print(f'Current Time (KST): {stamp}')

        df = (
            # House Attribute: numeric features joined with their dummy columns
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            # Sentence TF (PCA): the key is cast to int before this join
            .astype({'zpid': int})
            .merge(stf_pca, on='zpid', how='inner')
            # Llama Extract: the key is cast to str before this join
            # (presumably the Llama tables store zpid as text -- verify)
            .astype({'zpid': str})
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )

        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 15:13:37

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0656
Adjusted R-squared: -0.3397
RMSE: 0.6503

############## OLS ##############
Current Time (KST): 2025-04-17 15:13:37

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0190
Adjusted R-squared: -0.1844
RMSE: 0.8217

############## OLS ##############
Current Time (KST): 2025-04-17 15:13:37

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0715
Adjusted R-squared: -0.5649
RMSE: 0.6001

############## OLS ##############
Current Time (KST): 2025-04-17 15:13:37

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0478
Adjusted R-squared: -0.1364
RMSE: 0.7063

############## OLS ##############
Current Time (KST): 2025-04-17 15:13:37

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0949
Adjusted R-squared: -0.0332
RMSE: 1.1021

############## OLS ####

### GPT

In [49]:
# Attach the GPT embedding columns to the basic house attributes; the inner
# join keeps only listings (zpid) present in both tables.
# NOTE(review): the recorded preview ends at column emb_383 -- confirm that
# gpt_emb really holds GPT embeddings and not another model's vectors.
text_embedding_gpt = house_attribute_basic.merge(gpt_emb, on='zpid', how='inner')
text_embedding_gpt

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.014884,0.030813,-0.072282,-0.041126,-0.004638,0.042128,0.069364,-0.022453,-0.150843,0.045339
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.098454,-0.018951,0.023943,0.022205,0.007497,0.083833,0.027238,-0.026806,-0.049773,0.020240
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.067548,-0.046245,-0.066852,0.022917,0.057328,0.096687,0.042444,0.023923,-0.088812,0.072610
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.042285,0.000283,-0.018360,0.064053,0.068314,0.040049,0.006282,-0.032294,-0.030286,0.005316
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.047137,0.013095,-0.091455,0.000339,0.050769,0.035730,-0.025520,0.042480,-0.059526,0.028172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.029160,0.039548,0.012395,-0.019703,-0.020704,-0.014466,-0.054416,0.015904,-0.073100,0.021451
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.061723,0.015087,-0.034785,-0.012646,0.005323,0.052792,0.082454,-0.029282,-0.079274,0.040100
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.050333,0.068426,-0.092724,-0.029884,0.001411,0.128324,0.086248,-0.030514,-0.113628,0.006726
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.044795,-0.010378,-0.025361,0.004221,0.015714,0.018236,0.082321,0.002339,-0.087777,0.018066


In [50]:
# Attach the GPT PCA components to the basic house attributes; the inner
# join keeps only listings (zpid) present in both tables.
# NOTE(review): the recorded preview of this frame shows the same pca_22..pca_31
# values as the Sentence-TF PCA preview earlier in the notebook -- confirm that
# gpt_pca is loaded from the GPT file.
text_embedding_gpt_pca = pd.merge(
    house_attribute_basic,
    gpt_pca,
    on='zpid', how='inner'
)
# The original cell misspelled this name ("gpd"); keep it as an alias so any
# cell that still refers to the old spelling continues to work.
text_embedding_gpd_pca = text_embedding_gpt_pca
text_embedding_gpt_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.052005,0.004084,0.094228,-0.126335,-0.057718,-0.014378,0.041700,-0.037116,0.122407,0.014602
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.008417,-0.051992,-0.013960,-0.025140,0.034091,-0.079997,0.026623,0.075764,0.037496,0.048367
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.008463,0.063568,0.010244,-0.048827,0.011803,0.051097,0.167714,0.033750,-0.017540,-0.076804
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.074449,0.018354,-0.040208,0.078236,0.076124,0.030177,0.053090,-0.024083,-0.005804,0.002427
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.081950,-0.062769,0.198024,0.013443,0.006049,-0.035317,0.012005,-0.000749,0.057624,0.063310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.079875,-0.009341,-0.154927,-0.012613,0.074548,-0.061339,0.080912,-0.012805,0.066575,-0.039803
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.000482,0.068557,0.089715,-0.069765,0.009431,-0.159541,-0.088961,-0.052833,0.055982,0.022920
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.035491,0.005164,-0.040089,-0.097735,0.062097,0.035229,0.021420,-0.051228,0.095142,-0.016373
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.022669,-0.105762,-0.063176,0.030789,0.009812,0.016110,-0.105148,-0.089061,-0.048036,0.144044


In [51]:
# llama: None
# GPT baseline: house attributes + GPT PCA components, without any
# Llama-extracted features.

# Guard against running the (long) GPT experiments on the wrong data: the
# metrics recorded for this block are identical to the Sentence-TF block, and
# the GPT download URL at the top of the notebook references an
# "stf_embedding" file. Warn loudly if the two PCA tables are the same.
if gpt_pca.equals(stf_pca):
    print(
        'WARNING: gpt_pca is identical to stf_pca -- the GPT runs below will '
        'only reproduce the Sentence-TF results. Check the GPT download URLs.'
    )

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner plus a Seoul-time timestamp so each run can be located in the log
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House Attribute: numeric features joined with their dummy columns
            df = pd.merge(
                X_combined_dict[f'features_{city}_{single}'],
                X_combined_dict[f'dummies_{city}_{single}'],
                on='zpid', how='inner'
            )
            # GPT (PCA): the key is cast to int before this join
            # (presumably gpt_pca stores zpid as an integer -- verify)
            df['zpid'] = df['zpid'].astype(int)
            df = pd.merge(
                df,
                gpt_pca,
                on='zpid', how='inner'
            )
            # Fit
            model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 17:16:22

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0664
Adjusted R-squared: -0.2490
RMSE: 0.6513

############## OLS ##############
Current Time (KST): 2025-04-17 17:16:23

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0364
Adjusted R-squared: -0.0978
RMSE: 0.8151

############## OLS ##############
Current Time (KST): 2025-04-17 17:16:23

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0166
Adjusted R-squared: -0.3144
RMSE: 0.5824

############## OLS ##############
Current Time (KST): 2025-04-17 17:16:23

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0434
Adjusted R-squared: -0.0840
RMSE: 0.7088

############## OLS ##############
Current Time (KST): 2025-04-17 17:16:23

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0978
Adjusted R-squared: 0.0100
RMSE: 1.0998

############## OLS #####

In [52]:
# llama: Freq
# House attributes + GPT PCA components + Llama "freq" features.
# NOTE(review): the metrics recorded for this cell are identical to the
# Sentence-TF + Llama "freq" run -- confirm gpt_pca differs from stf_pca.

# Every (city, single) pair, in the same order the nested loops would visit them.
segments = [(city, single) for city in ['CH', 'NY', 'LA'] for single in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner plus a Seoul-time timestamp so each run can be located in the log
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        stamp = kst.strftime("%Y-%m-%d %H:%M:%S")
        print(f'Current Time (KST): {stamp}')

        df = (
            # House Attribute: numeric features joined with their dummy columns
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            # GPT (PCA): the key is cast to int before this join
            .astype({'zpid': int})
            .merge(gpt_pca, on='zpid', how='inner')
            # Llama Extract: the key is cast to str before this join
            # (presumably the Llama tables store zpid as text -- verify)
            .astype({'zpid': str})
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )

        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 19:16:47

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0876
Adjusted R-squared: -0.3675
RMSE: 0.6569

############## OLS ##############
Current Time (KST): 2025-04-17 19:16:47

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0149
Adjusted R-squared: -0.1894
RMSE: 0.8234

############## OLS ##############
Current Time (KST): 2025-04-17 19:16:47

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.1119
Adjusted R-squared: -0.6239
RMSE: 0.6113

############## OLS ##############
Current Time (KST): 2025-04-17 19:16:47

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0264
Adjusted R-squared: -0.1619
RMSE: 0.7142

############## OLS ##############
Current Time (KST): 2025-04-17 19:16:47

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0928
Adjusted R-squared: -0.0357
RMSE: 1.1034

############## OLS ####

In [53]:
# llama: Binary
# House attributes + GPT PCA components + Llama "binary" features.
# NOTE(review): the metrics recorded for this cell are identical to the
# Sentence-TF + Llama "binary" run -- confirm gpt_pca differs from stf_pca.

# Every (city, single) pair, in the same order the nested loops would visit them.
segments = [(city, single) for city in ['CH', 'NY', 'LA'] for single in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner plus a Seoul-time timestamp so each run can be located in the log
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        stamp = kst.strftime("%Y-%m-%d %H:%M:%S")
        print(f'Current Time (KST): {stamp}')

        df = (
            # House Attribute: numeric features joined with their dummy columns
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            # GPT (PCA): the key is cast to int before this join
            .astype({'zpid': int})
            .merge(gpt_pca, on='zpid', how='inner')
            # Llama Extract: the key is cast to str before this join
            # (presumably the Llama tables store zpid as text -- verify)
            .astype({'zpid': str})
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )

        # Fit
        model_function(df, city, single, 'ln(TOM)')


############## OLS ##############
Current Time (KST): 2025-04-17 21:30:05

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: -0.0656
Adjusted R-squared: -0.3397
RMSE: 0.6503

############## OLS ##############
Current Time (KST): 2025-04-17 21:30:05

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0190
Adjusted R-squared: -0.1844
RMSE: 0.8217

############## OLS ##############
Current Time (KST): 2025-04-17 21:30:05

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: -0.0715
Adjusted R-squared: -0.5649
RMSE: 0.6001

############## OLS ##############
Current Time (KST): 2025-04-17 21:30:05

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.0478
Adjusted R-squared: -0.1364
RMSE: 0.7063

############## OLS ##############
Current Time (KST): 2025-04-17 21:30:06

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.0949
Adjusted R-squared: -0.0332
RMSE: 1.1021

############## OLS ####