In [1]:
# for colab

from google.colab import drive
import os

# Project root on the mounted Drive; later cells build their dataset paths from it.
main_dir = '/content/drive/MyDrive/Graduate Project'

# Mount Google Drive, then make the project root the working directory.
drive.mount('/content/drive')
os.chdir(main_dir)

# Echo the working directory as the cell output to confirm the switch.
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive/Graduate Project'

In [2]:
import pandas as pd

# Folder that holds every intermediate dataset file read below.
dataset_dir = os.path.join(main_dir, 'dataset')

# House-attribute and word-count tables: four tables per (city, single) pair.
keys_1 = [
    f'{table}_{city}_{single}'
    for city in ('CH', 'NY', 'LA')
    for single in (0, 1)
    for table in ('freq', 'binary', 'features', 'dummies')
]
ha_freq_binary_path = os.path.join(dataset_dir, '3. ha_freq_binary.h5')
X_combined_dict = {}
for key in keys_1:
    X_combined_dict[key] = pd.read_hdf(ha_freq_binary_path, key=key)

# TF-IDF tables: keyed by city and 'SF' / 'CT'.
keys_2 = [f'{city}_{kind}' for city in ('CH', 'NY', 'LA') for kind in ('SF', 'CT')]
tfidf_path = os.path.join(dataset_dir, '3. tfidf.h5')
tfidf_dict = {}
for key in keys_2:
    tfidf_dict[key] = pd.read_hdf(tfidf_path, key=key)

# Llama-extracted tables: 'freq' and 'binary' variants per (city, single) pair.
keys_3 = [
    f'{table}_{city}_{single}'
    for table in ('freq', 'binary')
    for city in ('CH', 'NY', 'LA')
    for single in (0, 1)
]
llama_path = os.path.join(dataset_dir, '6. llama_extracted.h5')
llama_dict = {}
for key in keys_3:
    llama_dict[key] = pd.read_hdf(llama_path, key=key)

In [3]:
# Every embedding source ships two CSVs in the dataset folder:
# an '*_embedding.csv' file and an '*_pca.csv' file.
embedding_dir = os.path.join(main_dir, 'dataset')

# w2v
w2v_emb = pd.read_csv(os.path.join(embedding_dir, '4. w2v_embedding.csv'))
w2v_pca = pd.read_csv(os.path.join(embedding_dir, '4. w2v_pca.csv'))

# bert
bert_emb = pd.read_csv(os.path.join(embedding_dir, '4. bert_embedding.csv'))
bert_pca = pd.read_csv(os.path.join(embedding_dir, '4. bert_pca.csv'))

# stf
stf_emb = pd.read_csv(os.path.join(embedding_dir, '5. stf_embedding.csv'))
stf_pca = pd.read_csv(os.path.join(embedding_dir, '5. stf_pca.csv'))

# gpt
gpt_emb = pd.read_csv(os.path.join(embedding_dir, '5. gpt_embedding.csv'))
gpt_pca = pd.read_csv(os.path.join(embedding_dir, '5. gpt_pca.csv'))

# Algorithm

In [4]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


def calculate_metrics(y_true, y_pred, n_features):
    """Compute R-squared, adjusted R-squared and RMSE for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    n_features : int
        Number of predictors used by the model, not counting an intercept.

    Returns
    -------
    tuple
        ``(r2, adj_r2, rmse)``. ``adj_r2`` is ``nan`` when
        ``len(y_true) - n_features - 1 <= 0``, because the adjustment is
        undefined without positive residual degrees of freedom.
    """
    n_samples = len(y_true)
    r2 = r2_score(y_true, y_pred)

    # Residual degrees of freedom. With zero the original formula divided by
    # zero; with a negative value it produced a sign-flipped, meaningless score.
    dof = n_samples - n_features - 1
    if dof > 0:
        adj_r2 = 1 - ((1 - r2) * (n_samples - 1) / dof)
    else:
        adj_r2 = float('nan')

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return r2, adj_r2, rmse

## OLS

In [5]:
from sklearn.model_selection import KFold
import statsmodels.api as sm


def fit_ols(df, city, single, target_var):
    """Fit an OLS regression on the first 80% of rows and report test metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``'zpid'`` column, the ``target_var`` column and the
        predictor columns (everything else).
    city : str
        City label, used only in the printed header.
    single : int
        ``0`` prints the 'Single Family' header; any other value prints
        'Condo/Townhouse'.
    target_var : str
        Name of the column to predict.

    Returns
    -------
    None
        R-squared, adjusted R-squared and RMSE on the held-out rows are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]
    # Count predictors before the intercept column is appended.
    n_features = X.shape[1]

    # Train-test split (80-20), taken in row order.
    # NOTE(review): fit_rf / fit_xgb / fit_dnn use a shuffled split with
    # random_state=42, so OLS is scored on different rows - confirm this is intended.
    split_index = int(len(X) * 0.8)
    X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
    y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

    # Add the intercept column to both splits. The default has_constant='skip'
    # silently omits it when a split already contains a constant non-zero column
    # (e.g. a dummy that is all 1 in the test rows), which would leave train and
    # test with different columns. 'add' always appends it.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    # Model
    model = sm.OLS(y_train, X_train).fit()

    # Testing Metrics
    test_predictions = model.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test, test_predictions, n_features)

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## Random Forest

In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np


def fit_rf(df, city, single, target_var):
    """Tune and evaluate a random forest regressor on an 80/20 split.

    Hyperparameters are selected with 5-fold grid search on the training rows
    only, so the held-out rows never influence model selection.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Table containing a ``'zpid'`` column, the ``target_var`` column and the
        predictor columns (everything else).
    city : str
        City label, used only in the printed header.
    single : int
        ``0`` prints the 'Single Family' header; any other value prints
        'Condo/Townhouse'.
    target_var : str
        Name of the column to predict.

    Returns
    -------
    None
        The best hyperparameters and the test-set metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) comes first: tuning on the full data would let
    # the test rows leak into hyperparameter selection.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model (seeded so repeated runs give the same forest)
    rf = RandomForestRegressor(random_state=42)

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'bootstrap': [True, False],
        'max_depth': [None, 10, 20],
        'max_features': [None, 'sqrt'],
        'n_estimators': [32, 64, 100, 500]
    }
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training set, so no extra fit call is needed.
    best_rf = grid_search.best_estimator_
    print(f'\nBest Hyperparameters: {grid_search.best_params_}')

    # Compute metrics on the held-out rows
    y_test_pred = best_rf.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## XGBoost

In [7]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np


def fit_xgb(df, city, single, target_var):
    """Tune and evaluate an XGBoost regressor on an 80/20 split.

    Hyperparameters are selected with 5-fold grid search on the training rows
    only, so the held-out rows never influence model selection.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Table containing a ``'zpid'`` column, the ``target_var`` column and the
        predictor columns (everything else).
    city : str
        City label, used only in the printed header.
    single : int
        ``0`` prints the 'Single Family' header; any other value prints
        'Condo/Townhouse'.
    target_var : str
        Name of the column to predict.

    Returns
    -------
    None
        The best hyperparameters and the test-set metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) comes first: tuning on the full data would let
    # the test rows leak into hyperparameter selection.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model
    xgb = XGBRegressor()

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'n_estimators': [500, 1000],
        'max_depth': [4, 6],
        'learning_rate': [0.01, 0.02],
        'subsample': [0.8, 1.0],
    }
    grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    # With the default refit=True, best_estimator_ is already fitted on the
    # whole training set, so no extra fit call is needed.
    best_xgb = grid_search.best_estimator_
    print(f'\nBest Hyperparameters: {grid_search.best_params_}')

    # Compute metrics on the held-out rows
    y_test_pred = best_xgb.predict(X_test)
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

## DNN

In [8]:
!pip install scikeras --quiet

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor

def create_dnn_model(input_dim, nodes, activation, dropout_rate, learning_rate,
                     output_activation='linear'):
    """Build and compile a three-hidden-layer dense network for regression.

    Layer widths are ``nodes``, ``nodes * 2`` and ``nodes``; dropout follows the
    first hidden layer at ``dropout_rate`` and the second at ``dropout_rate / 2``.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    nodes : int
        Base width of the hidden layers.
    activation : str
        Activation used by every hidden layer.
    dropout_rate : float
        Dropout rate after the first hidden layer.
    learning_rate : float
        Learning rate for the Adam optimizer.
    output_activation : str, default 'linear'
        Activation of the single output unit. The previous hard-coded
        ``'sigmoid'`` limited predictions to (0, 1), which cannot represent an
        unscaled regression target such as ln(PRICE); pass ``'sigmoid'``
        explicitly only for targets scaled into that range.

    Returns
    -------
    tensorflow.keras.Model
        The compiled model, using mean squared error as the loss.
    """
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(nodes, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(nodes*2, activation=activation))
    model.add(Dropout(dropout_rate/2))
    model.add(Dense(nodes, activation=activation))
    # Unbounded output by default so the network can reach the target's range.
    model.add(Dense(1, activation=output_activation))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    return model

def fit_dnn(df, city, single, target_var):
    """Tune and evaluate the dense network from ``create_dnn_model`` on an 80/20 split.

    Hyperparameters are selected with 5-fold grid search on the training rows
    only; a fresh model is then built with the winning settings, trained on the
    training rows and scored on the held-out rows.

    Parameters
    ----------
    df : pandas.DataFrame or DataFrame-convertible
        Table containing a ``'zpid'`` column, the ``target_var`` column and the
        predictor columns (everything else).
    city : str
        City label, used only in the printed header.
    single : int
        ``0`` prints the 'Single Family' header; any other value prints
        'Condo/Townhouse'.
    target_var : str
        Name of the column to predict.

    Returns
    -------
    None
        The best hyperparameters and the test-set metrics are printed.
    """
    if single == 0:
        print(f"\n[Results for {city} - 'Single Family']")
    else:
        print(f"\n[Results for {city} - 'Condo/Townhouse']")

    df = pd.DataFrame(df)
    X = df.drop(columns=['zpid', target_var])
    y = df[target_var]

    # Train-test split (80-20) comes first: tuning on the full data would let
    # the test rows leak into hyperparameter selection.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model
    model = KerasRegressor(model=create_dnn_model, input_dim=X_train.shape[1], verbose=0)

    # Hyperparameter tuning with GridSearchCV
    param_grid = {
        'model__nodes': [50, 100],
        'model__dropout_rate': [0.1, 0.2],
        'model__activation': ['relu', 'sigmoid'],
        'model__learning_rate': [0.0001, 0.001, 0.01],
        'epochs': [50, 75, 100],
        'batch_size': [10, 20]
    }

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f'\nBest Hyperparameters: {best_params}')

    # Rebuild the network with every tuned setting. activation and
    # learning_rate are required by create_dnn_model and were previously
    # omitted, which raised a TypeError here.
    best_model = create_dnn_model(input_dim=X_train.shape[1],
                                  nodes=best_params['model__nodes'],
                                  activation=best_params['model__activation'],
                                  dropout_rate=best_params['model__dropout_rate'],
                                  learning_rate=best_params['model__learning_rate'])

    best_model.fit(X_train, y_train,
                   epochs=best_params['epochs'],
                   batch_size=best_params['batch_size'],
                   verbose=0)

    # Flatten the model output to 1-D so it lines up with y_test.
    y_test_pred = best_model.predict(X_test).flatten()

    # Compute metrics
    r2_test, adj_r2_test, rmse_test = calculate_metrics(y_test.to_numpy(), y_test_pred, X_test.shape[1])

    # Output results
    print("\nModel Testing Metrics:")
    print(f"R-squared: {r2_test:.4f}")
    print(f"Adjusted R-squared: {adj_r2_test:.4f}")
    print(f"RMSE: {rmse_test:.4f}")

# Input Feature

In [10]:
from datetime import datetime
import pytz
import tensorflow as tf

In [11]:
# (label, fit function) pairs; the label is what the experiment loops print.
models_cpu = [('OLS', fit_ols), ('RF', fit_rf), ('XGB', fit_xgb)]

# Kept as a separate list from the models above.
models_gpu = [('DNN', fit_dnn)]

## House Attribute

### Basic

In [12]:
# Example house-attribute table (CH, single == 0): the features table joined
# to its dummies table on zpid. Also reused by the word-count previews below.
house_attribute_basic = X_combined_dict['features_CH_0'].merge(
    X_combined_dict['dummies_CH_0'],
    on='zpid', how='inner'
)
house_attribute_basic

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,CH_Northwest,CH_South,CH_Southwest,CH_West
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,0,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,0,1,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,1,0,0,0
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,0,1,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,0,1,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,0,1,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,0,0,0,1
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,0,0,0,0


In [None]:
# llama: None
# Inputs: house attributes only.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute: features joined to dummies on zpid
        df = X_combined_dict[f'features_{city}_{single}'].merge(
            X_combined_dict[f'dummies_{city}_{single}'],
            on='zpid', how='inner'
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 10:32:36

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7349
Adjusted R-squared: 0.7239
RMSE: 0.3327

############## OLS ##############
Current Time (KST): 2024-10-21 10:32:36

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7526
Adjusted R-squared: 0.7440
RMSE: 0.3078

############## OLS ##############
Current Time (KST): 2024-10-21 10:32:36

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6471
Adjusted R-squared: 0.6209
RMSE: 0.2193

############## OLS ##############
Current Time (KST): 2024-10-21 10:32:36

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7764
Adjusted R-squared: 0.7677
RMSE: 0.3461

############## OLS ##############
Current Time (KST): 2024-10-21 10:32:36

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7721
Adjusted R-squared: 0.7661
RMSE: 0.2662

############## OLS ###########

In [None]:
# llama: Freq
# Inputs: house attributes + Llama-extracted 'freq' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 11:19:33

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7348
Adjusted R-squared: 0.7057
RMSE: 0.3325

############## OLS ##############
Current Time (KST): 2024-10-21 11:19:33

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7547
Adjusted R-squared: 0.7324
RMSE: 0.3067

############## OLS ##############
Current Time (KST): 2024-10-21 11:19:33

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6411
Adjusted R-squared: 0.5739
RMSE: 0.2209

############## OLS ##############
Current Time (KST): 2024-10-21 11:19:33

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7769
Adjusted R-squared: 0.7569
RMSE: 0.3467

############## OLS ##############
Current Time (KST): 2024-10-21 11:19:33

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7799
Adjusted R-squared: 0.7656
RMSE: 0.2643

############## OLS ###########

In [None]:
# llama: Binary
# Inputs: house attributes + Llama-extracted 'binary' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 12:18:05

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7448
Adjusted R-squared: 0.7167
RMSE: 0.3262

############## OLS ##############
Current Time (KST): 2024-10-21 12:18:05

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7519
Adjusted R-squared: 0.7295
RMSE: 0.3084

############## OLS ##############
Current Time (KST): 2024-10-21 12:18:05

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6517
Adjusted R-squared: 0.5865
RMSE: 0.2176

############## OLS ##############
Current Time (KST): 2024-10-21 12:18:05

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7756
Adjusted R-squared: 0.7555
RMSE: 0.3477

############## OLS ##############
Current Time (KST): 2024-10-21 12:18:05

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7764
Adjusted R-squared: 0.7619
RMSE: 0.2664

############## OLS ###########

## Word Count

### TF-IDF

In [None]:
# Preview: the example house-attribute table joined to the CH_SF TF-IDF table on zpid.
word_count_tfidf = house_attribute_basic.merge(
    tfidf_dict['CH_SF'],
    on='zpid', how='inner'
)
word_count_tfidf

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,park,hardwood,great,dining,perfect,beautiful,updated,window,main,second
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.040011,0.034306,0.044187,0.035634,0.000000,0.000000,0.045741,0.000000,0.041493,0.042880
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.052483,0.089997,0.000000,0.093481,0.107800,0.056245,0.059998,0.051851,0.054426,0.000000
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.040407,0.000000,0.000000,0.035986,0.082997,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.039820,0.000000,0.000000,0.000000,0.122688,0.042675,0.045522,0.000000,0.000000,0.000000
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.131050,0.000000,0.096485,0.000000,0.044863,0.046815,0.000000,0.000000,0.000000,0.093631
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.000000,0.000000,0.000000,0.000000,0.089709,0.046806,0.049928,0.000000,0.045292,0.000000


In [None]:
# llama: None
# Inputs: house attributes + TF-IDF table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # TF-IDF tables are keyed by 'SF' / 'CT' instead of 0 / 1
        if single == 0:
            type_ = 'SF'
        else:
            type_ = 'CT'
        # House Attribute, then TF-IDF - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 13:09:24

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7573
Adjusted R-squared: 0.7157
RMSE: 0.3184

############## OLS ##############
Current Time (KST): 2024-10-21 13:09:24

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7683
Adjusted R-squared: 0.7361
RMSE: 0.2978

############## OLS ##############
Current Time (KST): 2024-10-21 13:09:24

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6303
Adjusted R-squared: 0.5220
RMSE: 0.2245

############## OLS ##############
Current Time (KST): 2024-10-21 13:09:24

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7892
Adjusted R-squared: 0.7611
RMSE: 0.3361

############## OLS ##############
Current Time (KST): 2024-10-21 13:09:24

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7880
Adjusted R-squared: 0.7673
RMSE: 0.2568

############## OLS ###########

In [None]:
# llama: Freq
# Inputs: house attributes + TF-IDF table + Llama-extracted 'freq' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # TF-IDF tables are keyed by 'SF' / 'CT' instead of 0 / 1
        if single == 0:
            type_ = 'SF'
        else:
            type_ = 'CT'
        # House Attribute, TF-IDF, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 15:53:03

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7510
Adjusted R-squared: 0.6870
RMSE: 0.3222

############## OLS ##############
Current Time (KST): 2024-10-21 15:53:03

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7689
Adjusted R-squared: 0.7210
RMSE: 0.2976

############## OLS ##############
Current Time (KST): 2024-10-21 15:53:03

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6191
Adjusted R-squared: 0.4437
RMSE: 0.2275

############## OLS ##############
Current Time (KST): 2024-10-21 15:53:03

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7914
Adjusted R-squared: 0.7511
RMSE: 0.3352

############## OLS ##############
Current Time (KST): 2024-10-21 15:53:03

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7905
Adjusted R-squared: 0.7609
RMSE: 0.2578

############## OLS ###########

In [None]:
# llama: Binary
# Inputs: house attributes + TF-IDF table + Llama-extracted 'binary' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # TF-IDF tables are keyed by 'SF' / 'CT' instead of 0 / 1
        if single == 0:
            type_ = 'SF'
        else:
            type_ = 'CT'
        # House Attribute, TF-IDF, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(tfidf_dict[f'{city}_{type_}'], on='zpid', how='inner')
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-21 18:58:26

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7570
Adjusted R-squared: 0.6945
RMSE: 0.3183

############## OLS ##############
Current Time (KST): 2024-10-21 18:58:26

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7669
Adjusted R-squared: 0.7186
RMSE: 0.2989

############## OLS ##############
Current Time (KST): 2024-10-21 18:58:26

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6277
Adjusted R-squared: 0.4562
RMSE: 0.2250

############## OLS ##############
Current Time (KST): 2024-10-21 18:58:27

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7900
Adjusted R-squared: 0.7493
RMSE: 0.3364

############## OLS ##############
Current Time (KST): 2024-10-21 18:58:27

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7878
Adjusted R-squared: 0.7577
RMSE: 0.2595

############## OLS ###########

### Frequency

In [None]:
# Preview: the example house-attribute table joined to the freq_CH_0 table on zpid.
word_count_freq = house_attribute_basic.merge(
    X_combined_dict['freq_CH_0'],
    on='zpid', how='inner'
)
word_count_freq

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,offer,park,perfect,property,room,second,space,spacious,storage,window
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,4,1,0,1,3,1,3,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0,0,0,0,0,0,0,0,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,1,1,2,0,3,0,1,1,2,1
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,1,0,0,2,1,0,1,0,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,3,1,2,2,2,0,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0,0,0,2,0,0,0,0,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,3,1,3,0,1,0,3,2,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0,3,1,0,2,2,2,0,0,0
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,1,0,2,2,2,0,3,1,3,0


In [None]:
# llama: None
# Inputs: house attributes + word 'freq' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute, then Frequency - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 00:10:01

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7528
Adjusted R-squared: 0.7104
RMSE: 0.3213

############## OLS ##############
Current Time (KST): 2024-10-22 00:10:01

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7670
Adjusted R-squared: 0.7346
RMSE: 0.2987

############## OLS ##############
Current Time (KST): 2024-10-22 00:10:01

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6252
Adjusted R-squared: 0.5154
RMSE: 0.2260

############## OLS ##############
Current Time (KST): 2024-10-22 00:10:01

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7895
Adjusted R-squared: 0.7614
RMSE: 0.3358

############## OLS ##############
Current Time (KST): 2024-10-22 00:10:01

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7829
Adjusted R-squared: 0.7618
RMSE: 0.2598

############## OLS ###########

In [None]:
# llama: Freq
# Inputs: house attributes + word 'freq' table + Llama-extracted 'freq' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute, Frequency, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 01:19:48

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7467
Adjusted R-squared: 0.6815
RMSE: 0.3250

############## OLS ##############
Current Time (KST): 2024-10-22 01:19:48

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7674
Adjusted R-squared: 0.7191
RMSE: 0.2986

############## OLS ##############
Current Time (KST): 2024-10-22 01:19:48

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6157
Adjusted R-squared: 0.4387
RMSE: 0.2285

############## OLS ##############
Current Time (KST): 2024-10-22 01:19:48

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7933
Adjusted R-squared: 0.7534
RMSE: 0.3336

############## OLS ##############
Current Time (KST): 2024-10-22 01:19:48

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7864
Adjusted R-squared: 0.7562
RMSE: 0.2603

############## OLS ###########

In [None]:
# llama: Binary
# Inputs: house attributes + word 'freq' table + Llama-extracted 'binary' table.

# Every (city, single) pair, in the order the results are reported.
segments = [(c, s) for c in ['CH', 'NY', 'LA'] for s in [0, 1]]

for model_name, model_function in models_cpu:
    for city, single in segments:
        # Banner: model name and the current time in Seoul
        print(f'\n############## {model_name} ##############')
        kst = datetime.now(pytz.timezone('Asia/Seoul'))
        print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
        # House Attribute, Frequency, then Llama Extract - each joined on zpid
        df = (
            X_combined_dict[f'features_{city}_{single}']
            .merge(X_combined_dict[f'dummies_{city}_{single}'], on='zpid', how='inner')
            .merge(X_combined_dict[f'freq_{city}_{single}'], on='zpid', how='inner')
            .merge(llama_dict[f'binary_{city}_{single}'], on='zpid', how='inner')
        )
        # Fit with ln(PRICE) as the target
        model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 02:53:07

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7548
Adjusted R-squared: 0.6917
RMSE: 0.3198

############## OLS ##############
Current Time (KST): 2024-10-22 02:53:07

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7667
Adjusted R-squared: 0.7184
RMSE: 0.2990

############## OLS ##############
Current Time (KST): 2024-10-22 02:53:07

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6309
Adjusted R-squared: 0.4610
RMSE: 0.2240

############## OLS ##############
Current Time (KST): 2024-10-22 02:53:07

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7933
Adjusted R-squared: 0.7534
RMSE: 0.3336

############## OLS ##############
Current Time (KST): 2024-10-22 02:53:07

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7829
Adjusted R-squared: 0.7521
RMSE: 0.2625

############## OLS ###########

### Binary (T/F)

In [None]:
# Preview: the example house-attribute table joined to the binary_CH_0 table on zpid.
word_count_binary = house_attribute_basic.merge(
    X_combined_dict['binary_CH_0'],
    on='zpid', how='inner'
)
word_count_binary

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,offer,park,perfect,property,room,second,space,spacious,storage,window
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,1,1,0,1,1,1,1,1,0,0
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0,0,0,0,0,0,0,0,0,0
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,1,1,1,0,1,0,1,1,1,1
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,1,0,0,1,1,0,1,0,0,0
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,1,1,1,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0,0,0,1,0,0,0,0,0,0
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,1,1,1,0,1,0,1,1,0,0
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0,1,1,0,1,1,1,0,0,0
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,1,0,1,1,1,0,1,1,1,0


In [None]:
# llama: None
# Binary word indicators only (no Llama-extracted columns).
# For every (model, city, single) combination: print a banner with the
# current KST time, inner-join features + dummies + binary frames on 'zpid',
# then call the model routine with 'ln(PRICE)' as its fourth argument
# (presumably the target column — confirm against the model functions).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes -> binary word indicators, all joined on zpid
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .merge(X_combined_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 09:06:13

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7749
Adjusted R-squared: 0.7363
RMSE: 0.3066

############## OLS ##############
Current Time (KST): 2024-10-22 09:06:13

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7802
Adjusted R-squared: 0.7496
RMSE: 0.2901

############## OLS ##############
Current Time (KST): 2024-10-22 09:06:13

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6267
Adjusted R-squared: 0.5173
RMSE: 0.2255

############## OLS ##############
Current Time (KST): 2024-10-22 09:06:13

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7861
Adjusted R-squared: 0.7576
RMSE: 0.3385

############## OLS ##############
Current Time (KST): 2024-10-22 09:06:14

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7802
Adjusted R-squared: 0.7587
RMSE: 0.2615

############## OLS ###########

In [None]:
# llama: Freq
# Binary word indicators plus the 'freq_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, inner-join the four frames on 'zpid', then call the
# model routine with 'ln(PRICE)' as its fourth argument
# (presumably the target column — confirm against the model functions).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes -> binary word indicators -> Llama extract (freq)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .merge(X_combined_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
                .merge(llama_dict[f'freq_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 10:06:11

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7664
Adjusted R-squared: 0.7063
RMSE: 0.3121

############## OLS ##############
Current Time (KST): 2024-10-22 10:06:12

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7804
Adjusted R-squared: 0.7349
RMSE: 0.2901

############## OLS ##############
Current Time (KST): 2024-10-22 10:06:12

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6190
Adjusted R-squared: 0.4436
RMSE: 0.2275

############## OLS ##############
Current Time (KST): 2024-10-22 10:06:12

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7910
Adjusted R-squared: 0.7506
RMSE: 0.3355

############## OLS ##############
Current Time (KST): 2024-10-22 10:06:12

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7836
Adjusted R-squared: 0.7530
RMSE: 0.2620

############## OLS ###########

In [None]:
# llama: Binary
# Binary word indicators plus the 'binary_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, inner-join the four frames on 'zpid', then call the
# model routine with 'ln(PRICE)' as its fourth argument
# (presumably the target column — confirm against the model functions).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes -> binary word indicators -> Llama extract (binary)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .merge(X_combined_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
                .merge(llama_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 11:29:37

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7737
Adjusted R-squared: 0.7154
RMSE: 0.3072

############## OLS ##############
Current Time (KST): 2024-10-22 11:29:37

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7809
Adjusted R-squared: 0.7354
RMSE: 0.2898

############## OLS ##############
Current Time (KST): 2024-10-22 11:29:37

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6255
Adjusted R-squared: 0.4530
RMSE: 0.2256

############## OLS ##############
Current Time (KST): 2024-10-22 11:29:37

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7877
Adjusted R-squared: 0.7467
RMSE: 0.3381

############## OLS ##############
Current Time (KST): 2024-10-22 11:29:37

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.7800
Adjusted R-squared: 0.7489
RMSE: 0.2642

############## OLS ###########

## Word Embedding

### Word2Vec

In [13]:
# Cast the join key to int IN PLACE on house_attribute_basic, then preview the
# inner join with the raw Word2Vec embedding frame.
# NOTE(review): the cast mutates a frame shared with other cells; later
# previews that merge house_attribute_basic with the CSV-loaded frames appear
# to rely on it having run — confirm before changing it.
house_attribute_basic['zpid'] = house_attribute_basic['zpid'].astype(int)

word_embedding_w2v = house_attribute_basic.merge(
    w2v_emb,
    how='inner',
    on='zpid',
)
word_embedding_w2v

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,embed_246,embed_247,embed_248,embed_249,embed_250,embed_251,embed_252,embed_253,embed_254,embed_255
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.223464,-0.007206,-0.162735,-0.102017,-0.255791,-0.041559,-0.251558,0.012404,-0.272424,0.197628
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.183473,-0.027997,0.089133,-0.094464,-0.156036,0.259032,-0.106731,-0.028881,-0.244925,-0.213802
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.213268,-0.020160,-0.143695,-0.115935,-0.208955,-0.126121,-0.210337,0.065599,-0.239116,0.223258
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.089511,-0.163466,0.002611,-0.133744,-0.334398,0.209188,-0.059271,0.102027,-0.206558,0.008846
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.203364,-0.096827,-0.013891,-0.004618,-0.180039,0.130514,-0.172449,0.036378,-0.158418,-0.040934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.131782,-0.112113,0.044428,-0.099765,-0.282524,0.317623,-0.061352,0.023665,-0.231226,-0.136804
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.221391,0.013927,-0.062395,-0.081934,-0.166114,0.029040,-0.193462,0.014361,-0.242746,0.044690
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.177176,-0.060876,-0.138395,-0.142217,-0.302656,-0.023339,-0.187376,0.074042,-0.262774,0.232845
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.162928,-0.063157,-0.047558,-0.115524,-0.230991,0.053731,-0.178668,0.065665,-0.189680,0.092720


In [14]:
# Preview frame: house_attribute_basic inner-joined on 'zpid' with the
# PCA-reduced Word2Vec frame.
word_embedding_w2v_pca = house_attribute_basic.merge(
    w2v_pca,
    how='inner',
    on='zpid',
)
word_embedding_w2v_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.012387,-0.000549,0.004977,0.001819,0.002223,-0.004770,0.007689,0.000074,-0.006769,-0.000810
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.007043,0.006386,-0.021022,-0.010440,-0.007260,0.007682,-0.008644,-0.005621,0.008528,-0.000807
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.017369,0.010176,-0.006706,-0.002855,-0.002011,0.000952,-0.005180,-0.006513,0.001675,0.005536
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.001276,-0.004725,0.017497,0.021998,0.022062,-0.002967,-0.013703,0.017604,-0.001542,-0.004684
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.009105,0.006408,0.003389,-0.000759,-0.003152,0.000812,-0.000406,0.002651,0.004350,0.000274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.009106,-0.003481,0.005066,-0.010062,-0.000403,-0.008653,-0.016338,0.010330,0.004095,-0.012030
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.010199,0.005891,-0.007049,0.005434,0.000235,-0.010645,-0.003536,0.001607,-0.003288,0.000053
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.000281,-0.001413,-0.016239,0.003292,0.001671,0.000450,0.002700,0.005777,-0.005455,-0.002098
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.006139,0.008565,0.007564,0.008352,0.000864,-0.005409,0.006516,0.005086,0.001877,0.002017


In [None]:
# llama: None
# Word2Vec (PCA) columns only (no Llama-extracted columns).
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes, then zpid cast to int before joining w2v_pca
            # (presumably w2v_pca carries integer zpid from read_csv — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(w2v_pca, how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 12:42:34

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7979
Adjusted R-squared: 0.7633
RMSE: 0.2905

############## OLS ##############
Current Time (KST): 2024-10-22 12:42:34

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8040
Adjusted R-squared: 0.7767
RMSE: 0.2739

############## OLS ##############
Current Time (KST): 2024-10-22 12:42:34

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6516
Adjusted R-squared: 0.5495
RMSE: 0.2179

############## OLS ##############
Current Time (KST): 2024-10-22 12:42:34

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8180
Adjusted R-squared: 0.7938
RMSE: 0.3122

############## OLS ##############
Current Time (KST): 2024-10-22 12:42:34

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8142
Adjusted R-squared: 0.7961
RMSE: 0.2404

############## OLS ###########

In [26]:
# llama: Freq
# Word2Vec (PCA) columns plus the 'freq_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # zpid is cast to int for the w2v_pca join and back to str for the
            # llama_dict join (presumably int keys in the CSV frame and string
            # keys in the HDF frames — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(w2v_pca, how='inner', on='zpid')
                .astype({'zpid': str})
                .merge(llama_dict[f'freq_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 16:35:49

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7920
Adjusted R-squared: 0.7385
RMSE: 0.2945

############## OLS ##############
Current Time (KST): 2024-10-22 16:35:49

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8016
Adjusted R-squared: 0.7604
RMSE: 0.2758

############## OLS ##############
Current Time (KST): 2024-10-22 16:35:49

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6437
Adjusted R-squared: 0.4797
RMSE: 0.2201

############## OLS ##############
Current Time (KST): 2024-10-22 16:35:49

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8185
Adjusted R-squared: 0.7834
RMSE: 0.3127

############## OLS ##############
Current Time (KST): 2024-10-22 16:35:50

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8145
Adjusted R-squared: 0.7883
RMSE: 0.2426

############## OLS ###########

In [27]:
# llama: Binary
# Word2Vec (PCA) columns plus the 'binary_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # zpid is cast to int for the w2v_pca join and back to str for the
            # llama_dict join (presumably int keys in the CSV frame and string
            # keys in the HDF frames — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(w2v_pca, how='inner', on='zpid')
                .astype({'zpid': str})
                .merge(llama_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-22 20:43:55

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7949
Adjusted R-squared: 0.7421
RMSE: 0.2924

############## OLS ##############
Current Time (KST): 2024-10-22 20:43:55

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8034
Adjusted R-squared: 0.7626
RMSE: 0.2745

############## OLS ##############
Current Time (KST): 2024-10-22 20:43:55

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6473
Adjusted R-squared: 0.4849
RMSE: 0.2189

############## OLS ##############
Current Time (KST): 2024-10-22 20:43:55

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8170
Adjusted R-squared: 0.7816
RMSE: 0.3139

############## OLS ##############
Current Time (KST): 2024-10-22 20:43:55

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8126
Adjusted R-squared: 0.7861
RMSE: 0.2439

############## OLS ###########

### BERT

In [15]:
# Preview frame: house_attribute_basic inner-joined on 'zpid' with the raw
# BERT embedding frame.
word_embedding_bert = house_attribute_basic.merge(
    bert_emb,
    how='inner',
    on='zpid',
)
word_embedding_bert

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.002209,-0.066249,0.021836,0.003349,-0.158957,-0.161074,-0.080721,-0.021704,0.088047,-0.194490
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.110557,-0.127086,0.028052,-0.063860,-0.154856,-0.082603,-0.231665,0.029396,-0.021418,-0.165803
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.195316,-0.083266,-0.072137,0.110479,-0.169599,-0.422717,0.066835,0.048301,0.037172,-0.148788
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.294773,-0.068021,0.086495,-0.140743,-0.201618,-0.158005,-0.034060,0.108299,-0.008776,-0.069019
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.095778,-0.153731,0.040213,0.025163,-0.375998,-0.533569,-0.005384,-0.109406,-0.059209,-0.185234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.345192,-0.050782,0.051460,-0.269730,-0.079036,-0.170702,-0.007881,0.125350,-0.039494,-0.024820
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.241844,-0.181451,0.059410,-0.060932,-0.257907,-0.303960,-0.014816,0.020160,0.017476,-0.196169
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.141273,-0.121933,-0.098506,-0.013390,-0.206108,-0.273568,-0.033298,0.070359,0.039624,-0.188428
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.144971,-0.167624,0.057370,-0.098211,-0.287077,-0.235150,0.185234,-0.016989,0.015423,-0.056939


In [16]:
# Preview frame: house_attribute_basic inner-joined on 'zpid' with the
# PCA-reduced BERT frame.
word_embedding_bert_pca = house_attribute_basic.merge(
    bert_pca,
    how='inner',
    on='zpid',
)
word_embedding_bert_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.074151,0.302540,-0.297929,-0.219099,-0.074798,-0.013947,0.019658,0.241600,0.000217,0.173076
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,-0.057093,-0.251500,0.072636,-0.003742,0.299997,-0.155731,0.103498,-0.163015,0.224072,0.433368
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.180451,-0.125636,-0.327241,0.090504,-0.092460,0.099371,0.127525,-0.074272,0.203362,0.071731
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,-0.057523,0.155976,0.293204,0.036025,0.807017,-0.016831,0.208986,0.036916,-0.378630,0.345290
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.100179,-0.047546,0.354586,-0.002388,-0.140394,-0.037001,-0.225810,-0.072347,-0.053413,0.135207
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.060178,0.171788,0.032892,0.253508,0.482016,-0.016670,0.441367,0.030835,0.032977,0.269307
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.250856,0.156176,0.074924,-0.049251,-0.024537,-0.292212,0.014764,0.048038,0.109006,0.033126
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.220507,0.220416,-0.074999,0.003914,-0.014332,0.142471,-0.117733,-0.120593,-0.038260,0.091872
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.026316,0.042256,0.011237,0.204519,0.303453,-0.038491,-0.028662,0.222102,-0.232738,-0.193949


In [19]:
# llama: None
# BERT (PCA) columns only (no Llama-extracted columns).
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes, then zpid cast to int before joining bert_pca
            # (presumably bert_pca carries integer zpid from read_csv — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(bert_pca, how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-23 09:17:29

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7767
Adjusted R-squared: 0.7385
RMSE: 0.3053

############## OLS ##############
Current Time (KST): 2024-10-23 09:17:29

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7772
Adjusted R-squared: 0.7462
RMSE: 0.2921

############## OLS ##############
Current Time (KST): 2024-10-23 09:17:29

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6847
Adjusted R-squared: 0.5924
RMSE: 0.2073

############## OLS ##############
Current Time (KST): 2024-10-23 09:17:29

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8060
Adjusted R-squared: 0.7802
RMSE: 0.3224

############## OLS ##############
Current Time (KST): 2024-10-23 09:17:29

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8132
Adjusted R-squared: 0.7950
RMSE: 0.2410

############## OLS ###########

In [20]:
# llama: Freq
# BERT (PCA) columns plus the 'freq_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # zpid is cast to int for the bert_pca join and back to str for the
            # llama_dict join (presumably int keys in the CSV frame and string
            # keys in the HDF frames — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(bert_pca, how='inner', on='zpid')
                .astype({'zpid': str})
                .merge(llama_dict[f'freq_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-23 13:05:25

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7755
Adjusted R-squared: 0.7177
RMSE: 0.3060

############## OLS ##############
Current Time (KST): 2024-10-23 13:05:25

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7748
Adjusted R-squared: 0.7281
RMSE: 0.2938

############## OLS ##############
Current Time (KST): 2024-10-23 13:05:25

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6696
Adjusted R-squared: 0.5175
RMSE: 0.2119

############## OLS ##############
Current Time (KST): 2024-10-23 13:05:25

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8068
Adjusted R-squared: 0.7694
RMSE: 0.3226

############## OLS ##############
Current Time (KST): 2024-10-23 13:05:25

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8108
Adjusted R-squared: 0.7840
RMSE: 0.2450

############## OLS ###########

In [21]:
# llama: Binary
# BERT (PCA) columns plus the 'binary_*' frames from llama_dict.
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # zpid is cast to int for the bert_pca join and back to str for the
            # llama_dict join (presumably int keys in the CSV frame and string
            # keys in the HDF frames — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(bert_pca, how='inner', on='zpid')
                .astype({'zpid': str})
                .merge(llama_dict[f'binary_{city}_{single}'], how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-23 17:14:07

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7779
Adjusted R-squared: 0.7207
RMSE: 0.3043

############## OLS ##############
Current Time (KST): 2024-10-23 17:14:07

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7743
Adjusted R-squared: 0.7275
RMSE: 0.2942

############## OLS ##############
Current Time (KST): 2024-10-23 17:14:07

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6788
Adjusted R-squared: 0.5308
RMSE: 0.2089

############## OLS ##############
Current Time (KST): 2024-10-23 17:14:07

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8048
Adjusted R-squared: 0.7670
RMSE: 0.3243

############## OLS ##############
Current Time (KST): 2024-10-23 17:14:07

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8096
Adjusted R-squared: 0.7826
RMSE: 0.2458

############## OLS ###########

## Text Embedding

### Sentence TF

In [22]:
# Preview frame: house_attribute_basic inner-joined on 'zpid' with the raw
# stf_emb embedding frame.
text_embedding_stf = house_attribute_basic.merge(
    stf_emb,
    how='inner',
    on='zpid',
)
text_embedding_stf

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_374,emb_375,emb_376,emb_377,emb_378,emb_379,emb_380,emb_381,emb_382,emb_383
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,0.014884,0.030813,-0.072282,-0.041126,-0.004638,0.042128,0.069364,-0.022453,-0.150843,0.045339
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.098454,-0.018951,0.023943,0.022205,0.007497,0.083833,0.027238,-0.026806,-0.049773,0.020240
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,0.067548,-0.046245,-0.066852,0.022917,0.057328,0.096687,0.042444,0.023923,-0.088812,0.072610
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.042285,0.000283,-0.018360,0.064053,0.068314,0.040049,0.006282,-0.032294,-0.030286,0.005316
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,0.047137,0.013095,-0.091455,0.000339,0.050769,0.035730,-0.025520,0.042480,-0.059526,0.028172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.029160,0.039548,0.012395,-0.019703,-0.020704,-0.014466,-0.054416,0.015904,-0.073100,0.021451
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,0.061723,0.015087,-0.034785,-0.012646,0.005323,0.052792,0.082454,-0.029282,-0.079274,0.040100
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,0.050333,0.068426,-0.092724,-0.029884,0.001411,0.128324,0.086248,-0.030514,-0.113628,0.006726
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.044795,-0.010378,-0.025361,0.004221,0.015714,0.018236,0.082321,0.002339,-0.087777,0.018066


In [23]:
# Preview frame: house_attribute_basic inner-joined on 'zpid' with the
# PCA-reduced stf_pca frame.
text_embedding_stf_pca = house_attribute_basic.merge(
    stf_pca,
    how='inner',
    on='zpid',
)
text_embedding_stf_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.052005,0.004084,0.094228,-0.126335,-0.057718,-0.014378,0.041700,-0.037116,0.122407,0.014602
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,0.008417,-0.051992,-0.013960,-0.025140,0.034091,-0.079997,0.026623,0.075764,0.037496,0.048367
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.008463,0.063568,0.010244,-0.048827,0.011803,0.051097,0.167714,0.033750,-0.017540,-0.076804
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,0.074449,0.018354,-0.040208,0.078236,0.076124,0.030177,0.053090,-0.024083,-0.005804,0.002427
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.081950,-0.062769,0.198024,0.013443,0.006049,-0.035317,0.012005,-0.000749,0.057624,0.063310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,0.079875,-0.009341,-0.154927,-0.012613,0.074548,-0.061339,0.080912,-0.012805,0.066575,-0.039803
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.000482,0.068557,0.089715,-0.069765,0.009431,-0.159541,-0.088961,-0.052833,0.055982,0.022920
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.035491,0.005164,-0.040089,-0.097735,0.062097,0.035229,0.021420,-0.051228,0.095142,-0.016373
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.022669,-0.105762,-0.063176,0.030789,0.009812,0.016110,-0.105148,-0.089061,-0.048036,0.144044


In [24]:
# llama: None
# Sentence TF (PCA) columns only (no Llama-extracted columns).
# For every (model, city, single) combination: print a banner with the
# current KST time, build the joined frame, then call the model routine with
# 'ln(PRICE)' as its fourth argument (presumably the target column — confirm).

for model_name, model_function in models_cpu:
    for city in ('CH', 'NY', 'LA'):
        for single in (0, 1):
            # Banner and timestamp
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # House attributes, then zpid cast to int before joining stf_pca
            # (presumably stf_pca carries integer zpid from read_csv — confirm)
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(X_combined_dict[f'dummies_{city}_{single}'], how='inner', on='zpid')
                .astype({'zpid': int})
                .merge(stf_pca, how='inner', on='zpid')
            )
            # Fit
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-23 21:12:57

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7874
Adjusted R-squared: 0.7510
RMSE: 0.2980

############## OLS ##############
Current Time (KST): 2024-10-23 21:12:57

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7823
Adjusted R-squared: 0.7520
RMSE: 0.2887

############## OLS ##############
Current Time (KST): 2024-10-23 21:12:57

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6968
Adjusted R-squared: 0.6079
RMSE: 0.2033

############## OLS ##############
Current Time (KST): 2024-10-23 21:12:57

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8037
Adjusted R-squared: 0.7776
RMSE: 0.3243

############## OLS ##############
Current Time (KST): 2024-10-23 21:12:57

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8136
Adjusted R-squared: 0.7955
RMSE: 0.2408

############## OLS ###########

In [25]:
# llama: Freq
# Fit every CPU model on house attributes + Sentence-TF PCA components
# + the 'freq' variant of the Llama-extracted features, per city / `single` flag.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model name plus the current wall-clock time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Design matrix, built as one chain of inner joins on `zpid`:
            #   features -> dummies -> Sentence TF (PCA) -> Llama Extract (freq)
            # zpid is cast to int for the PCA join and to str for the Llama join
            # (presumably matching the key dtype of each right-hand table - confirm).
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(
                    X_combined_dict[f'dummies_{city}_{single}'],
                    on='zpid', how='inner'
                )
                .assign(zpid=lambda frame: frame['zpid'].astype(int))
                .merge(stf_pca, on='zpid', how='inner')
                .assign(zpid=lambda frame: frame['zpid'].astype(str))
                .merge(
                    llama_dict[f'freq_{city}_{single}'],
                    on='zpid', how='inner'
                )
            )
            # Fit with ln(PRICE) as the target column
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-24 00:54:06

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7778
Adjusted R-squared: 0.7207
RMSE: 0.3044

############## OLS ##############
Current Time (KST): 2024-10-24 00:54:06

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7812
Adjusted R-squared: 0.7359
RMSE: 0.2896

############## OLS ##############
Current Time (KST): 2024-10-24 00:54:06

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6828
Adjusted R-squared: 0.5368
RMSE: 0.2076

############## OLS ##############
Current Time (KST): 2024-10-24 00:54:07

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8045
Adjusted R-squared: 0.7667
RMSE: 0.3245

############## OLS ##############
Current Time (KST): 2024-10-24 00:54:07

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8136
Adjusted R-squared: 0.7872
RMSE: 0.2432

############## OLS ###########

In [26]:
# llama: Binary
# Fit every CPU model on house attributes + Sentence-TF PCA components
# + the 'binary' variant of the Llama-extracted features, per city / `single` flag.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model name plus the current wall-clock time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Design matrix, built as one chain of inner joins on `zpid`:
            #   features -> dummies -> Sentence TF (PCA) -> Llama Extract (binary)
            # zpid is cast to int for the PCA join and to str for the Llama join
            # (presumably matching the key dtype of each right-hand table - confirm).
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(
                    X_combined_dict[f'dummies_{city}_{single}'],
                    on='zpid', how='inner'
                )
                .assign(zpid=lambda frame: frame['zpid'].astype(int))
                .merge(stf_pca, on='zpid', how='inner')
                .assign(zpid=lambda frame: frame['zpid'].astype(str))
                .merge(
                    llama_dict[f'binary_{city}_{single}'],
                    on='zpid', how='inner'
                )
            )
            # Fit with ln(PRICE) as the target column
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-24 04:59:44

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7846
Adjusted R-squared: 0.7291
RMSE: 0.2997

############## OLS ##############
Current Time (KST): 2024-10-24 04:59:44

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.7784
Adjusted R-squared: 0.7325
RMSE: 0.2915

############## OLS ##############
Current Time (KST): 2024-10-24 04:59:44

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6889
Adjusted R-squared: 0.5457
RMSE: 0.2056

############## OLS ##############
Current Time (KST): 2024-10-24 04:59:44

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8010
Adjusted R-squared: 0.7625
RMSE: 0.3274

############## OLS ##############
Current Time (KST): 2024-10-24 04:59:44

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8113
Adjusted R-squared: 0.7845
RMSE: 0.2447

############## OLS ###########

### GPT

In [14]:
# Inner-join the basic house attributes with the raw GPT embeddings on `zpid`,
# then display the combined frame as the cell output.
text_embedding_gpt = house_attribute_basic.merge(gpt_emb, on='zpid', how='inner')
text_embedding_gpt

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,emb_1526,emb_1527,emb_1528,emb_1529,emb_1530,emb_1531,emb_1532,emb_1533,emb_1534,emb_1535
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.033456,-0.015177,-0.001198,-0.005905,0.011247,-0.010914,0.019671,-0.012488,-0.022106,0.002256
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,-0.009025,-0.018181,-0.008699,0.035555,0.030360,0.011361,-0.034511,0.001933,-0.028226,-0.038306
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.015792,-0.028441,-0.002742,-0.001154,0.018998,-0.007767,-0.001827,-0.031192,-0.024561,-0.022903
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,-0.021536,0.007333,-0.001665,-0.006363,0.022548,-0.014512,-0.035048,0.018000,0.005333,0.009161
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.050893,-0.023756,-0.012986,-0.012454,0.010501,-0.011026,0.023719,0.016643,-0.034138,-0.021690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.016708,-0.006777,0.008712,0.021234,0.024912,-0.019825,0.011537,0.011638,-0.013161,-0.000452
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.007859,-0.027445,-0.015098,0.003721,-0.000903,0.007347,0.012851,0.002217,-0.023834,-0.025438
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.024863,-0.039567,-0.014109,0.004888,0.036351,-0.014040,0.000675,0.004579,-0.031085,-0.003055
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,-0.028216,-0.009952,-0.007028,-0.028501,0.025817,-0.020604,0.021201,-0.010510,-0.011495,-0.013207


In [15]:
# Inner-join the basic house attributes with the PCA-reduced GPT embeddings on
# `zpid`, then display the combined frame as the cell output.
# NOTE(review): the variable name says "gpd" where "gpt" looks intended; it is
# kept unchanged because other cells may reference it - confirm before renaming.
text_embedding_gpd_pca = house_attribute_basic.merge(gpt_pca, on='zpid', how='inner')
text_embedding_gpd_pca

Unnamed: 0,zpid,ln(PRICE),ln(TOM),PARKING,BATHROOM,BEDROOM,AGE,LIVING,NUM_WORDS,CH_North,...,pca_22,pca_23,pca_24,pca_25,pca_26,pca_27,pca_28,pca_29,pca_30,pca_31
0,4171164,12.641097,4.905275,0.5,0.6,0.666667,0.578571,0.413631,0.341732,0,...,-0.028487,0.027478,0.126791,0.033092,0.062248,-0.071335,-0.003843,0.013785,-0.065925,-0.039378
1,4016685,11.608236,4.700480,0.0,0.0,0.500000,0.735714,0.150838,0.092913,0,...,-0.048391,0.017432,-0.146255,0.056143,0.116275,-0.011700,0.012689,-0.032729,-0.058205,0.046238
2,3638932,13.171154,4.430817,0.3,0.2,0.500000,0.714286,0.232849,0.294488,0,...,-0.086469,0.060097,-0.075760,0.050342,-0.032496,0.079844,-0.059140,0.092674,-0.058247,0.003079
3,4116360,11.149082,4.262680,0.4,0.0,0.500000,0.721429,0.112849,0.118110,0,...,-0.058519,0.050911,-0.021970,0.077693,0.019420,-0.075966,-0.036508,0.102648,0.141924,0.082704
4,3946804,12.611538,3.761200,0.4,0.2,0.500000,0.478571,0.102570,0.384252,0,...,-0.022647,-0.085769,-0.000497,-0.031012,0.013334,0.007824,0.043516,0.007546,-0.026471,-0.008417
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,4147854,11.835009,4.060443,0.4,0.2,0.666667,0.507143,0.169832,0.089764,0,...,-0.064886,0.072825,0.044630,-0.029737,-0.084064,0.046525,-0.005698,-0.026777,0.098470,-0.022504
1506,4020822,12.799399,4.262680,0.8,0.4,0.833333,0.721429,0.508380,0.428346,0,...,-0.029589,-0.000705,-0.005526,0.018271,0.001875,-0.070074,0.058544,-0.033463,0.015966,0.010869
1507,159464747,13.151922,3.555348,0.4,0.4,0.666667,0.071429,0.364469,0.316535,0,...,-0.042249,0.000601,0.061001,0.010168,0.020987,0.021042,-0.046701,0.098977,0.008302,-0.039665
1508,3619052,13.779883,3.433987,0.5,0.4,0.666667,0.700000,0.486034,0.376378,1,...,0.052522,0.006103,0.001694,0.037675,-0.015622,0.046321,0.001583,0.022674,0.032443,0.006758


In [16]:
# llama: None
# Fit every CPU model on house attributes + GPT PCA components
# (no Llama-extracted features), once per city and per `single` flag.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model name plus the current wall-clock time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Design matrix, built as one chain of inner joins on `zpid`:
            #   features -> dummies -> GPT (PCA)
            # zpid is cast to int before the PCA join
            # (presumably gpt_pca, read from CSV, holds integer zpids - confirm).
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(
                    X_combined_dict[f'dummies_{city}_{single}'],
                    on='zpid', how='inner'
                )
                .assign(zpid=lambda frame: frame['zpid'].astype(int))
                .merge(gpt_pca, on='zpid', how='inner')
            )
            # Fit with ln(PRICE) as the target column
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-24 14:15:34

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.8020
Adjusted R-squared: 0.7681
RMSE: 0.2875

############## OLS ##############
Current Time (KST): 2024-10-24 14:15:34

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8214
Adjusted R-squared: 0.7965
RMSE: 0.2615

############## OLS ##############
Current Time (KST): 2024-10-24 14:15:34

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.7151
Adjusted R-squared: 0.6316
RMSE: 0.1970

############## OLS ##############
Current Time (KST): 2024-10-24 14:15:34

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8434
Adjusted R-squared: 0.8225
RMSE: 0.2897

############## OLS ##############
Current Time (KST): 2024-10-24 14:15:34

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8420
Adjusted R-squared: 0.8266
RMSE: 0.2217

############## OLS ###########

In [17]:
# llama: Freq
# Fit every CPU model on house attributes + GPT PCA components
# + the 'freq' variant of the Llama-extracted features, per city / `single` flag.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model name plus the current wall-clock time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Design matrix, built as one chain of inner joins on `zpid`:
            #   features -> dummies -> GPT (PCA) -> Llama Extract (freq)
            # zpid is cast to int for the PCA join and to str for the Llama join
            # (presumably matching the key dtype of each right-hand table - confirm).
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(
                    X_combined_dict[f'dummies_{city}_{single}'],
                    on='zpid', how='inner'
                )
                .assign(zpid=lambda frame: frame['zpid'].astype(int))
                .merge(gpt_pca, on='zpid', how='inner')
                .assign(zpid=lambda frame: frame['zpid'].astype(str))
                .merge(
                    llama_dict[f'freq_{city}_{single}'],
                    on='zpid', how='inner'
                )
            )
            # Fit with ln(PRICE) as the target column
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-24 17:55:05

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7974
Adjusted R-squared: 0.7452
RMSE: 0.2907

############## OLS ##############
Current Time (KST): 2024-10-24 17:55:05

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8150
Adjusted R-squared: 0.7767
RMSE: 0.2663

############## OLS ##############
Current Time (KST): 2024-10-24 17:55:05

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.6955
Adjusted R-squared: 0.5553
RMSE: 0.2034

############## OLS ##############
Current Time (KST): 2024-10-24 17:55:05

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8444
Adjusted R-squared: 0.8143
RMSE: 0.2895

############## OLS ##############
Current Time (KST): 2024-10-24 17:55:05

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8416
Adjusted R-squared: 0.8192
RMSE: 0.2242

############## OLS ###########

In [18]:
# llama: Binary
# Fit every CPU model on house attributes + GPT PCA components
# + the 'binary' variant of the Llama-extracted features, per city / `single` flag.

for model_name, model_function in models_cpu:
    for city in ['CH', 'NY', 'LA']:
        for single in [0, 1]:
            # Banner: model name plus the current wall-clock time in Seoul
            print(f'\n############## {model_name} ##############')
            kst = datetime.now(pytz.timezone('Asia/Seoul'))
            print(f'Current Time (KST): {kst.strftime("%Y-%m-%d %H:%M:%S")}')
            # Design matrix, built as one chain of inner joins on `zpid`:
            #   features -> dummies -> GPT (PCA) -> Llama Extract (binary)
            # zpid is cast to int for the PCA join and to str for the Llama join
            # (presumably matching the key dtype of each right-hand table - confirm).
            df = (
                X_combined_dict[f'features_{city}_{single}']
                .merge(
                    X_combined_dict[f'dummies_{city}_{single}'],
                    on='zpid', how='inner'
                )
                .assign(zpid=lambda frame: frame['zpid'].astype(int))
                .merge(gpt_pca, on='zpid', how='inner')
                .assign(zpid=lambda frame: frame['zpid'].astype(str))
                .merge(
                    llama_dict[f'binary_{city}_{single}'],
                    on='zpid', how='inner'
                )
            )
            # Fit with ln(PRICE) as the target column
            model_function(df, city, single, 'ln(PRICE)')


############## OLS ##############
Current Time (KST): 2024-10-24 21:58:09

[Results for CH - 'Single Family']

Model Testing Metrics:
R-squared: 0.7993
Adjusted R-squared: 0.7477
RMSE: 0.2893

############## OLS ##############
Current Time (KST): 2024-10-24 21:58:09

[Results for CH - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8165
Adjusted R-squared: 0.7785
RMSE: 0.2652

############## OLS ##############
Current Time (KST): 2024-10-24 21:58:09

[Results for NY - 'Single Family']

Model Testing Metrics:
R-squared: 0.7077
Adjusted R-squared: 0.5731
RMSE: 0.1993

############## OLS ##############
Current Time (KST): 2024-10-24 21:58:09

[Results for NY - 'Condo/Townhouse']

Model Testing Metrics:
R-squared: 0.8430
Adjusted R-squared: 0.8127
RMSE: 0.2908

############## OLS ##############
Current Time (KST): 2024-10-24 21:58:09

[Results for LA - 'Single Family']

Model Testing Metrics:
R-squared: 0.8387
Adjusted R-squared: 0.8159
RMSE: 0.2262

############## OLS ###########