# Load Dataset

In [1]:
import pandas as pd
import geopandas as gpd

import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from transformers import pipeline
import torch

In [2]:
# Load embeddings or features from prior steps
# Placeholder: Replace with actual paths or download links
zillow_df = gpd.read_file('../dataset/raw/2. zillow_cleaned.geojson')
w2v_emb = pd.read_csv('../dataset/raw/4. w2v_embedding.csv')
w2v_pca = pd.read_csv('../dataset/raw/4. w2v_pca.csv')
bert_emb = pd.read_csv('../dataset/raw/4. bert_embedding.csv')
bert_pca = pd.read_csv('../dataset/raw/4. bert_pca.csv')
stf_emb = pd.read_csv('../dataset/raw/5. stf_embedding.csv')
stf_pca = pd.read_csv('../dataset/raw/5. stf_pca.csv')

In [3]:
import requests
from io import StringIO

# Dropbox direct download link
url_gpt_emb = 'https://www.dropbox.com/scl/fi/gcrk2mejy3su7nt9gf30i/5.-stf_embedding.csv?rlkey=d5uy0qm80geh81qxhxbtyms66&st=46sngliz&dl=1'

# Load directly into DataFrame
response = requests.get(url_gpt_emb)
if response.status_code == 200:
    gpt_emb = pd.read_csv(StringIO(response.text))
    print("CSV loaded successfully:", gpt_emb.shape)
else:
    print("Failed to fetch the file:", response.status_code)

CSV loaded successfully: (10111, 385)


In [4]:
# Dropbox direct download link
url_gpt_pca = 'https://www.dropbox.com/scl/fi/6gdiftk79r00a3uecf9zv/5.-stf_pca.csv?rlkey=j8e7e5tt81w2yt6fd3968mdwp&st=5wwf7fib&dl=1'

# Load directly into DataFrame
response = requests.get(url_gpt_pca)
if response.status_code == 200:
    gpt_pca = pd.read_csv(StringIO(response.text))
    print("CSV loaded successfully:", gpt_pca.shape)
else:
    print("Failed to fetch the file:", response.status_code)

CSV loaded successfully: (10111, 33)


In [5]:
df_all = zillow_df.copy()

zillow_df['zpid'] = zillow_df['zpid'].astype(str)
w2v_emb['zpid'] = w2v_emb['zpid'].astype(str)
bert_emb['zpid'] = bert_emb['zpid'].astype(str)
stf_emb['zpid'] = stf_emb['zpid'].astype(str)
gpt_emb['zpid'] = gpt_emb['zpid'].astype(str)
df_all = df_all.merge(w2v_emb, on="zpid", how="left")
df_all = df_all.merge(bert_emb, on="zpid", how="left", suffixes=("", "_bert"))
df_all = df_all.merge(stf_emb, on="zpid", how="left", suffixes=("", "_stf"))
df_all = df_all.merge(gpt_emb, on="zpid", how="left", suffixes=("", "_gpt"))

# Define Label: Fast-Selling (TOP 25%)

In [6]:
df_all['duration'] = df_all['duration'].astype(float)
df_all['city'] = df_all['city'].astype(str)

In [7]:
def assign_fast_label(group):
    threshold = group['duration'].quantile(0.25)
    return group['duration'] <= threshold

In [8]:
df_all['fast_label'] = df_all.groupby('city', group_keys=False).apply(assign_fast_label).astype(int)


  df_all['fast_label'] = df_all.groupby('city', group_keys=False).apply(assign_fast_label).astype(int)


# Setup LLaMA Pipeline

In [9]:
import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
llama_32_1b = "meta-llama/Llama-3.2-1B-Instruct"
llama_32_3b = "meta-llama/Llama-3.2-3B-Instruct"
generator = pipeline(model=llama_32_3b, device=device, torch_dtype=torch.bfloat16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [10]:
import glob
import os
wordlist_dir = "../dataset/word_counts/0.25/"
wordlist_files = glob.glob(os.path.join(wordlist_dir, "*.csv"))

In [11]:
# Map from (city, type) to word list
topic_words = {}
for path in wordlist_files:
    basename = os.path.basename(path).replace("_zscore.csv", "")
    city_code, group_id, *_ = basename.split("_")
    city_map = {"CH": "Chicago", "NY": "New York", "LA": "Los Angeles"}
    city = city_map[city_code]
    house_type = "Single Family" if group_id == "0" else "Condo/Townhouse"
    df_words = pd.read_csv(path).sort_values("zscore", ascending=False).head(50)
    topic_words[(city, house_type)] = df_words['word'].tolist()

KeyError: 'zscore'

# Prompt templates

In [10]:
def build_zero_shot_prompt(row):
    return f"""
    <Task> Classify whether the house is fast-selling or not.
    <House Info>
    - Address: {row['address']}, {row['city']}
    - Type: {'Single Family' if row['single'] == 0 else 'Condo/Townhouse'}
    - Description: {row['description']}
    - Days on Market: {row['duration']}
    <Question> Is this a fast-selling house? Answer only 'Yes' or 'No'.
    """

In [11]:
def build_embedding_prompt(row, embedding_columns):
    embed_values = ", ".join([f"{col}: {row[col]:.4f}" for col in embedding_columns if not pd.isna(row[col])])
    return f"""
    <Task> Given the house description and its embedding values, classify whether the house is fast-selling.
    <Description>: {row['description']}
    <Embeddings>: {embed_values}
    <Question>: Is this a fast-selling house? Answer 'Yes' or 'No'.
    """

In [12]:
def build_few_shot_prompt(df_context, row):
    examples = "\n".join([
        f"Example {i+1} -> Description: {r['description']}\nFast Selling: {'Yes' if r['fast_label'] else 'No'}"
        for i, (_, r) in enumerate(df_context.iterrows())
    ])
    return f"""
    <Task> Determine whether the house is fast-selling based on its description.
    {examples}
    Now classify this house:
    Description: {row['description']}
    <Question> Fast Selling? Answer 'Yes' or 'No'.
    """


In [None]:
def build_topic_prompt(row):
    region = row['city']
    house_type = "Single Family" if row['single'] == 0 else "Condo/Townhouse"
    keywords = topic_words.get((region, house_type), [])
    wordlist = ", ".join(keywords)
    return f"""
    <Task> You are given a house listing. Below is a list of words that are frequently used in fast-selling houses in this region and house type.
    <Region>: {region}, <Type>: {house_type}
    <Keywords>: {wordlist}
    <Description>: {row['description']}
    <Question>: Based on the keywords and the description, is this house fast-selling? Answer strictly with 'Yes' or 'No'.
    """


In [13]:
# def run_llama_classification(df_subset, prompt_func, prompt_args=None):
#     predictions = []
#     for _, row in df_subset.iterrows():
#         if prompt_args:
#             prompt = prompt_func(row, **prompt_args)
#         else:
#             prompt = prompt_func(row)
#         result = generator(
#             prompt,
#             max_new_tokens=10,
#             do_sample=True,
#             temperature=0.7,
#             top_p=0.9,
#             pad_token_id=generator.tokenizer.eos_token_id
#         )[0]['generated_text']
#         predictions.append("Yes" in result)
#     return predictions

In [14]:
def run_llama_classification(df_subset, prompt_func, prompt_args=None):
    prompts = []
    for _, row in df_subset.iterrows():
        prompt = prompt_func(row, **prompt_args) if prompt_args else prompt_func(row)
        prompts.append(prompt)

    generations = generator(
        prompts,
        max_new_tokens=10,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=generator.tokenizer.eos_token_id
    )

    predictions = []
    for gen in generations:
        response = gen["generated_text"][-1]["content"] if isinstance(gen["generated_text"], list) else gen["generated_text"]
        answer = response.strip().split()[-1].lower()
        predictions.append("yes" in answer)
    return predictions

# Evaluate Metrics

In [15]:
def evaluate(y_true, y_pred):
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Results

In [16]:
# Prepare data sample
sample_df = df_all.dropna(subset=['description', 'duration']).sample(n=50, random_state=42)
y_true = sample_df['fast_label'].tolist()

## Zero-shot

In [21]:
print("--- Zero-shot ---")
y_pred_zero = run_llama_classification(sample_df, build_zero_shot_prompt)
evaluate(y_true, y_pred_zero)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


--- Zero-shot ---


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Accuracy: 0.22
Precision: 0.22
Recall: 1.0
F1 Score: 0.36065573770491804
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.22      1.00      0.36        11

    accuracy                           0.22        50
   macro avg       0.11      0.50      0.18        50
weighted avg       0.05      0.22      0.08        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Embedding

In [None]:
# Word2Vec Embedding
print("--- With Word2Vec Embedding ---")
w2v_cols = [col for col in w2v_emb.columns if col != 'zpid']
y_pred_w2v = run_llama_classification(sample_df, build_embedding_prompt, {'embedding_columns': w2v_cols})
evaluate(y_true, y_pred_w2v)

--- With Word2Vec Embedding ---


In [None]:
# BERT Embedding
print("--- With BERT Embedding ---")
bert_cols = [col for col in bert_emb.columns if col != 'zpid']
y_pred_bert = run_llama_classification(sample_df, build_embedding_prompt, {'embedding_columns': bert_cols})
evaluate(y_true, y_pred_bert)

In [None]:
# Sentence Transformer Embedding
print("--- With Sentence Transformer Embedding ---")
stf_cols = [col for col in stf_emb.columns if col != 'zpid']
y_pred_stf = run_llama_classification(sample_df, build_embedding_prompt, {'embedding_columns': stf_cols})
evaluate(y_true, y_pred_stf)

In [None]:
# GPT Embedding
print("--- With GPT Embedding ---")
gpt_cols = [col for col in gpt_emb.columns if col != 'zpid']
y_pred_gpt = run_llama_classification(sample_df, build_embedding_prompt, {'embedding_columns': gpt_cols})
evaluate(y_true, y_pred_gpt)


In [None]:
# Few-shot (randomly pick 10 examples as context)
print("--- Few-shot ---")
few_shot_context = sample_df.sample(n=10, random_state=7)
remaining_df = sample_df.drop(few_shot_context.index)
y_pred_fewshot = run_llama_classification(remaining_df, lambda r: build_few_shot_prompt(few_shot_context, r))
evaluate(remaining_df['fast_label'].tolist(), y_pred_fewshot)

In [None]:
print("--- Keyword-Aware Prompt ---")
y_pred_keywords = run_llama_classification(sample_df, build_topic_prompt)
evaluate(y_true, y_pred_keywords)
append_result("Keyword-Aware", y_true, y_pred_keywords)

In [None]:
results_summary = []
def append_result(label, y_true, y_pred):
    results_summary.append({
        "Method": label,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred)
    })

append_result("Zero-shot", y_true, y_pred_zero)
append_result("Word2Vec", y_true, y_pred_w2v)
append_result("BERT", y_true, y_pred_bert)
append_result("SentenceTransformer", y_true, y_pred_stf)
append_result("GPT", y_true, y_pred_gpt)
append_result("Few-shot", remaining_df['fast_label'].tolist(), y_pred_fewshot)
append_result("Keyword-Aware Few-shot", remaining_df['Keyword-Aware'].tolist(), y_pred_keywords)


results_df = pd.DataFrame(results_summary)


print("
=== Performance Summary ===")
print(results_df.to_string(index=False))