In [1]:
import os
import pandas as pd
import openai
from dotenv import load_dotenv
from tqdm.notebook import tqdm
import time

In [2]:
# Load OpenAI key: read a local .env file (if any) into the environment,
# then hand the key to the openai client.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is absent. The generation helpers in this notebook
# catch every exception and record an empty description, so a missing key
# would otherwise surface only as thousands of blank rows.
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

In [3]:
# Read the processed property listings into the working DataFrame
properties_csv_path = "../data/processed/12k_properties.csv"
df = pd.read_csv(properties_csv_path)

In [4]:
# Build the text-generation prompt for a single listing
def create_prompt(row):
    """Return the completion prompt for one property record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'address', 'city', 'state', 'postcode', 'property_type' and
            'price'. 'price' must accept the ``.0f`` format spec.

    Returns:
        A single-line instruction asking for a short listing description
        that mentions the full address, the property type and the price
        rounded to whole dollars.
    """
    location = f"{row['address']}, {row['city']}, {row['state']} {row['postcode']}"
    kind = row['property_type']
    asking_price = format(row['price'], ".0f")

    sentences = [
        "Write a short and engaging real estate listing description (under 100 tokens)",
        f"for a property located at {location}.",
        f"The property is a {kind} listed at ${asking_price}.",
    ]
    return " ".join(sentences)

# Run create_prompt once per row and store the result in a new "prompt" column
row_prompts = df.apply(create_prompt, axis="columns")
df["prompt"] = row_prompts

In [5]:
def generate_descriptions(prompts, model="gpt-3.5-turbo-instruct", max_retries=3, retry_delay=2.0):
    """Generate one completion per prompt, retrying failed requests.

    Each prompt is sent to ``openai.Completion.create``. A request that
    raises is retried up to ``max_retries`` times with exponentially growing
    pauses; if every attempt fails, an empty string is stored for that prompt
    so the output stays aligned with the input.

    Args:
        prompts: Iterable of prompt strings.
        model: Completion model name passed to the API.
        max_retries: Maximum number of attempts per prompt (at least one
            attempt is always made).
        retry_delay: Seconds to wait before the second attempt; the wait
            doubles before each further attempt.

    Returns:
        List of stripped completion texts, in prompt order. Prompts whose
        attempts all failed yield "".
    """
    attempts_allowed = max(1, int(max_retries))
    responses = []
    for prompt in tqdm(prompts, desc="Generating Descriptions"):
        text = ""
        for attempt in range(1, attempts_allowed + 1):
            try:
                response = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    max_tokens=100,
                    temperature=0.7,
                    top_p=1.0,
                    n=1,
                )
                text = response.choices[0].text.strip()
                break
            except Exception as e:
                # Errors are reported but never propagated, so one bad
                # request cannot abort a long-running job.
                print(f"Error: {e}")
                if attempt < attempts_allowed:
                    # Exponential backoff before the next attempt.
                    time.sleep(retry_delay * 2 ** (attempt - 1))
        responses.append(text)
        time.sleep(0.1)  # Avoid rate limits
    return responses

In [6]:
# Smoke-test the generator on the first ten listings before the full run
first_ten = df.iloc[:10]
sample_prompts = first_ten["prompt"].tolist()
sample_descriptions = generate_descriptions(sample_prompts)

# Attach the generated text to rows labelled 0 through 9 (.loc slices are inclusive)
df.loc[:9, "description"] = sample_descriptions

# Write those ten rows, now including their descriptions, to a preview CSV
preview_csv_path = "../data/processed/12k_properties_with_descriptions_preview.csv"
df.iloc[:10].to_csv(preview_csv_path, index=False)

Generating Descriptions:   0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# Function to call the OpenAI API once per prompt in a batch, with retries
def generate_descriptions_batch(prompts, model="gpt-3.5-turbo-instruct", max_retries=3, retry_delay=2.0):
    """Generate completions for a batch of prompts, retrying failed requests.

    Prompts are sent one at a time to ``openai.Completion.create``. A request
    that raises (for example a transient "server overloaded" error) is
    retried up to ``max_retries`` times with exponential backoff. When all
    attempts fail, "" is recorded so the result stays positionally aligned
    with ``prompts``.

    Args:
        prompts: Iterable of prompt strings.
        model: Completion model name passed to the API.
        max_retries: Maximum number of attempts per prompt (at least one
            attempt is always made).
        retry_delay: Seconds to wait before the second attempt; doubled
            before each subsequent attempt.

    Returns:
        List of stripped completion texts in prompt order; "" marks a prompt
        whose attempts all failed.
    """
    attempts_allowed = max(1, int(max_retries))
    responses = []
    for prompt in tqdm(prompts, desc="Generating batch"):
        text = ""
        for attempt in range(1, attempts_allowed + 1):
            try:
                response = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    max_tokens=100,
                    temperature=0.7,
                    top_p=1.0,
                    n=1,
                )
                text = response.choices[0].text.strip()
                break
            except Exception as e:
                # Report and keep going: a single failing request must not
                # abort the whole batch.
                print(f"Error: {e}")
                if attempt < attempts_allowed:
                    # Exponential backoff before the next attempt.
                    time.sleep(retry_delay * 2 ** (attempt - 1))
        responses.append(text)
        time.sleep(0.1)  # prevent rate limiting
    return responses

In [17]:
# Checkpoint file that stores the descriptions generated so far
output_csv_path = "../data/processed/12k_properties_with_descriptions.csv"

# Rebuild prompts if an earlier cell was skipped
if "prompt" not in df.columns:
    df["prompt"] = df.apply(create_prompt, axis=1)

# Load existing progress if file exists (assignment aligns rows by index label)
if os.path.exists(output_csv_path):
    saved_df = pd.read_csv(output_csv_path)
    # Tolerate a checkpoint written before any description was stored
    if "description" in saved_df.columns:
        df["description"] = saved_df["description"]

# Make sure the column exists even when there is no checkpoint and the
# preview cell was not run, so later cells can filter on it.
if "description" not in df.columns:
    df["description"] = None

# An all-missing column loads as float64; keep it as object so text can be
# written into it later without an incompatible-dtype warning.
df["description"] = df["description"].astype(object)

In [25]:
# Process in batches and resume
batch_size = 100

# A row still needs a description when its value is genuinely missing
# (NaN/None, which is how read_csv loads empty cells), or when it is a blank
# string left by a failed request, or the literal text "nan" produced by
# casting the column to str. Comparing against "nan" alone misses the first
# two cases.
is_missing = df["description"].isna()
is_blank = df["description"].astype(str).str.strip().isin(["", "nan"])
unprocessed_idx = df.index[is_missing | is_blank]

for start in tqdm(range(0, len(unprocessed_idx), batch_size), desc="Overall Progress"):
    # range() stops before len(unprocessed_idx), so this slice is never empty
    batch_indices = unprocessed_idx[start:start + batch_size]

    prompts = df.loc[batch_indices, "prompt"].tolist()
    responses = generate_descriptions_batch(prompts)

    df.loc[batch_indices, "description"] = responses

    # Save progress after each batch so an interrupted run can resume
    df.to_csv(output_csv_path, index=False)

Overall Progress:   0%|          | 0/92 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Error: The server is overloaded or not ready yet.


Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/100 [00:00<?, ?it/s]

Generating batch:   0%|          | 0/90 [00:00<?, ?it/s]