In [6]:
import os
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import numpy as np
from tqdm import tqdm
import csv
import pandas as pd

In [7]:
# Set your Hugging Face access token
os.environ['HF_TOKEN'] = ''

device = torch.device('cuda:0')

# Load the model and tokenizer with memory efficient settings
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Create config with memory optimizations
config = AutoConfig.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
config.use_cache = False

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
tokenizer.pad_token = tokenizer.eos_token

# Initialize model with memory optimizations
model = AutoModel.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    use_auth_token=os.environ['HF_TOKEN'],
    low_cpu_mem_usage=True
).to(device)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Read and process the input file
with open('../../prompts/new_prompts/raw_prompts_new.txt', 'r') as file:
    content = file.read()

prompts = content.split('--------------------------------------------------')
prompts = [prompt.strip() for prompt in prompts if prompt.strip()]

# Configure numpy to prevent scientific notation and set high precision
np.set_printoptions(suppress=True, precision=8, threshold=np.inf, linewidth=np.inf)

# Load dong names dynamically from dataset
def get_dong_names():
    df = pd.read_csv('../Data/Raw_data/AirBnB_data.csv')
    return list(df['Dong_name'].unique())

dong_names = get_dong_names()

# Define date range
date_range = pd.date_range(start='2017-01-01', end='2022-07-01', freq='MS')
date_range = date_range.strftime('%Y-%m-%d')

# Create full index for all possible combinations of dates and dongs
full_index = pd.MultiIndex.from_product([date_range, dong_names], names=['Reporting Month', 'Dong_name'])
full_df = pd.DataFrame(index=full_index).reset_index()

In [9]:
# Open the output CSV file
output_file = 'raw_embeddings_new.csv'
with open(output_file, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row
    max_embedding_length = 3072  # Replace with the actual maximum length of your embeddings if known
    header = ["Dong_name", "Reporting Month"] + [f"LLM Embeddings_{i}" for i in range(max_embedding_length)]
    csv_writer.writerow(header)

    # Collect embeddings to merge later
    embedding_rows = []

    # Process each prompt
    for i, prompt in enumerate(tqdm(prompts, desc="Processing prompts")):
        try:
            # Move input tensors to the same device as the model
            inputs = tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)

            # Process one embedding at a time
            embedding = outputs.last_hidden_state.mean(dim=1).to(torch.float32).cpu().numpy()

            # Pad or trim embedding to match the header length
            embedding = embedding.flatten()
            if len(embedding) < max_embedding_length:
                embedding = np.pad(embedding, (0, max_embedding_length - len(embedding)), constant_values=0)
            else:
                embedding = embedding[:max_embedding_length]

            # Prepare the row data (replace placeholder values with actual prompt details if available)
            row = ["Dong_name_placeholder", "Reporting Month_placeholder"] + embedding.tolist()
            embedding_rows.append(row)

        except Exception as e:
            print(f"Error processing prompt {i}: {str(e)}")

        # Clear CUDA cache periodically
        if i % 100 == 0:
            torch.cuda.empty_cache()

# Convert embedding rows to a DataFrame
dong_embedding = pd.DataFrame(embedding_rows, columns=header)

# Merge with the full DataFrame to ensure all dongs and dates are included
merged_df = pd.merge(full_df, dong_embedding, on=['Reporting Month', 'Dong_name'], how='left')
merged_df.fillna(0, inplace=True)

# Write the merged DataFrame to the CSV
merged_df.to_csv(output_file, index=False)

  with torch.no_grad(), torch.cuda.amp.autocast():  # Use automatic mixed precision
Processing prompts: 100%|██████████| 37096/37096 [1:01:17<00:00, 10.09it/s]
