In [1]:
import os
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Read the Hugging Face access token from the environment instead of
# hard-coding it here: assigning '' to os.environ['HF_TOKEN'] would clobber a
# token that is already exported, and a pasted token would be saved with the
# notebook. An empty/missing variable becomes None so the hub client can fall
# back to its own stored credentials.
hf_token = os.environ.get('HF_TOKEN') or None

# Prefer the second GPU (the device this notebook was written for), but fall
# back to the first GPU or the CPU so the cell also runs on smaller hosts.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Checkpoint used to produce the embeddings.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load the model config and switch off the key/value cache; only a single
# forward pass per prompt is performed, so the cache is never reused.
# `token=` replaces the deprecated `use_auth_token=` argument.
config = AutoConfig.from_pretrained(model_id, token=hf_token)
config.use_cache = False

# Load the tokenizer and reuse EOS as the padding token so that
# `padding=True` can be requested when tokenizing.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token

# Load the base model (no LM head) in bfloat16 with reduced CPU memory usage
# during loading, then move it to the selected device.
model = AutoModel.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    token=hf_token,
    low_cpu_mem_usage=True
).to(device)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
print(model.config.hidden_size)   # embedding (hidden-state) dimension of the loaded model

3072


In [4]:
# Make NumPy print arrays in full: fixed-point notation instead of scientific,
# 8 decimal places, no "..." summarisation and no line wrapping.
np.set_printoptions(
    precision=8,
    suppress=True,
    threshold=np.inf,
    linewidth=np.inf,
)

# Embedding function
def process_chunk(text, chunk_size=512):
    """Embed ``text`` by mean-pooling the model's last hidden state.

    The text is tokenized (truncated to ``chunk_size`` tokens), run through the
    module-level ``model`` on ``device`` without gradients, and the token
    representations are averaged over the sequence dimension.

    The average is weighted by the attention mask, so padding positions are
    excluded. For a single string no padding is added and this equals a plain
    mean over tokens; for a list of strings it keeps padded positions from
    diluting the shorter sequences. Pooling is done in float32, so values can
    differ from a low-precision mean at the level of rounding error.

    Args:
        text: A string (or list of strings) accepted by the tokenizer.
        chunk_size: Maximum number of tokens kept per text.

    Returns:
        A float32 NumPy array with one row per input text; after pooling over
        the sequence dimension the remaining axis is the model's hidden size.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=chunk_size,
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        # NOTE(review): autocast keeps its default dtype here while the model
        # is loaded in bfloat16 elsewhere in this notebook - confirm that
        # autocast is wanted at all.
        with torch.autocast(device_type=device.type):
            outputs = model(**inputs)

        # Pool outside autocast, in float32, to avoid low-precision rounding.
        hidden = outputs.last_hidden_state.float()
        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            embedding = hidden.mean(dim=1)
        else:
            weights = attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-masked row.
            embedding = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)

    embedding = embedding.to(torch.float32).cpu().numpy()

    # Drop the GPU-side results and return cached blocks to the driver.
    del outputs, hidden
    if device.type == "cuda":
        torch.cuda.empty_cache()
    return embedding


In [5]:
def embed_dataframe(df, name="hf", prompt_col="prompt",
                    id_cols=("Reporting Month", "Dong_name")):
    """Embed every prompt in ``df`` and return the vectors as ``dim_*`` columns.

    Each prompt is embedded with ``process_chunk``. If that raises a CUDA
    out-of-memory ``RuntimeError`` the prompt is retried once with a 256-token
    limit. Any prompt that still fails is reported with ``print`` and replaced
    by an all-zero vector so the output keeps one row per input row.

    Args:
        df: DataFrame containing ``prompt_col`` and every column in ``id_cols``.
        name: Label used in the progress bar and in error messages.
        prompt_col: Name of the column holding the prompt text.
        id_cols: Columns copied unchanged into the result, ahead of the
            embedding columns.

    Returns:
        DataFrame with a fresh 0..N-1 index, the ``id_cols`` columns, and one
        float32 column ``dim_<k>`` per embedding dimension.

    Raises:
        KeyError: If ``prompt_col`` or one of ``id_cols`` is missing from ``df``.
    """
    hidden_size = model.config.hidden_size
    embeddings = []
    n_failed = 0

    for i, prompt in tqdm(df[prompt_col].items(), total=len(df),
                          desc=f"Processing {name} prompts"):
        embedding = None
        retry_reduced = False
        try:
            torch.cuda.empty_cache()
            embedding = process_chunk(prompt)
        except RuntimeError as e:
            if "out of memory" in str(e):
                # Only record the decision here. While the exception is alive
                # its traceback keeps the failed call's tensors referenced, so
                # the retry must happen after leaving the except clause.
                retry_reduced = True
            else:
                print(f"[{name}] Prompt {i} error: {e}")
        except Exception as e:
            print(f"[{name}] Prompt {i} unexpected error: {e}")

        if retry_reduced:
            torch.cuda.empty_cache()
            try:
                embedding = process_chunk(prompt, chunk_size=256)
            except Exception as e2:
                print(f"[{name}] Prompt {i} failed (even reduced): {e2}")
                embedding = None

        if embedding is not None:
            embeddings.append(embedding.flatten())
        else:
            # On failure, append a zero vector so rows stay aligned with df.
            n_failed += 1
            embeddings.append(np.zeros((hidden_size,), dtype=np.float32))

    if n_failed:
        print(f"[{name}] {n_failed}/{len(df)} prompts failed and were filled with zero vectors")

    # Turn the per-prompt vectors into an (N, dim) array; np.stack rejects an
    # empty list, so an empty input frame gets an explicit (0, dim) array.
    if embeddings:
        emb_array = np.stack(embeddings)
    else:
        emb_array = np.empty((0, hidden_size), dtype=np.float32)
    emb_dim_cols = [f"dim_{i}" for i in range(emb_array.shape[1])]

    # Merge the identifier columns with the embedding columns, row by row.
    emb_df = pd.concat([
        df[list(id_cols)].reset_index(drop=True),
        pd.DataFrame(emb_array, columns=emb_dim_cols)
    ], axis=1)

    return emb_df


In [8]:
# Load the "SSP_wo" AirBnB prompt table from CSV.
# NOTE(review): the disabled "SSP_w" load below reads from '../dong_prompts/',
# not '../dong_prompts_new/' - confirm which directory is intended before
# re-enabling it.
Airbnb_SSP_wo = pd.read_csv('../dong_prompts_new/AirBnB_SSP_wo_prompts.csv')
#Airbnb_SSP_w = pd.read_csv('../dong_prompts/AirBnB_SSP_w_prompts.csv')

In [9]:
# Preview the loaded prompt table.
# NOTE(review): in the recorded output some `prompt` values start with "['"
# (e.g. rows 28403 and 28406) and look like stringified lists rather than
# plain text - confirm upstream, since they are embedded as-is.
Airbnb_SSP_wo

Unnamed: 0,Reporting Month,Dong_name,prompt
0,2017-01-01,혜화동,[2017-01-01 | 혜화동] AirBnB Feature Summary:Tota...
1,2017-01-01,사근동,[2017-01-01 | 사근동] AirBnB Feature Summary:Tota...
2,2017-01-01,연남동,[2017-01-01 | 연남동] AirBnB Feature Summary:Tota...
3,2017-01-01,우이동,[2017-01-01 | 우이동] AirBnB Feature Summary:Tota...
4,2017-01-01,사직동,[2017-01-01 | 사직동] AirBnB Feature Summary:Tota...
...,...,...,...
28403,2022-07-01,번3동,['[2022-07-01 | 번3동] AirBnB Feature Summary:To...
28404,2022-07-01,무악동,[2022-07-01 | 무악동] AirBnB Feature Summary:Tota...
28405,2022-07-01,쌍문2동,[2022-07-01 | 쌍문2동] AirBnB Feature Summary:Tot...
28406,2022-07-01,시흥4동,['[2022-07-01 | 시흥4동] AirBnB Feature Summary:T...


In [10]:
# Embed every prompt in the table; the result has one row per prompt with the
# identifier columns followed by dim_* embedding columns.
# The recorded run took roughly 40 minutes for 28408 prompts.
Airbnb_SSP_wo_emb = embed_dataframe(Airbnb_SSP_wo, 'Airbnb_SSP_wo')

  with torch.no_grad(), torch.cuda.amp.autocast():
Processing Airbnb_SSP_wo prompts: 100%|██████████| 28408/28408 [39:40<00:00, 11.93it/s]


In [11]:
# Write the embedding table to CSV without the index column, then display it.
Airbnb_SSP_wo_emb.to_csv('../../../Data/Preprocessed_data/Dong/llm_embeddings_new/Airbnb_SSP_wo.csv', index=False)
Airbnb_SSP_wo_emb

Unnamed: 0,Reporting Month,Dong_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,...,dim_3062,dim_3063,dim_3064,dim_3065,dim_3066,dim_3067,dim_3068,dim_3069,dim_3070,dim_3071
0,2017-01-01,혜화동,0.402124,0.562513,2.386748,0.230527,0.726897,-0.977159,0.442740,0.515273,...,-0.338610,0.719909,0.101741,0.545410,0.336146,-0.236467,0.278583,0.443061,-0.669606,-0.301908
1,2017-01-01,사근동,0.485903,0.467065,2.393437,0.063475,0.775888,-0.945809,0.773877,0.528708,...,-0.136568,0.742120,0.145293,0.336165,0.503586,-0.325827,0.360276,0.480543,-0.721132,-0.351424
2,2017-01-01,연남동,0.651933,0.453204,2.363519,0.125065,0.913903,-0.786058,-0.167108,0.254233,...,-0.190986,0.523659,-0.019588,0.455602,0.273901,-0.621257,0.471213,0.522027,-0.835706,-0.013737
3,2017-01-01,우이동,0.737667,0.702664,2.498673,-0.093888,0.643325,-0.995129,0.555084,0.160863,...,-0.094705,0.243936,0.090517,0.190380,0.584271,-0.367675,0.177601,0.185572,-0.369592,-0.045469
4,2017-01-01,사직동,0.797338,0.404755,2.310000,0.378532,0.769356,-0.882680,0.289017,0.577551,...,-0.222505,0.649363,-0.133364,0.686485,0.289828,-0.206030,0.382740,0.456553,-0.790512,-0.403850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28403,2022-07-01,번3동,-0.507205,0.781056,2.312848,-0.384414,0.364072,0.303697,0.564550,-0.222496,...,-0.421645,-0.290748,0.008716,-0.084469,-0.057195,1.004383,0.563636,0.181320,0.169426,0.267049
28404,2022-07-01,무악동,0.815413,0.832993,2.279110,-0.099906,0.816628,-0.991223,0.516129,0.278574,...,0.049577,0.208140,0.123904,0.177670,0.570998,-0.181488,0.353211,0.332999,-0.276283,-0.052463
28405,2022-07-01,쌍문2동,0.830771,0.687025,2.183712,-0.050262,0.742346,-0.860231,0.408812,0.298967,...,-0.030867,0.078712,0.182967,0.185928,0.455982,-0.099408,0.575790,0.483207,-0.336009,-0.116764
28406,2022-07-01,시흥4동,-0.492963,0.795166,2.227900,-0.115175,0.411488,0.452539,0.636927,-0.167286,...,-0.423148,-0.290463,-0.110753,-0.131565,-0.002980,1.105138,0.491228,0.554596,0.406384,0.277722


In [None]:
#Airbnb_SSP_w_emb = embed_dataframe(Airbnb_SSP_w, 'Airbnb_SSP_w')

In [None]:
#Airbnb_SSP_w_emb.to_csv('../../../Data/Preprocessed_data/Dong/llm_embeddings_new/Airbnb_SSP_w.csv', index=False)
#Airbnb_SSP_w_emb