In [7]:
import os
from transformers import AutoTokenizer, AutoModel, AutoConfig
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Set your Hugging Face access token
os.environ['HF_TOKEN'] = ''

device = torch.device('cuda:2')

# Load the model and tokenizer with memory efficient settings
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Create config with memory optimizations
config = AutoConfig.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
config.use_cache = False

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=os.environ['HF_TOKEN'])
tokenizer.pad_token = tokenizer.eos_token

# Initialize model with memory optimizations
model = AutoModel.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    use_auth_token=os.environ['HF_TOKEN'],
    low_cpu_mem_usage=True
).to(device)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
print(model.config.hidden_size)   # 출력되는 임베딩 차원

3072


In [10]:
# Configure numpy settings
np.set_printoptions(suppress=True, precision=8, threshold=np.inf, linewidth=np.inf)

# Embedding 함수
def process_chunk(text, chunk_size=512):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=chunk_size,
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1)

    embedding = embedding.to(torch.float32).cpu().numpy()
    del outputs
    torch.cuda.empty_cache()
    return embedding

In [11]:
def embed_dataframe(df, name="hf"):
    embeddings = []

    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {name} prompts"):
        prompt = row['prompt']
        try:
            torch.cuda.empty_cache()
            embedding = process_chunk(prompt)
        except RuntimeError as e:
            if "out of memory" in str(e):
                torch.cuda.empty_cache()
                try:
                    embedding = process_chunk(prompt, chunk_size=256)
                except Exception as e2:
                    print(f"[{name}] Prompt {i} failed (even reduced): {e2}")
                    embedding = None
            else:
                print(f"[{name}] Prompt {i} error: {e}")
                embedding = None
        except Exception as e:
            print(f"[{name}] Prompt {i} unexpected error: {e}")
            embedding = None

        if embedding is not None:
            embeddings.append(embedding.flatten())
        else:
            # 오류 발생 시 0 벡터 추가
            embeddings.append(np.zeros((model.config.hidden_size,), dtype=np.float32))

    # ➕ 임베딩을 개별 차원으로 컬럼화
    emb_array = np.stack(embeddings)  # (N, dim)
    emb_dim_cols = [f"dim_{i}" for i in range(emb_array.shape[1])]

    # DataFrame으로 병합
    emb_df = pd.concat([
        df[['Reporting Month', 'Dong_name']].reset_index(drop=True),
        pd.DataFrame(emb_array, columns=emb_dim_cols)
    ], axis=1)

    return emb_df


In [15]:
hf = pd.read_csv('../dong_prompts/human_flow_prompts.csv')
road = pd.read_csv('../dong_prompts/road_prompts.csv')

In [17]:
# 임베딩 생성
hf_emb = embed_dataframe(hf, 'human flow')

  with torch.no_grad(), torch.cuda.amp.autocast():
Processing human flow prompts: 100%|██████████| 28408/28408 [27:28<00:00, 17.23it/s]


In [21]:
hf_emb.to_csv('../../../Data/Preprocessed_data/Dong/llm_embeddings/human_flow_llm.csv', index=False)
hf_emb

Unnamed: 0,Reporting Month,Dong_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,...,dim_3062,dim_3063,dim_3064,dim_3065,dim_3066,dim_3067,dim_3068,dim_3069,dim_3070,dim_3071
0,2017-01-01,청운효자동,0.631657,0.062088,1.661948,-0.480083,0.325278,0.059335,1.219645,-0.214721,...,0.576094,0.604269,0.936134,1.193649,-0.245246,-0.182340,0.034380,-0.075218,-0.769296,-0.427419
1,2017-01-01,사직동,0.788676,0.019949,1.733202,-0.455450,0.363224,-0.014578,1.072209,-0.241832,...,0.513663,0.489251,0.876576,1.304210,-0.117332,-0.218697,0.008802,0.056241,-0.697531,-0.542157
2,2017-01-01,삼청동,0.732731,-0.060677,1.588081,-0.394658,0.461272,0.043342,1.138289,-0.348895,...,0.544804,0.491321,0.845081,1.334415,-0.113787,-0.190472,0.101940,-0.072523,-0.731949,-0.475726
3,2017-01-01,부암동,0.778128,0.039292,1.593942,-0.382553,0.370744,-0.037602,1.086960,-0.419720,...,0.459104,0.524115,0.876632,1.273649,-0.082777,-0.210168,0.099823,0.135457,-0.607994,-0.534211
4,2017-01-01,평창동,0.849853,0.121262,1.602125,-0.409218,0.387685,-0.006306,1.072251,-0.281716,...,0.561997,0.589886,0.964419,1.193343,-0.121586,-0.232168,-0.022254,0.127712,-0.629663,-0.618807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28403,2022-07-01,약수동,0.749300,-0.016589,1.730910,-0.433164,0.392690,-0.015127,1.158341,-0.225025,...,0.494893,0.402060,0.811578,1.310858,-0.205347,-0.265034,-0.025480,-0.019401,-0.652996,-0.544948
28404,2022-07-01,광희동,0.766798,0.030453,1.787404,-0.532283,0.314998,-0.052056,1.163009,-0.247011,...,0.584220,0.479670,0.863861,1.261009,-0.162101,-0.271595,-0.057617,-0.007161,-0.716689,-0.601915
28405,2022-07-01,이문1동,0.727369,0.046711,1.754294,-0.544140,0.384053,0.013268,1.110200,-0.253257,...,0.511407,0.421430,0.800825,1.275968,-0.165458,-0.218218,-0.074832,0.006860,-0.612701,-0.568621
28406,2022-07-01,장위1동,0.751910,0.083698,1.726383,-0.474055,0.305560,-0.088892,1.142029,-0.301987,...,0.521433,0.496424,0.903476,1.325101,-0.162519,-0.177862,0.033189,0.141140,-0.653885,-0.545823


In [19]:
# 임베딩 생성
road_emb = embed_dataframe(road, 'road')

  with torch.no_grad(), torch.cuda.amp.autocast():
Processing road prompts: 100%|██████████| 28408/28408 [24:30<00:00, 19.32it/s]


In [20]:
road_emb.to_csv('../../../Data/Preprocessed_data/Dong/llm_embeddings/road_llm.csv', index=False)
road_emb

Unnamed: 0,Reporting Month,Dong_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,...,dim_3062,dim_3063,dim_3064,dim_3065,dim_3066,dim_3067,dim_3068,dim_3069,dim_3070,dim_3071
0,2017-01-01,청운효자동,0.321834,0.340124,1.853458,-0.506423,-0.063985,-0.468818,0.556502,-0.428688,...,-0.244580,0.103105,0.545484,0.740482,0.078901,-0.257977,-0.101245,0.092059,-0.481388,-0.765549
1,2017-01-01,사직동,0.265761,0.502046,1.987182,-0.388202,0.149291,-0.565592,0.397939,-0.366245,...,-0.218343,0.044590,0.401821,0.893266,0.214721,-0.598675,-0.195248,0.287405,-0.194642,-0.753838
2,2017-01-01,삼청동,0.436833,0.416305,1.720308,-0.438655,0.159619,-0.646483,0.299109,-0.684790,...,-0.305329,0.072964,0.493513,0.956976,0.143234,-0.397812,-0.138427,0.155672,-0.110208,-0.764491
3,2017-01-01,부암동,0.482367,0.463486,1.864750,-0.348119,0.132736,-0.547067,0.349337,-0.516242,...,-0.226276,0.086268,0.493442,0.943564,0.248205,-0.473195,-0.074873,0.253727,-0.191987,-0.782241
4,2017-01-01,평창동,0.533973,0.527255,1.894979,-0.392078,0.221784,-0.535513,0.342288,-0.356292,...,-0.124963,0.184580,0.632024,0.725352,0.277057,-0.448154,-0.232927,0.285780,-0.286246,-0.865277
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28403,2022-07-01,약수동,0.442967,0.405879,1.908846,-0.438133,0.140215,-0.501379,0.465025,-0.379733,...,-0.368552,-0.092358,0.222611,0.885107,0.154167,-0.453216,-0.319466,0.220765,-0.432723,-0.910893
28404,2022-07-01,광희동,0.362489,0.486879,1.953672,-0.573097,0.094458,-0.542656,0.541597,-0.514116,...,-0.191027,0.067091,0.448186,0.896166,0.176934,-0.416564,-0.279562,0.185192,-0.365602,-0.902713
28405,2022-07-01,이문1동,0.385568,0.492133,1.941631,-0.414440,0.063800,-0.493783,0.493798,-0.368206,...,-0.294888,-0.003274,0.314463,0.843149,0.159412,-0.408726,-0.291548,0.208498,-0.237992,-0.715955
28406,2022-07-01,장위1동,0.417662,0.489624,1.984088,-0.456425,-0.000668,-0.440647,0.456797,-0.517259,...,-0.311643,-0.010786,0.404679,0.868279,0.152465,-0.436456,-0.198997,0.240103,-0.284801,-0.726293
