In [None]:
import matplotlib.pyplot as plt
import torch
import os
import pickle
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline, logging, Trainer, AutoConfig, LlamaConfig
import subprocess
from tqdm import tqdm

# Log in to the Hugging Face Hub. The token is read from the HF_TOKEN environment
# variable so that no credential is ever stored in the notebook itself.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    # check=True surfaces a failed login here instead of as a confusing download error later.
    subprocess.run(["huggingface-cli", "login", "--token", hf_token], check=True)
else:
    print("HF_TOKEN is not set; skipping login and relying on any cached Hugging Face credentials.")

def open_data_dict(file_path):
    """Load a pickled dictionary and re-key it by integer id.

    Every key of the pickled dictionary is split on ``'.'`` and the part before
    the first dot is converted with ``int`` to form the new key (for example
    ``"123.txt"`` becomes ``123``). Values are kept unchanged.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        dict: The same values, keyed by the integer prefix of the original keys.
    """
    # NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
    with open(file_path, 'rb') as handle:
        raw_dict = pickle.load(handle)

    rekeyed = {}
    for name, payload in raw_dict.items():
        stem = name.split('.')[0]
        rekeyed[int(stem)] = payload
    return rekeyed

# Load the content-metadata dictionary (re-keyed by integer id) from the pickled snapshot.
content_meta_path = '/data/log-data-2024/20241123_Final/content_meta_dict_20241123.pickle'
input_dict = open_data_dict(content_meta_path)
input_dict  # last expression of the cell: rendered as the cell output

In [None]:
# Tokenizer of the model whose token counts are inspected
model_ckpt = "meta-llama/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Token count of every entry (no truncation, so the full length is measured)
token_lengths = [
    len(tokenizer(entry_text, truncation=False)['input_ids'])
    for entry_text in input_dict.values()
]

# Histogram of the token-length distribution
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(token_lengths, bins=30, color='blue', alpha=0.7)
ax.set_title("Distribution of Token Lengths")
ax.set_xlabel("Token Length")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Model and tokenizer configuration
model_ckpt = "meta-llama/Meta-Llama-3.1-8B"
torch_dtype = torch.float16

# load_in_4bit=True is what actually enables 4-bit quantization; the bnb_4bit_*
# options only describe how that quantization is performed and have no effect
# on their own.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False
)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token

model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    torch_dtype=torch_dtype,
    device_map='auto'
)

model.config.use_cache = False

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
peft_config.inference_mode = False
model = get_peft_model(model, peft_config)
# Only forward passes are run below, so switch off dropout (including lora_dropout).
model.eval()

# Mean-pooled embedding for every entry of input_dict
embeddings_dict = {}

# Compute one embedding per key
for key, text in tqdm(input_dict.items(), desc="Processing Texts", unit="entry"):
    # Place the token tensors on the same device as the model before the forward pass.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=False).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    embeddings = outputs.hidden_states[-1]  # last hidden layer, torch.Size([1, N, 4096]) where N is the token count
    mean_embeddings = embeddings.mean(dim=1)  # average over tokens: [1, N, 4096] -> [1, 4096]
    # Store a float16 tensor on the CPU: this frees device memory as we go, keeps
    # every dictionary entry on one device (required by torch.stack later on) and
    # makes the saved .pt file loadable on machines without a GPU.
    embeddings_dict[key] = mean_embeddings.to(dtype=torch.float16).cpu()

embeddings_dict

In [None]:
def add_zero_tensor(embeddings_dict, hidden_size=4096, dtype=torch.float16):
    """Store an all-zero embedding under key ``0``.

    The dictionary is modified in place (an existing entry under ``0`` is
    overwritten) and also returned for convenience.

    Args:
        embeddings_dict: Dictionary to which the zero tensor is added.
        hidden_size: Width of the zero tensor. Defaults to 4096, the value that
            was previously hard-coded.
        dtype: Data type of the zero tensor. Defaults to ``torch.float16``.

    Returns:
        dict: The same ``embeddings_dict`` object, now containing a CPU tensor
        of shape ``(1, hidden_size)`` filled with zeros under key ``0``.
    """
    embeddings_dict[0] = torch.zeros((1, hidden_size), dtype=dtype)
    return embeddings_dict

# Insert the all-zero entry under key 0, then inspect it and the new dictionary size.
embeddings_dict = add_zero_tensor(embeddings_dict)
zero_entry = embeddings_dict[0]
print(zero_entry)
print(zero_entry.size())

print(len(embeddings_dict))

In [None]:
# Save the embeddings dictionary to a .pt file
output_file_path = './data/embedding_vec_dict_241123.pt'
# torch.save does not create missing directories, so make sure ./data exists first.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
torch.save(embeddings_dict, output_file_path)

print(f"Embeddings dictionary saved to {output_file_path}")

In [None]:
from sklearn.decomposition import PCA
import torch

# Fix the key order once so that matrix rows and dictionary keys stay aligned.
embedding_keys = list(embeddings_dict.keys())

# Stack the per-key embeddings into one matrix with one row per key.
# Every tensor is moved to the CPU first: torch.stack requires all of its inputs
# to live on the same device, and the matrix is handed to scikit-learn as a
# NumPy array afterwards anyway.
embedding_matrix = torch.stack(
    [embeddings_dict[key].detach().cpu().squeeze(0) for key in embedding_keys]
)
embedding_matrix = embedding_matrix.reshape(embedding_matrix.size(0), -1)

# NOTE(review): the all-zero vector stored under key 0 takes part in this fit.
# PCA centers the data, so its reduced vector will generally not be all zeros —
# confirm that this is intended.
n_components = 128  # or 512, 256
# random_state makes the result reproducible when a randomized SVD solver is selected.
pca = PCA(n_components=n_components, random_state=0)
reduced_embeddings = pca.fit_transform(embedding_matrix.numpy())
reduced_embeddings = torch.tensor(reduced_embeddings, dtype=torch.float16)
embedding_vec_dict_reduced = {key: reduced_embeddings[i] for i, key in enumerate(embedding_keys)}

# The file name follows n_components so that it always matches the stored dimensionality.
reduced_output_path = f'./data/embedding_vec_dict_241123_reduced_{n_components}.pt'
# torch.save does not create missing directories, so make sure ./data exists first.
os.makedirs(os.path.dirname(reduced_output_path), exist_ok=True)
torch.save(embedding_vec_dict_reduced, reduced_output_path)