<a href="https://colab.research.google.com/github/tomonari-masada/course2024-nlp/blob/main/EDA_with_ELYZA_japanese_Llama_2_7b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tqdm.auto import tqdm

import spacy
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    set_seed,
    BitsAndBytesConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)
from transformers.modeling_outputs import ModelOutput
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Fix the random seed through transformers' set_seed helper so repeated runs are comparable.
set_seed(123)

In [None]:
def accuracy(model, tokenizer, corpus, labels, batch_size=4):
    """Compute the classification accuracy of ``model.pretrained`` on ``corpus``.

    Args:
        model: Module exposing the classifier as ``model.pretrained``; calling
            it with the tokenizer output must return an object with ``.logits``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, return_tensors="pt")``; its result
            is unpacked as keyword arguments of the classifier.
        corpus: Sliceable sequence of input texts.
        labels: Sliceable sequence of integer class ids aligned with ``corpus``.
        batch_size: Number of texts scored per forward pass.

    Returns:
        float: Fraction of texts whose arg-max logit equals the label, or
        ``nan`` when ``corpus`` is empty.
    """
    # Remember the caller's mode so evaluation does not silently switch an
    # eval-mode model into training mode.
    was_training = model.training
    model.eval()
    num_correct_answers, num_answers = 0, 0
    try:
        for i in tqdm(range(0, len(corpus), batch_size)):
            texts = corpus[i:i+batch_size]
            encodings = tokenizer(texts, padding=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model.pretrained(**encodings)
            predicted = outputs.logits.argmax(-1)
            # Build the targets on the same device as the predictions so the
            # comparison works wherever the logits end up.
            category = torch.tensor(
                labels[i:i+batch_size], device=predicted.device,
            )
            num_correct_answers += int((predicted == category).sum())
            num_answers += len(texts)
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    if num_answers == 0:
        # Nothing was scored; avoid a ZeroDivisionError.
        return float("nan")
    return num_correct_answers / num_answers

In [None]:
def embed(model, tokenizer, corpus, batch_size=4):
    """Return one pooled hidden-state vector per text in ``corpus``.

    Each batch is run through ``model.pretrained.model`` and, for every row,
    the hidden state at index ``(first pad position - 1) mod seq_len`` is kept.
    A row containing no pad token therefore uses its final position.

    Args:
        model: Module exposing ``model.pretrained.model`` (returns an object
            with ``.last_hidden_state``) and
            ``model.pretrained.config.pad_token_id``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, return_tensors="pt")``; the result
            must expose ``.input_ids`` and be unpackable as keyword arguments.
        corpus: Sliceable sequence of input texts.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        numpy.ndarray: Array with one row per text. For an empty ``corpus`` an
        array with zero rows is returned (width taken from
        ``config.hidden_size`` when present, else 0).
    """
    # Remember the caller's mode so embedding does not leave an eval-mode
    # model in training mode.
    was_training = model.training
    model.eval()
    pooled_hidden_states = []
    try:
        # Loop-invariant: look the pad id up once.
        pad_token_id = model.pretrained.config.pad_token_id
        for i in tqdm(range(0, len(corpus), batch_size)):
            texts = corpus[i:i+batch_size]
            encodings = tokenizer(texts, padding=True, return_tensors="pt")
            with torch.no_grad():
                outputs = model.pretrained.model(**encodings)
            last_hidden_state = outputs.last_hidden_state
            input_ids = encodings.input_ids
            # argmax yields the first pad position (0 when there is none);
            # subtracting 1 modulo the width selects the token just before it,
            # wrapping to the last position for unpadded rows.
            sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1)
            sequence_lengths = (sequence_lengths - 1) % input_ids.shape[-1]
            # Index tensors must live on the same device as the indexed tensor.
            sequence_lengths = sequence_lengths.to(last_hidden_state.device)
            temp_batch_size = input_ids.shape[0]
            pooled_hidden_state = last_hidden_state[
                torch.arange(temp_batch_size, device=last_hidden_state.device),
                sequence_lengths]
            # Cast to float32 before converting to NumPy.
            pooled_hidden_state = pooled_hidden_state.float().cpu().numpy()
            pooled_hidden_states.append(pooled_hidden_state)
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    if not pooled_hidden_states:
        # np.concatenate rejects an empty list; return an empty matrix instead.
        hidden_size = getattr(model.pretrained.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.concatenate(pooled_hidden_states)

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Counts are the sum of ``numel()`` over ``model.named_parameters()``; a
    parameter is trainable when ``requires_grad`` is set.

    NOTE(review): ``numel()`` counts stored tensor elements; for packed or
    quantized weights this may differ from the logical parameter count —
    confirm before quoting these numbers.

    Args:
        model: Any object providing ``named_parameters()``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A module without parameters would otherwise divide by zero.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} "
        f"|| all params: {all_param} "
        f"|| trainable%: {percentage}"
    )

In [None]:
def show_trainable_parameters(model, show_all=False):
    """Print the names of the parameters of ``model``, one per line.

    Args:
        model: Any object providing ``named_parameters()``.
        show_all: When true, list every parameter; otherwise only those with
            ``requires_grad`` set.
    """
    selected_names = (
        name
        for name, tensor in model.named_parameters()
        if tensor.requires_grad or show_all
    )
    for name in selected_names:
        print(name)

In [None]:
# Load the livedoor news corpus with an 80/10/10 train/validation/test split,
# shuffled with a fixed random_state.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script
# to run — confirm the source is trusted (or pin a revision).
dataset = load_dataset(
    "shunk031/livedoor-news-corpus",
    train_ratio=0.8, val_ratio=0.1, test_ratio=0.1,
    random_state=42,
    shuffle=True,
    trust_remote_code=True,
)
# Number of distinct category ids seen in the training split; used below as
# the classifier's num_labels.
num_categories = len(set(dataset["train"]["category"]))
# Sequence-length limit passed to the tokenizer and the trainer further down.
max_seq_length = 512

* Reference for the 4-bit quantization options used below: [`BitsAndBytesConfig` documentation](https://huggingface.co/docs/transformers/v4.41.2/en/main_classes/quantization#transformers.BitsAndBytesConfig)

In [None]:
model_name = "elyza/ELYZA-japanese-Llama-2-7b"

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16 and
# the packed weights are stored as bfloat16 tensors.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Load the checkpoint with a sequence-classification head sized to the number
# of news categories.
pretrained = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_categories,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# `model_max_length` is the tokenizer's length-limit keyword; the previous
# `max_seq_length=` keyword is not a tokenizer option, so the limit was never
# applied to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, model_max_length=max_seq_length,
)
# Reuse EOS as the padding token on both the tokenizer and the model config so
# padded batches can be built and the classifier can locate the pad positions.
tokenizer.pad_token = tokenizer.eos_token
pretrained.config.pad_token_id = pretrained.config.eos_token_id



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at elyza/ELYZA-japanese-Llama-2-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the module tree of the loaded classifier (bare expression -> rich repr).
pretrained

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()


In [None]:
# Parameter counts of the quantized base classifier, before the trainer wraps it with LoRA.
print_trainable_parameters(pretrained)

trainable params: 131375104 || all params: 1750376448 || trainable%: 7.505534260936308


In [None]:
# List the base-model parameters that currently have requires_grad set.
show_trainable_parameters(pretrained)

model.embed_tokens.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.input_layernorm.weight
model.layers.2.post_attention_layernorm.weight
model.layers.3.input_layernorm.weight
model.layers.3.post_attention_layernorm.weight
model.layers.4.input_layernorm.weight
model.layers.4.post_attention_layernorm.weight
model.layers.5.input_layernorm.weight
model.layers.5.post_attention_layernorm.weight
model.layers.6.input_layernorm.weight
model.layers.6.post_attention_layernorm.weight
model.layers.7.input_layernorm.weight
model.layers.7.post_attention_layernorm.weight
model.layers.8.input_layernorm.weight
model.layers.8.post_attention_layernorm.weight
model.layers.9.input_layernorm.weight
model.layers.9.post_attention_layernorm.weight
model.layers.10.input_layernorm.weight
model.layers.10.post_attention_layernorm.weight
model.layers.11.input_layernorm.weigh

In [None]:
class MyNetForClassification(nn.Module):
    """Wrap a sequence classifier so its loss is computed from ``category``.

    The wrapped model is kept as ``self.pretrained`` and its config is
    re-exposed as ``self.config``.
    """

    def __init__(self, pretrained):
        """Store the wrapped classifier and mirror its ``config`` attribute.

        Args:
            pretrained: Model whose call returns an object with ``logits``,
                ``past_key_values``, ``hidden_states`` and ``attentions``.
        """
        super().__init__()
        self.pretrained = pretrained
        self.config = self.pretrained.config

    def forward(
        self, input_ids, category=None, attention_mask=None,
        output_attentions=None, output_hidden_states=None,
        return_dict=None, inputs_embeds=None, labels=None,
    ):
        """Run the wrapped classifier and attach a cross-entropy loss.

        Args:
            input_ids: Token ids forwarded positionally to the wrapped model.
            category: Target class ids. When ``None`` no loss is computed,
                which allows the wrapper to be used for inference.
            attention_mask, output_attentions, output_hidden_states:
                Forwarded unchanged to the wrapped model.
            return_dict: Accepted for interface compatibility; the wrapped
                model is always asked for a dict-style output because its
                fields are read by attribute below.
            inputs_embeds, labels: Accepted but not used by this wrapper.
                NOTE(review): presumably present to absorb extra keys supplied
                by the trainer's batch collation — confirm.

        Returns:
            ModelOutput with keys ``loss`` (``None`` without ``category``),
            ``logits``, ``past_key_values``, ``hidden_states``, ``attentions``.
        """
        outputs = self.pretrained(
            input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # A tuple output would break the attribute access below.
            return_dict=True,
        )

        # Only compute the loss when targets are supplied; calling
        # CrossEntropyLoss with a None target raises.
        loss = None
        if category is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(outputs.logits, category)
        return ModelOutput(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Wrap the loaded classifier so the forward pass takes its targets from `category`.
model = MyNetForClassification(pretrained)

In [None]:
# LoRA adapter settings: rank 32, alpha 32, dropout 0.1, no bias terms trained.
# target_modules names the attention projections (q/k/v/o) and MLP projections
# (gate/up/down), matching the Linear4bit layers in the printed module tree.
# NOTE(review): the classification head ("score") is not listed here — it is
# unfrozen by hand in a later cell; confirm that is intended.
peft_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
# Training schedule: 500 optimizer steps with 4 samples per device and 8
# accumulation steps (32 samples per device per update); evaluate, log and
# checkpoint every 100 steps and reload the best checkpoint at the end.
# label_names=["category"] names the batch key that holds the targets.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    output_dir="outputs_cls",
    label_names=["category"],
    max_steps=500,
    eval_steps=100,
    logging_steps=100,
    save_steps=100,
    learning_rate=5e-5,
    evaluation_strategy="steps",
    logging_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [None]:
# Build the trainer on the article titles ("title" field) and hand it the LoRA
# configuration to apply to the wrapped model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="title",
    peft_config=peft_config,
)
# Attach the class ids to the trainer's prepared datasets.
# NOTE(review): presumably the trainer's preprocessing keeps only the tokenized
# text, so "category" has to be re-added; this also assumes row order is
# unchanged by that preprocessing — confirm.
trainer.train_dataset = trainer.train_dataset.add_column(
    "category", dataset["train"]["category"],
)
trainer.eval_dataset = trainer.eval_dataset.add_column(
    "category", dataset["validation"]["category"],
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Parameter counts of the model held by the trainer (after the LoRA config was applied).
print_trainable_parameters(trainer.model)

trainable params: 79953920 || all params: 1830330368 || trainable%: 4.368278065962789


In [None]:
# List which parameters of the trainer's model are trainable at this point.
show_trainable_parameters(trainer.model)

base_model.model.pretrained.model.layers.0.self_attn.q_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.q_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.v_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.v_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.o_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.o_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.mlp.gate_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.mlp.gate_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.mlp.up_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.mlp.up_proj.lora_B.default.weight
base_model.model.pretrained.mode

In [None]:
# Inspect the (name, tensor) pairs of the classification head.
list(model.pretrained.score.named_parameters())

[('weight',
  Parameter containing:
  tensor([[-0.0277, -0.0182, -0.0232,  ..., -0.0117, -0.0247,  0.0282],
          [ 0.0176,  0.0106, -0.0070,  ..., -0.0302,  0.0188, -0.0125],
          [ 0.0138, -0.0232,  0.0195,  ...,  0.0371, -0.0304,  0.0184],
          ...,
          [ 0.0112, -0.0097, -0.0013,  ..., -0.0031, -0.0469,  0.0212],
          [ 0.0266,  0.0125,  0.0171,  ...,  0.0090, -0.0134, -0.0139],
          [ 0.0190,  0.0204, -0.0184,  ...,  0.0020, -0.0166,  0.0016]],
         device='cuda:0', dtype=torch.bfloat16))]

In [None]:
# Make the classification head's weight trainable alongside the LoRA adapters.
# NOTE(review): must run before trainer.train() so the optimizer picks it up — confirm.
model.pretrained.score.weight.requires_grad = True

In [None]:
# Re-list the trainable parameters after enabling gradients on the classification head.
show_trainable_parameters(trainer.model)

base_model.model.pretrained.model.layers.0.self_attn.q_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.q_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.k_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.k_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.v_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.v_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.self_attn.o_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.self_attn.o_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.mlp.gate_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.mlp.gate_proj.lora_B.default.weight
base_model.model.pretrained.model.layers.0.mlp.up_proj.lora_A.default.weight
base_model.model.pretrained.model.layers.0.mlp.up_proj.lora_B.default.weight
base_model.model.pretrained.mode

In [None]:
# Run fine-tuning with the schedule defined in training_args.
trainer.train()
# Optional (left disabled): save the trained adapter and reload it into `model`.
#trainer.model.save_pretrained("models/lora/" + model_name)
#model = PeftModel.from_pretrained(model, "models/lora/" + model_name)

Step,Training Loss,Validation Loss
100,1.0256,0.437011


KeyboardInterrupt: 

In [None]:
# Accuracy of the trainer's current model on the validation titles.
accuracy(trainer.model, tokenizer, dataset["validation"]["title"], dataset["validation"]["category"])

  0%|          | 0/185 [00:00<?, ?it/s]

0.8860244154930115

In [None]:
# One matrix of pooled title embeddings per dataset split, keyed by split name.
# L2 normalization (sklearn's `normalize`) is deliberately not applied here.
embeddings = {
    split: embed(trainer.model, tokenizer, dataset[split]["title"])
    for split in dataset
}

  0%|          | 0/1474 [00:00<?, ?it/s]

  0%|          | 0/185 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

In [None]:
# Parts of speech kept as candidate topic-label words.
label_pos_tags = ["NOUN", "VERB", "PROPN"]

nlp = spacy.load("ja_core_news_sm")

# For every split, reduce each title to the space-joined lemmas of its tokens
# whose POS tag is in label_pos_tags.
corpus = {
    split: [
        " ".join(
            tok.lemma_ for tok in nlp(title) if tok.pos_ in label_pos_tags
        )
        for title in tqdm(dataset[split]["title"])
    ]
    for split in dataset
}

  0%|          | 0/5894 [00:00<?, ?it/s]

  0%|          | 0/737 [00:00<?, ?it/s]

  0%|          | 0/736 [00:00<?, ?it/s]

In [None]:
# Fit TF-IDF on the lemmatized training titles; terms must occur in at least 10
# documents and in at most 10% of them. Case is left untouched.
vectorizer = TfidfVectorizer(
    min_df=10, max_df=0.1, lowercase=False,
).fit(corpus["train"])
vocab = np.array(vectorizer.get_feature_names_out())

# Dense TF-IDF matrix per split.
X = {
    split: vectorizer.transform(corpus[split]).toarray()
    for split in dataset
}

# Embed each vocabulary term as the TF-IDF-weighted average of the training
# title embeddings: every column of X["train"] is scaled to sum to one and then
# used as the mixing weights.
vocab_embeddings = np.dot(
    (X["train"] / X["train"].sum(0)).T,
    embeddings["train"],
)

In [None]:
# Cluster the training title embeddings into n_clusters groups (seeded for
# repeatability) and keep the centroids for labeling.
n_clusters = 30
kmeans = KMeans(
    n_clusters=n_clusters, n_init='auto', random_state=123,
).fit(embeddings["train"])
centers = kmeans.cluster_centers_

In [None]:
# Count how many training titles fall in each cluster and show the sizes in
# ascending order.
unique, counts = np.unique(kmeans.labels_, return_counts=True)
size_dict = dict(zip(unique, counts))
print(sorted(size_dict.values()))

[43, 57, 61, 67, 67, 74, 103, 134, 148, 153, 155, 176, 181, 183, 184, 194, 216, 221, 225, 229, 233, 258, 263, 298, 307, 312, 313, 341, 342, 356]


In [None]:
# For each cluster centroid, list the 20 vocabulary terms whose embeddings are
# most cosine-similar to it (one space-separated line per cluster).
topic_words = []
similarities = cosine_similarity(vocab_embeddings, centers)
for i in range(similarities.shape[-1]):
    # Negate so argsort yields the most similar terms first.
    indices = np.argsort(- similarities[:,i])
    topic_words.append(" ".join(vocab[indices[:20]]))
# The terms are Japanese text, so write UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open("topic_words.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(topic_words))
