<a href="https://colab.research.google.com/github/tomonari-masada/21K12017/blob/main/topic_modeling_with_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tqdm.auto import tqdm

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    set_seed,
    BitsAndBytesConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
)
from transformers.modeling_outputs import ModelOutput
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Fix the random seed through transformers' `set_seed` helper so that repeated
# runs of the notebook start from the same random state.
set_seed(123)

In [None]:
def accuracy(trainer, dataset, batch_size=4):
    """Compute the classification accuracy of ``trainer.model`` on ``dataset``.

    Titles are tokenized batch by batch with ``trainer.tokenizer`` and passed
    to the model together with their ``category`` ids; the argmax of the
    returned logits is compared with the gold categories.

    Args:
        trainer: Object exposing ``model`` (called as
            ``model(**encodings, category=...)`` and returning an object with
            a ``logits`` attribute) and ``tokenizer``.
        dataset: Sliceable dataset whose slices are dicts holding a
            ``"title"`` list and a ``"category"`` list.
        batch_size: Number of examples per forward pass.

    Returns:
        float: Fraction of correctly classified examples, or ``nan`` when
        ``dataset`` contains no examples.
    """
    model = trainer.model
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally switching the model to training mode.
    was_training = model.training
    model.eval()
    num_correct_answers, num_answers = 0, 0
    try:
        for i in tqdm(range(0, len(dataset), batch_size)):
            examples = dataset[i:i+batch_size]
            encodings = trainer.tokenizer(
                examples["title"],
                padding=True,
                return_tensors="pt",
            )
            category = torch.tensor(examples["category"])
            with torch.no_grad():
                outputs = trainer.model(**encodings, category=category)
            predicted = outputs.logits.argmax(-1)
            # Compare on the logits' device and accumulate plain Python ints.
            matches = predicted == category.to(predicted.device)
            num_correct_answers += matches.sum().item()
            num_answers += len(examples["category"])
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    if num_answers == 0:
        return float("nan")
    return num_correct_answers / num_answers

In [None]:
def embed(trainer, dataset, batch_size=4):
    """Embed every title of ``dataset`` with the backbone of ``trainer.model``.

    Each batch of titles is tokenized with ``trainer.tokenizer`` and run
    through ``trainer.model.pretrained.model``. One hidden state per title is
    kept: the one at the position just before the first pad token. When the
    first pad token sits at index 0, or the row contains no pad token at all,
    the modulo wraps the index to the last position of the row.

    Args:
        trainer: Object exposing ``model`` (with ``pretrained.model`` and
            ``pretrained.config.pad_token_id``) and ``tokenizer``.
        dataset: Sliceable dataset whose slices are dicts holding a
            ``"title"`` list.
        batch_size: Number of titles per forward pass.

    Returns:
        numpy.ndarray: float32 array with one row per title, in dataset order
        (assumes the backbone returns a ``(batch, seq, hidden)`` tensor —
        TODO confirm).

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("embed() needs at least one example to embed")
    model = trainer.model
    # Both lookups are loop-invariant, so resolve them once.
    backbone = model.pretrained.model
    pad_token_id = model.pretrained.config.pad_token_id
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally switching the model to training mode.
    was_training = model.training
    model.eval()
    pooled_hidden_states = []
    try:
        for i in tqdm(range(0, len(dataset), batch_size)):
            examples = dataset[i:i+batch_size]
            encodings = trainer.tokenizer(
                examples["title"],
                padding=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                outputs = backbone(**encodings)
            last_hidden_state = outputs.last_hidden_state
            input_ids = encodings.input_ids
            # Index of the first pad token per row (0 when there is none) ...
            sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1)
            # ... minus one, wrapped so that 0 maps to the last position.
            sequence_lengths = (sequence_lengths - 1) % input_ids.shape[-1]
            sequence_lengths = sequence_lengths.to(last_hidden_state.device)
            temp_batch_size = input_ids.shape[0]
            pooled_hidden_state = last_hidden_state[
                torch.arange(temp_batch_size, device=last_hidden_state.device),
                sequence_lengths]
            pooled_hidden_states.append(pooled_hidden_state.float().cpu().numpy())
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    return np.concatenate(pooled_hidden_states)

In [None]:
# Load the livedoor news corpus, shuffled with a fixed seed and split
# 80 / 10 / 10 according to the ratios below.
split_options = {
    "train_ratio": 0.8,
    "val_ratio": 0.1,
    "test_ratio": 0.1,
    "random_state": 42,
    "shuffle": True,
}
dataset = load_dataset(
    "shunk031/livedoor-news-corpus",
    trust_remote_code=True,
    **split_options,
)

# The number of output classes is the number of distinct category ids that
# occur in the training split.
train_categories = dataset["train"]["category"]
num_categories = len(set(train_categories))

# Maximum sequence length handed to the tokenizer / trainer further down.
max_seq_length = 512

In [None]:
model_name = "elyza/ELYZA-japanese-Llama-2-7b"

# 4-bit NF4 quantization with double quantization; bfloat16 is used both for
# compute and as the quantized-storage dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_storage=torch.bfloat16,
)

# Load the checkpoint with a classification head sized for `num_categories`.
pretrained = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_categories,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# `model_max_length` is the tokenizer's documented length-limit argument;
# the previously used `max_seq_length` keyword is not a tokenizer parameter,
# so the intended limit was never registered on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, model_max_length=max_seq_length,
)
# Reuse the end-of-sequence token for padding, on the tokenizer and in the
# model config alike (presumably the checkpoint defines no dedicated pad
# token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
pretrained.config.pad_token_id = pretrained.config.eos_token_id



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at elyza/ELYZA-japanese-Llama-2-7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class MyNetForClassification(nn.Module):
    """Wrapper around a sequence-classification model that takes its targets
    from a ``category`` argument and computes the cross-entropy loss itself.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, pretrained):
        """Store the wrapped model and expose its config.

        Args:
            pretrained: Model called as ``pretrained(input_ids, ...)`` whose
                output exposes ``logits``, ``past_key_values``,
                ``hidden_states`` and ``attentions``.
        """
        super().__init__()
        self.pretrained = pretrained
        # Mirror the wrapped model's config on the wrapper itself.
        self.config = self.pretrained.config

    def forward(
        self, input_ids, category=None, attention_mask=None,
        output_attentions=None, output_hidden_states=None,
        return_dict=None, inputs_embeds=None, labels=None,
    ):
        """Run the wrapped model and, when targets are given, compute the loss.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            category: Optional class-id targets for the cross-entropy loss.
            attention_mask: Forwarded to the wrapped model.
            output_attentions: Forwarded to the wrapped model.
            output_hidden_states: Forwarded to the wrapped model.
            return_dict: Accepted for signature compatibility. The wrapped
                model is always asked for attribute-style outputs because the
                fields are read by name below.
            inputs_embeds: Accepted for signature compatibility; not used.
            labels: Accepted for signature compatibility; not used. Targets
                come from ``category`` only (presumably so that token-level
                labels produced by the trainer's collator are not mistaken
                for class targets — TODO confirm).

        Returns:
            ModelOutput with ``loss`` (``None`` when ``category`` is not
            given), ``logits``, ``past_key_values``, ``hidden_states`` and
            ``attentions``.
        """
        outputs = self.pretrained(
            input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # Only compute a loss when targets are available, so the wrapper can
        # also be called for pure inference.
        loss = None
        if category is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(outputs.logits, category.to(outputs.logits.device))
        return ModelOutput(
            loss=loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Wrap the loaded classifier so that targets are supplied through the
# `category` argument of the wrapper's forward().
model = MyNetForClassification(pretrained)

In [None]:
# Display the device the loaded model was placed on.
pretrained.device

device(type='cuda', index=0)

In [None]:
# LoRA settings for the sequence-classification task: rank 32, alpha 32,
# dropout 0.1, no bias adaptation. Adapters are attached to every module
# whose name is listed below.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

In [None]:
# Evaluation, logging and checkpointing all happen on the same step interval.
step_interval = 100

training_args = TrainingArguments(
    output_dir="outputs_cls",
    # Column that carries the targets (see the `category` forward argument).
    label_names=["category"],
    # Optimisation schedule.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    max_steps=500,
    # Step-based evaluation / logging / saving.
    evaluation_strategy="steps",
    eval_steps=step_interval,
    logging_strategy="steps",
    logging_steps=step_interval,
    save_strategy="steps",
    save_steps=step_interval,
    load_best_model_at_end=True,
)

In [None]:
# Build the trainer on the title text of the train / validation splits.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="title",
    max_seq_length=max_seq_length,
)

# Attach the gold category ids to the datasets held by the trainer
# (presumably they no longer carry that column after the trainer's own
# preprocessing — TODO confirm).
for attr_name, split_name in (
    ("train_dataset", "train"),
    ("eval_dataset", "validation"),
):
    with_category = getattr(trainer, attr_name).add_column(
        "category", dataset[split_name]["category"],
    )
    setattr(trainer, attr_name, with_category)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run the training loop; the value returned by train() is displayed as the
# cell result.
trainer.train()
# Disabled: save the trained adapter weights to disk and load them back.
#trainer.model.save_pretrained("models/lora/" + model_name)
#model = PeftModel.from_pretrained(model, "models/lora/" + model_name)

Step,Training Loss,Validation Loss
100,1.1999,0.588283
200,0.5101,0.434811
300,0.4268,0.353464
400,0.3057,0.297147
500,0.1797,0.300695


TrainOutput(global_step=500, training_loss=0.5244128608703613, metrics={'train_runtime': 285.8844, 'train_samples_per_second': 27.983, 'train_steps_per_second': 1.749, 'total_flos': 0.0, 'train_loss': 0.5244128608703613, 'epoch': 1.3568521031207599})

In [None]:
# Classification accuracy of the trained model on the validation split.
accuracy(trainer, dataset["validation"])

  0%|          | 0/185 [00:00<?, ?it/s]

0.9131614565849304

In [None]:
# One embedding matrix per dataset split, keyed by split name.
embeddings = {
    split_name: embed(trainer, dataset[split_name])
    for split_name in dataset
}

  0%|          | 0/1474 [00:00<?, ?it/s]

  0%|          | 0/185 [00:00<?, ?it/s]

  0%|          | 0/184 [00:00<?, ?it/s]

In [None]:
# Only tokens with one of these part-of-speech tags are kept as candidate
# label words.
label_pos_tags = ["NOUN", "VERB", "PROPN"]

nlp = spacy.load("ja_core_news_sm")

# For every split, turn each title into a space-joined string of the lemmas
# of its kept tokens.
corpus = {}
for key in dataset:
    titles = dataset[key]["title"]
    # nlp.pipe streams the texts through the pipeline in batches and yields
    # the documents in input order, which avoids one pipeline call per title.
    corpus[key] = [
        " ".join(
            token.lemma_ for token in doc if token.pos_ in label_pos_tags
        )
        for doc in tqdm(nlp.pipe(titles), total=len(titles))
    ]

  0%|          | 0/5894 [00:00<?, ?it/s]

  0%|          | 0/737 [00:00<?, ?it/s]

  0%|          | 0/736 [00:00<?, ?it/s]

In [None]:
# Fit TF-IDF on the training lemmas only (terms must occur in at least 10
# training titles; case is left untouched).
vectorizer = TfidfVectorizer(min_df=10, lowercase=False)
vectorizer.fit(corpus["train"])
vocab = np.array(vectorizer.get_feature_names_out())

# Dense TF-IDF matrix for every split, keyed by split name.
X = {
    split_name: vectorizer.transform(corpus[split_name]).toarray()
    for split_name in dataset
}

# Embed each vocabulary term as the TF-IDF-weighted average of the training
# title embeddings: every term column is scaled to sum to one before the
# product with the embedding matrix.
train_term_weights = X["train"] / X["train"].sum(0)
vocab_embeddings = np.dot(train_term_weights.T, embeddings["train"])

In [None]:
# Cluster the training title embeddings; the cluster centres act as topics.
n_clusters = 20
kmeans = KMeans(
    n_clusters=n_clusters,
    n_init='auto',
    random_state=123,
).fit(embeddings["train"])
centers = kmeans.cluster_centers_

In [None]:
# Cosine similarity between every vocabulary term and every cluster centre;
# each column of the result belongs to one cluster.
similarities = cosine_similarity(vocab_embeddings, centers)

# Number of highest-scoring terms printed per cluster.
top_n = 20
for cluster_similarities in similarities.T:
    # Sort descending by negating the scores, then keep the leading terms.
    top_indices = np.argsort(-cluster_similarities)[:top_n]
    print(" ".join(vocab[top_indices]))
    print("-" * 80)

デジ Mac Ubuntu 機能 容量 使える デバイス クラウド ソフト デザイン 動画 使う バッテリー カメラ PC ipad 専用 ダウンロード IT データ
--------------------------------------------------------------------------------
たち 事情 独女 女子 しまう 結婚 モテる 男性 本音 出会い アリ 運命 男子 効く 職場 合う ホント 女性 もの こと
--------------------------------------------------------------------------------
話題 アップル 発生 原因 インターネット 今度 パソコン 家電 視聴 SNS センター 大丈夫 パナソニック 電子 未来 売れる ソニー テレビ 節電 VS
--------------------------------------------------------------------------------
ニュース 売れる 事故 被害 韓国 影響 炎上 広がる テレビ ツイッター 前田 ネット 名前 激怒 期待 事件 放送 過去 ちゃん コメント
--------------------------------------------------------------------------------
選手 真司 言及 五輪 明かす ファン 代表 サッカー 報道 絶賛 なでしこ チーム 香川 怒る 連発 アナ ロンドン 日本 石井 怒り
--------------------------------------------------------------------------------
Android インチ 予定 向け 開始 プラチナ OS 月額 ソフトバンク イー lte 対応 搭載 wimax 機種 ドコモ スマートフォン tab タブレット 追加
--------------------------------------------------------------------------------
SPORTS watch 星野 斎藤 巨人 岡田 引退 ダルビッシュ ノム 試合 開幕 長友 本田 吉田 監督 野球 