<a href="https://colab.research.google.com/github/tomonari-masada/course2024-nlp/blob/main/04_text_classification_with_LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLMを使ったテキスト分類
* テキスト分類はBoWでも良い性能を出せることが多い。
  * LLMを使って文書分類するときは、BoW+SVMの性能と比較した方が良い。
  * なぜなら、両者の分類性能に大きな差がつかないことも、しばしばあるからである。

## インストール
* テキスト埋め込みにSentence Transformersというライブラリを使う。

In [None]:
!pip install -U sentence-transformers datasets

## データセット
* ライブドアニュースコーパスを使う。
  * 前々回と同じ。

In [None]:
from datasets import load_dataset

# Fetch the livedoor news corpus from the Hugging Face Hub.
# The ratio / seed / shuffle keywords are handed to the dataset's own loading
# script (which is why trust_remote_code=True is required); presumably they
# request a shuffled 80/10/10 train/validation/test split — confirm against
# the dataset card.
ds = load_dataset(
    path="shunk031/livedoor-news-corpus",
    trust_remote_code=True,
    shuffle=True,
    random_state=42,
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
)

In [None]:
# Show the loaded dataset object's repr to inspect what was loaded.
ds

In [None]:
# Category names of the livedoor news corpus.
# NOTE(review): presumably position i corresponds to integer label i in the
# dataset's "category" column — confirm against the dataset card.
category_names = [
    "movie-enter",
    "it-life-hack",
    "kaden-channel",
    "topic-news",
    "livedoor-homme",
    "peachy",
    "sports-watch",
    "dokujo-tsushin",
    "smax",
]

In [None]:
# Peek at the body text of the first training article.
# Index the row first, then the column: this reads a single example instead of
# materializing the entire "content" column just to take element 0.
ds["train"][0]["content"]

## 埋め込みのための言語モデル
* MTEBリーダーボード: https://huggingface.co/spaces/mteb/leaderboard

In [None]:
from sentence_transformers import SentenceTransformer

# Load the text-embedding model by its Hugging Face Hub id.
model = SentenceTransformer(
    model_name_or_path="intfloat/multilingual-e5-large-instruct",
)

* テキストの埋め込み
  * Google Colab無料版で10分弱。
  * iMac Apple M3で18分。

In [None]:
# Embed the text of every training article. This takes several minutes, so the
# progress bar is enabled.
train_embeddings = model.encode(
    sentences=ds["train"]["content"],
    show_progress_bar=True,
)

In [None]:
import numpy as np

# Persist the training embeddings to disk in .npy format so they can be
# reloaded later without re-encoding.
np.save('train_embeddings.npy', train_embeddings)

In [None]:
# Embed the validation and test articles (validation first, then test), with a
# progress bar for each split.
validation_embeddings, test_embeddings = (
    model.encode(sentences=ds[split]["content"], show_progress_bar=True)
    for split in ("validation", "test")
)

In [None]:
# Persist the validation and test embeddings to disk in .npy format.
np.save('validation_embeddings.npy', validation_embeddings)
np.save('test_embeddings.npy', test_embeddings)

In [None]:
# numpy is imported here as well so that this cell works on a fresh kernel
# when the slow encoding/saving cells are skipped and the saved files are reused.
import numpy as np

# Reload the embeddings of all three splits from the .npy files on disk.
train_embeddings = np.load('train_embeddings.npy')
validation_embeddings = np.load('validation_embeddings.npy')
test_embeddings = np.load('test_embeddings.npy')

In [None]:
# Inspect the dimensions of the training embeddings array
# (presumably (number of training documents, embedding dimension) — TODO confirm).
train_embeddings.shape

## SVMによる分類
* 埋め込みベクトルを使う。

In [None]:
# Pool the train and validation splits into one set; the cross-validation
# below makes its own train/validation folds from it.
embeddings = np.concatenate((train_embeddings, validation_embeddings), axis=0)
labels = np.array([*ds["train"]["category"], *ds["validation"]["category"]])

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# 5 stratified, shuffled folds with a fixed seed, reused for every candidate C.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

# Try C = 1e-1, 1e0, ..., 1e3; print each fold's accuracy, then the mean.
for C in 10. ** np.arange(-1, 4):
    scores = []
    for fit_idx, eval_idx in skf.split(embeddings, labels):
        clf = LinearSVC(C=C, dual=False, max_iter=1000, random_state=123)
        clf.fit(embeddings[fit_idx], labels[fit_idx])
        score = clf.score(embeddings[eval_idx], labels[eval_idx])
        scores.append(score)
        print(f"\t{score:.3f}", end=" ")
    # End the row of fold scores, then summarize this C on one line.
    print(f"\nmean accuracy: {np.mean(scores):.3f} | C={C:.2e}")

In [None]:
# Refit on the pooled train+validation data with C=10.0 (presumably the value
# chosen from the cross-validation results — confirm), then report accuracy on
# the held-out test split.
clf = LinearSVC(
    C=10.0,
    dual=False,
    max_iter=1000,
    random_state=123,
).fit(embeddings, labels)
score = clf.score(test_embeddings, ds["test"]["category"])
print(f"{score:.3f}")