<a href="https://colab.research.google.com/github/tomonari-masada/course2024-nlp/blob/main/02_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# テキスト分類
* BoWでも良い性能を出せることが多い。
  * LLMを使って文書分類するときは、BoW+SVMの性能と比較した方が良い。
  * なぜなら、分類性能に大きな差がつかないことも、しばしばあるので。

## spaCyのインストール

* 最小限のインストール
  * 英語だけ扱えるようになる。

In [None]:
# Uncomment the next line if spaCy itself still has to be installed or upgraded.
#!pip install -U spacy
# Download spaCy's en_core_web_sm pipeline (the minimal, English-only setup).
!python -m spacy download en_core_web_sm

* spaCyで日本語を扱えるようにする。
  * Sudachi（SudachiPy）という形態素解析器が使えるようになる。

In [None]:
# Download spaCy's ja_core_news_sm pipeline; it is loaded later with spacy.load().
!python -m spacy download ja_core_news_sm

## データセット
* ライブドアニュースコーパスの本文部分を使う。
  * 9値分類。

In [None]:
from datasets import load_dataset

# Options controlling the 80/10/10 train/validation/test split.
# NOTE(review): random_state=42 presumably pins the shuffle so the split is
# reproducible — confirm against the dataset's loading script.
split_options = dict(
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    shuffle=True,
    random_state=42,
)

# NOTE(review): trust_remote_code=True lets code shipped with the dataset
# repository run locally; only keep it for sources you trust.
ds = load_dataset(
    "shunk031/livedoor-news-corpus",
    trust_remote_code=True,
    **split_options,
)

In [None]:
# Show the loaded dataset object as the cell output.
ds

In [None]:
# Names of the nine livedoor news categories.
# NOTE(review): presumably listed in the order of the dataset's integer
# category ids — confirm before using this list to decode labels.
category_names = [
    "movie-enter", "it-life-hack", "kaden-channel",
    "topic-news", "livedoor-homme", "peachy",
    "sports-watch", "dokujo-tsushin", "smax",
]

In [None]:
# Peek at the body text of the first training article.
ds["train"][0]["content"]

## 形態素解析

In [None]:
import spacy

# Load the Japanese pipeline and run it on the first training article.
nlp = spacy.load("ja_core_news_sm")
doc = nlp(ds["train"]["content"][0])

# Print every token's lemma followed by one space, all on a single line.
print("".join(f"{tok.lemma_} " for tok in doc), end="")

In [None]:
from tqdm.auto import tqdm

# Turn each training article into one string of space-separated lemmas,
# with a progress bar over the articles.
corpus_train = [
  " ".join(tok.lemma_ for tok in nlp(article))
  for article in tqdm(ds["train"]["content"])
]

In [None]:
# Cache the lemmatized training corpus on disk, one document per line, so the
# slow lemmatization step does not have to be repeated.
# The file is line-oriented: a CR/LF inside a document would split it into
# several lines on reload and desynchronise texts and labels, so such
# characters are replaced by a space before writing.
# UTF-8 is requested explicitly so the Japanese text is written the same way
# regardless of the platform's default encoding.
with open('livedoor-news-corpus_content_lemmatized.txt', 'w', encoding='utf-8') as f:
  for lemmatized in corpus_train:
    f.write(lemmatized.replace("\r", " ").replace("\n", " ") + "\n")

In [None]:
# Reload the lemmatized training corpus from the cache file (one document per
# line). UTF-8 is requested explicitly so decoding does not depend on the
# platform's default encoding.
with open('livedoor-news-corpus_content_lemmatized.txt', 'r', encoding='utf-8') as f:
  # Strip the line terminator so every entry equals the string that was
  # written, instead of carrying a trailing "\n".
  corpus_train = [line.rstrip("\n") for line in f]

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# First look at TF-IDF features with one fixed pair of document-frequency
# cut-offs: min_df is an absolute document count, max_df a proportion.
vectorizer = TfidfVectorizer(
  max_df=0.2,
  min_df=10,
)
X = vectorizer.fit_transform(corpus_train)

In [None]:
# Shape of the TF-IDF matrix computed from corpus_train.
# NOTE(review): presumably (number of training documents, vocabulary size) — confirm.
X.shape

In [None]:
# Lemmatize the validation and test articles the same way as the training
# articles: one string of space-separated lemmas per article.
corpus_val = [
  " ".join(tok.lemma_ for tok in nlp(article))
  for article in tqdm(ds["validation"]["content"])
]

corpus_test = [
  " ".join(tok.lemma_ for tok in nlp(article))
  for article in tqdm(ds["test"]["content"])
]

In [None]:
import numpy as np

# Pool the train and validation splits; the cross-validation cells index into
# these arrays with their own fold indices.
corpus = np.array(corpus_train + corpus_val)
labels = np.array(ds["train"]["category"] + ds["validation"]["category"])

# Texts and labels must stay aligned one-to-one; fail here with the two sizes
# rather than later inside the cross-validation loop.
assert len(corpus) == len(labels), (len(corpus), len(labels))

# Last expression of the cell, so the pooled corpus size is actually displayed.
len(corpus)

## ハイパーパラメータのチューニング
* SVMの正則化パラメータ`C`
* TfidfVectorizerの`min_df`と`max_df`

### 1回目：粗い探索（`min_df`=10〜30、`max_df`=0.2〜0.4）

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# Grid search, round 1: TfidfVectorizer's min_df / max_df and LinearSVC's C,
# scored by 5-fold stratified cross-validation accuracy on the pooled
# train + validation data.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
# With an integer random_state every skf.split() call yields the same folds,
# so they are materialised once and reused for every hyperparameter setting.
folds = list(skf.split(corpus, labels))

for min_df in [10, 20, 30]:
  for max_df in [0.2, 0.3, 0.4]:
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    # The TF-IDF features depend only on (min_df, max_df, fold), not on C, so
    # they are built once here instead of refitting the vectorizer for every C.
    # The vectorizer is fitted on the training part of each fold only.
    fold_data = []
    for train_index, val_index in folds:
      X_train = vectorizer.fit_transform(corpus[train_index])
      X_val = vectorizer.transform(corpus[val_index])
      fold_data.append((X_train, labels[train_index], X_val, labels[val_index]))
    for C in 10. ** np.arange(-1, 4):
      scores = []
      for X_train, y_train, X_val, y_val in fold_data:
        clf = LinearSVC(C=C, dual=False, max_iter=1000, random_state=123)
        clf.fit(X_train, y_train)
        score = clf.score(X_val, y_val)
        print(f"\t{score:.3f}", end=" ")
        scores.append(score)
      # One summary line per setting: mean fold accuracy and the setting itself.
      print(f"\nmean accuracy: {np.array(scores).mean():.3f}", end="")
      print(f" | C={C:.2e} min_df={min_df} max_df={max_df:.3f}")

### 2回目：探索範囲をずらして再探索

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# Grid search, round 2: same procedure as before with smaller min_df and
# larger max_df candidates, scored by 5-fold stratified cross-validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
# With an integer random_state every skf.split() call yields the same folds,
# so they are materialised once and reused for every hyperparameter setting.
folds = list(skf.split(corpus, labels))

for min_df in [5, 10, 15]:
  for max_df in [0.3, 0.4, 0.5]:
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    # The TF-IDF features depend only on (min_df, max_df, fold), not on C, so
    # they are built once here instead of refitting the vectorizer for every C.
    # The vectorizer is fitted on the training part of each fold only.
    fold_data = []
    for train_index, val_index in folds:
      X_train = vectorizer.fit_transform(corpus[train_index])
      X_val = vectorizer.transform(corpus[val_index])
      fold_data.append((X_train, labels[train_index], X_val, labels[val_index]))
    for C in 10. ** np.arange(-1, 4):
      scores = []
      for X_train, y_train, X_val, y_val in fold_data:
        clf = LinearSVC(C=C, dual=False, max_iter=1000, random_state=123)
        clf.fit(X_train, y_train)
        score = clf.score(X_val, y_val)
        print(f"\t{score:.3f}", end=" ")
        scores.append(score)
      # One summary line per setting: mean fold accuracy and the setting itself.
      print(f"\nmean accuracy: {np.array(scores).mean():.3f}", end="")
      print(f" | C={C:.2e} min_df={min_df} max_df={max_df:.3f}")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

# Grid search, round 3: C is held at 10.0 while larger min_df / max_df values
# are tried, again scored by 5-fold stratified cross-validation accuracy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

for min_df in [15, 20, 30]:
  for max_df in [0.5, 0.7, 1.0]:
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    for C in [10.0]:
      scores = []
      for fit_idx, eval_idx in skf.split(corpus, labels):
        # Fit TF-IDF on the training part of the fold only, then reuse the
        # fitted vocabulary and weights for the held-out part.
        clf = LinearSVC(C=C, dual=False, max_iter=1000, random_state=123)
        clf.fit(vectorizer.fit_transform(corpus[fit_idx]), labels[fit_idx])
        fold_accuracy = clf.score(vectorizer.transform(corpus[eval_idx]), labels[eval_idx])
        print(f"\t{fold_accuracy:.3f}", end=" ")
        scores.append(fold_accuracy)
      # Summary line: mean fold accuracy followed by the setting that produced it.
      print(f"\nmean accuracy: {np.mean(scores):.3f} | C={C:.2e} min_df={min_df} max_df={max_df:.3f}")

In [None]:
# Refit on the pooled train + validation data with min_df=20 and C=10.0,
# then report accuracy on the test split.
vectorizer = TfidfVectorizer(min_df=20)
clf = LinearSVC(C=10.0, dual=False, max_iter=1000, random_state=123)
clf.fit(vectorizer.fit_transform(corpus), labels)

# The test texts are only transformed with the already-fitted vectorizer.
test_accuracy = clf.score(vectorizer.transform(corpus_test), ds["test"]["category"])
print(f"{test_accuracy:.3f}")


# 課題
* この分類性能を改良できるかどうか、試行錯誤してみてください。
  * 注意：データセットの分割の仕方は変えないように。