# 89. BERTによるテキスト分類


## 共通部分

In [1]:
# Download the News Aggregator dataset archive from the UCI repository.
import gdown

dataset_url = 'https://archive.ics.uci.edu/static/public/359/news+aggregator.zip'
archive_name = 'news_dataset.zip'
# The call is the last expression of the cell, so its return value is displayed.
gdown.download(url=dataset_url, output=archive_name, quiet=False)

Downloading...
From: https://archive.ics.uci.edu/static/public/359/news+aggregator.zip
To: /content/news_dataset.zip
29.2MB [00:00, 37.0MB/s]


'news_dataset.zip'

In [2]:
# Extract the downloaded archive into the current working directory.
!unzip news_dataset.zip

Archive:  news_dataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

!sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv

# データの読込
df = pd.read_csv('./newsCorpora_re.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# データの抽出
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['CATEGORY'])
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

# 事例数の確認
print('【学習データ】')
print(train['CATEGORY'].value_counts())
print('【検証データ】')
print(valid['CATEGORY'].value_counts())
print('【評価データ】')
print(test['CATEGORY'].value_counts())

【学習データ】
b    4501
e    4235
t    1220
m     728
Name: CATEGORY, dtype: int64
【検証データ】
b    563
e    529
t    153
m     91
Name: CATEGORY, dtype: int64
【評価データ】
b    563
e    530
t    152
m     91
Name: CATEGORY, dtype: int64


## 本編

In [6]:
#transformersのインストール
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m67.3 MB/s[0m eta [36m0:00:0

In [9]:
# Tokenizer setup: load the tokenizer published with the 'bert-base-uncased' checkpoint.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
#データセットクラスの作成
import torch
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned element-by-element with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first dimension is
            a batch dimension of size 1.
        max_length: Number of tokens every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise both inputs as plain lists so that __getitem__ indexes by
        # POSITION. A pandas Series is indexed by label, so a Series whose index
        # is not exactly 0..n-1 would raise KeyError or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length  # maximum number of tokens per example

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return model-ready tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer output
            with its leading batch dimension removed) and ``'labels'`` (a 0-d
            ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Tokenize and encode the text.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,     # add the special tokens at the start and end
            max_length=self.max_length,  # target length
            padding='max_length',        # pad shorter inputs up to max_length
            truncation=True,             # cut longer inputs down to max_length
            return_tensors='pt'          # return PyTorch tensors
        )
        # Drop only the leading batch dimension. A bare squeeze() would also remove
        # the sequence dimension when max_length == 1.
        input_ids = encoding['input_ids'].squeeze(0)
        # The mask lets the model ignore the padded positions during attention.
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(int(label), dtype=torch.long)
        }



In [18]:
# Instantiate one dataset per split.
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}


def _make_dataset(split_df):
    """Wrap a split's titles and integer-encoded categories in a TextClassificationDataset."""
    # Translate category letters to integer ids; an unknown letter raises KeyError.
    label_ids = split_df['CATEGORY'].map(lambda x: category_dict[x]).values
    return TextClassificationDataset(split_df['TITLE'], label_ids, tokenizer, max_length=128)


train_dataset = _make_dataset(train)
valid_dataset = _make_dataset(valid)
test_dataset = _make_dataset(test)

In [19]:
# Create the data loaders.
from torch.utils.data import DataLoader
batch_size = 32

# Only the training split is shuffled; validation and test are read in dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (valid_dataset, test_dataset)
)

In [None]:
from transformers import BertForSequenceClassification, AdamW

In [20]:
# Fine-tune a pretrained BERT classifier on the news titles and report
# validation metrics after every epoch.

# Seed PyTorch before the model is built so that the freshly initialised
# classification head and the training shuffle order are repeatable across runs.
torch.manual_seed(123)

# Load BERT with a 4-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Use the GPU when one is available; fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer. transformers.AdamW is deprecated in favour of torch.optim.AdamW;
# eps and weight_decay are set explicitly to keep the defaults of the old
# implementation (1e-6 and 0.0) rather than PyTorch's (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        # Move every tensor of the batch to the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()  # accumulate as a Python float for the epoch average

        loss.backward()
        optimizer.step()

    # Evaluate on the validation split.
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            # The predicted class is the index of the largest logit.
            predicted_labels = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    # Average the per-batch losses; accuracy is computed over individual examples.
    train_loss /= len(train_loader)
    val_loss /= len(valid_loader)
    val_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch 1/5 - Train Loss: 0.3645 - Val Loss: 0.2085 - Val Accuracy: 0.9371
Epoch 2/5 - Train Loss: 0.1486 - Val Loss: 0.2062 - Val Accuracy: 0.9334
Epoch 3/5 - Train Loss: 0.0802 - Val Loss: 0.2124 - Val Accuracy: 0.9409
Epoch 4/5 - Train Loss: 0.0511 - Val Loss: 0.2430 - Val Accuracy: 0.9349
Epoch 5/5 - Train Loss: 0.0378 - Val Loss: 0.2390 - Val Accuracy: 0.9394


In [21]:
# Final evaluation on the held-out test split.
model.eval()
test_loss = 0.0
correct_predictions = 0
total_predictions = 0
with torch.no_grad():  # gradients are not needed when only scoring
    for batch in test_loader:
        # Move the whole batch (input_ids, attention_mask, labels) to the model's device.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        outputs = model(**inputs)
        test_loss += outputs.loss.item()

        # Predicted class = index of the largest logit in each row.
        predicted_labels = outputs.logits.argmax(dim=1)
        correct_predictions += (predicted_labels == inputs['labels']).sum().item()
        total_predictions += inputs['labels'].size(0)

# Loss is averaged over batches, accuracy over individual examples.
test_loss = test_loss / len(test_loader)
test_accuracy = correct_predictions / total_predictions

print(f"Test Loss: {test_loss:.4f} - Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.1818 - Test Accuracy: 0.9521
