In [85]:
datapath = "../data/"
tmppath = "../tmp/02/"
outpath = "./output/"
settingpath = "./setting/"

import warnings
warnings.simplefilter('ignore')

In [86]:
import torch
import torchtext

print(torch.__version__)  # 1.3.1
print(torchtext.__version__)  # 0.5.0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

1.13.1+cu117
0.14.1
cuda:0


# 前処理

## データ読み込み

In [87]:
import numpy as np
import pandas as pd

def read_text_file(path):
    with open(path, mode="r") as f:
        result = f.read().splitlines()
    return result

def read_label_file(path):
    result = np.loadtxt(path, dtype='int64')
    return result

def create_df(textpath, lablpath):
    result = pd.DataFrame({'text': read_text_file(textpath),
                           'label': read_label_file(lablpath)})
    return result
    
train_df_origin = create_df(datapath + "text.train.txt", datapath + "label.train.txt")
dev_df_origin = create_df(datapath + "text.dev.txt", datapath + "label.dev.txt")

In [88]:
print(train_df_origin.head())

                                                text  label
0                     ぼけっとしてたらこんな時間。チャリあるから食べにでたいのに…      0
1  今日の月も白くて明るい。昨日より雲が少なくてキレイな〜 と立ち止まる帰り道。チャリなし生活も...      1
2                 早寝するつもりが飲み物がなくなりコンビニへ。ん、今日、風が涼しいな。      0
3                                           眠い、眠れない。      0
4    ただいま〜 って新体操してるやん!外食する気満々で家に何もないのに!テレビから離れられない…!      0


## テキストクリーニング

In [89]:
import collections
from sudachipy import tokenizer
from sudachipy import dictionary

tokenizer_obj = dictionary.Dictionary(dict="full").create()
mode = tokenizer.Tokenizer.SplitMode.C

clear_part_of_speech_list = [["助詞", "助動詞"],["数詞"]]

with open(tmppath + "stopwords.txt") as f:
    stopword_list = f.read().splitlines()

# 出現頻度が少ない単語をstopwordとする
def stopwords_occur(textlist, threshold):
    morphemelist = [tokenizer_obj.tokenize(text, mode) for text in textlist]
    words = []
    for morpheme in morphemelist:
        for word in morpheme:
            words.append(word.normalized_form())
    dic = collections.Counter(words)
    dic = {key:value for key, value in dic.items() if value<= threshold}
    return list(dic.keys())

stopwords_occur = stopwords_occur(train_df_origin["text"], 2)

stopword_list.extend(stopwords_occur)

print(stopword_list[:100])

['。', '、', '.', '為る', '成る', '居る', 'とこ', ':', '/', '_', '-', '〜', '(', ')', '私', '御', '」', '「', '人', '物', 'ー', '言う', 'こと', '見る', '行く', '・', 'さん', 'ちゃん', 'そう', 'よう', ';', '`', '分', '今', '今日', '日', '有る', '又', '来る', '思う', '此の', '時', 'あそこ', 'あたり', 'あちら', 'あっち', 'あと', 'あな', 'あなた', 'あれ', 'いくつ', 'いつ', 'いま', 'いや', 'いろいろ', 'うち', 'おおまか', 'おまえ', 'おれ', 'がい', 'かく', 'かたち', 'かやの', 'から', 'がら', 'きた', 'くせ', 'ここ', 'こっち', 'こと', 'ごと', 'こちら', 'ごっちゃ', 'これ', 'これら', 'ごろ', 'さまざま', 'さらい', 'さん', 'しかた', 'しよう', 'すか', 'ずつ', 'すね', 'すべて', 'ぜんぶ', 'そう', 'そこ', 'そちら', 'そっち', 'そで', 'それ', 'それぞれ', 'それなり', 'たくさん', 'たち', 'たび', 'ため', 'だめ', 'ちゃ']


In [90]:
def text_cleaning(text, mode, clear_part_of_speech_list, stopword_list):
    words = []
    for word in tokenizer_obj.tokenize(text, mode):
        if word.part_of_speech()[0] not in clear_part_of_speech_list[0] and word.part_of_speech()[1] not in clear_part_of_speech_list[1] and word.normalized_form() not in stopword_list:
            words.append(word.normalized_form())
    return " ".join(words)

def df_cleaning(df):
    result_df = df.copy()
    result_df['text'] = df['text'].map(lambda x: text_cleaning(x, mode, clear_part_of_speech_list, stopword_list))
    return result_df

train_df = df_cleaning(train_df_origin)
dev_df = df_cleaning(dev_df_origin)

In [91]:
print(train_df.head())

                                             text  label
0                             ぼけっと こんな ちゃり 食べる 出る      0
1  白い 明るい 昨日 雲 少ない 奇麗   立ち止まる 帰り道 ちゃり なし 生活 悪い 無い      1
2                   早寝 積もり 飲み物 なくなる コンビニ んっ 風 涼しい      0
3                                           眠い 眠る      0
4                     只今   ! 外食 満々 無い ! テレビ 離れる !      0


## 辞書作成

In [92]:
from collections import Counter
from torchtext.vocab import vocab

counter = Counter()
for text in train_df["text"]:
    counter.update(text.split())

voc = vocab(counter, specials=(['<unk>', '<pad>']))
voc.set_default_index(voc['<unk>'])

print(voc.get_itos()[:30]) 

['<unk>', '<pad>', 'ぼけっと', 'こんな', 'ちゃり', '食べる', '出る', '白い', '明るい', '昨日', '雲', '少ない', '奇麗', '立ち止まる', '帰り道', 'なし', '生活', '悪い', '無い', '早寝', '積もり', '飲み物', 'なくなる', 'コンビニ', 'んっ', '風', '涼しい', '眠い', '眠る', '只今']


## Dataloader

In [93]:
import torchtext.transforms as T

text_transform = T.Sequential(
    T.VocabTransform(voc),
    T.ToTensor(padding_value=voc['<pad>'])
)

def collate_batch(batch):
    texts = text_transform([text.split() for (text, label) in batch])
    texts = torch.t(texts)
    labels = torch.tensor([label+2 for (text, label) in batch], dtype=torch.long)
    return texts, labels

In [94]:
from torch.utils.data import DataLoader

batch_size = 128

train_loader = DataLoader(train_df.values, batch_size=batch_size, shuffle=True, collate_fn=collate_batch, num_workers=4)
dev_loader = DataLoader(dev_df.values, batch_size=batch_size, shuffle=False, collate_fn=collate_batch, num_workers=4)

In [95]:
batch = next(iter(train_loader))
x, t = batch
print(t.shape)
print(x.shape)

torch.Size([128])
torch.Size([32, 128])


# モデル定義

In [99]:
import torch.nn as nn
import pytorch_lightning as pl
import math
import torch.nn.functional as F

class SAN(pl.LightningModule):

    # 埋め込み層, 隠れ層, 全結合層の定義 
    def __init__(self, n_tokens, n_embed, n_heads, n_layers, n_output, dropout):
        super(SAN, self).__init__()
        self.n_embed = n_embed
        self.embed = nn.Embedding(num_embeddings=n_tokens, embedding_dim=n_embed, padding_idx=voc['<pad>'])
        self.pos_encoder = PositionalEncoding(num_embeddings=n_tokens, embedding_dim=n_embed, dropout=dropout)
        enc_layer = nn.TransformerEncoderLayer(d_model=n_embed, nhead=n_heads, dim_feedforward=n_embed*4, dropout=dropout)
        self.san = nn.TransformerEncoder(encoder_layer=enc_layer, num_layers=n_layers)
        self.fc = nn.Linear(in_features=n_embed, out_features=n_output)
    
    # 順伝播
    def forward(self, x):
        e = self.pos_encoder(self.embed(x) * math.sqrt(self.n_embed))
        o = self.san(e)
        return self.fc(o.mean(dim=0))

    # 訓練用データのバッチを受け取って損失を計算
    def training_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("train_loss", loss)
        return loss
    
    # 検証用データのバッチを受け取って損失を計算
    def validation_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("val_loss", loss)

    # 評価用データのバッチを受け取って分類の正解率を計算
    def test_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        y = torch.argmax(y, dim=1)
        accuracy = torch.sum(t == y).item() / (len(y) * 1.0)
        self.log("test_acc", accuracy)

    # 損失関数を設定
    def lossfun(self, y, t):
        return F.cross_entropy(y, t)

    # 最適化手法を設定
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.01)


class PositionalEncoding(nn.Module):

    def __init__(self, num_embeddings, embedding_dim, dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe = torch.zeros(num_embeddings, 1, embedding_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

## 学習

In [100]:
n_tokens = len(voc)
print(n_tokens)
n_embed = 256
n_heads = 4
n_layers = 4
n_output = 5
dropout = 0.2

model = SAN(n_tokens, n_embed, n_heads, n_layers, n_output, dropout)

print(model)

10670
SAN(
  (embed): Embedding(10670, 256, padding_idx=1)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (san): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bi

In [101]:
# 訓練中にモデルを保存するための設定
checkpoint = pl.callbacks.ModelCheckpoint(
    # 検証用データにおける損失が最も小さいモデルを保存する
    monitor="val_loss", mode="min", save_top_k=1,
    # モデルファイル（重みのみ）を "model" というディレクトリに保存する
    save_weights_only=True, dirpath="model/"
)

torch.set_float32_matmul_precision('high')

# 訓練
trainer = pl.Trainer(accelerator='gpu', devices=1, max_epochs=40, callbacks=[checkpoint])
trainer.fit(model, train_loader, dev_loader)

# ベストモデルの確認
print("ベストモデル: ", checkpoint.best_model_path)
print("ベストモデルの検証用データにおける損失: ", checkpoint.best_model_score)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type               | Params
---------------------------------------------------
0 | embed       | Embedding          | 2.7 M 
1 | pos_encoder | PositionalEncoding | 0     
2 | san         | TransformerEncoder | 3.2 M 
3 | fc          | Linear             | 1.3 K 
---------------------------------------------------
5.9 M     Trainable params
0         Non-trainable params
5.9 M     Total params
23.567    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

ベストモデル:  /workspace/DockerML_sandbox/lab_competition/02/model/epoch=4-step=1175.ckpt
ベストモデルの検証用データにおける損失:  tensor(1.5349, device='cuda:0')
