In [1]:
datapath = "../data/"
tmppath = "../tmp/02/"
outpath = "./output/"
settingpath = "./setting/"

In [2]:
import torch
import torchtext

print(torch.__version__)  # 1.3.1
print(torchtext.__version__)  # 0.5.0

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

1.13.1+cu117
0.14.1
cuda:0


# 前処理

## データ読み込み

In [3]:
import numpy as np
import pandas as pd

def read_text_file(path):
    with open(path, mode="r") as f:
        result = f.read().splitlines()
    return result

def read_label_file(path):
    result = np.loadtxt(path, dtype='int64')
    return result

def create_df(lablpath, textpath):
    result = pd.DataFrame({'label': read_label_file(lablpath),
                           'text': read_text_file(textpath)})
    return result
    
train_df_origin = create_df(datapath + "label.train.txt", datapath + "text.train.txt")
dev_df_origin = create_df(datapath + "label.dev.txt", datapath + "text.dev.txt")

In [4]:
print(train_df_origin.head())

   label                                               text
0      0                     ぼけっとしてたらこんな時間。チャリあるから食べにでたいのに…
1      1  今日の月も白くて明るい。昨日より雲が少なくてキレイな〜 と立ち止まる帰り道。チャリなし生活も...
2      0                 早寝するつもりが飲み物がなくなりコンビニへ。ん、今日、風が涼しいな。
3      0                                           眠い、眠れない。
4      0    ただいま〜 って新体操してるやん!外食する気満々で家に何もないのに!テレビから離れられない…!


## テキストクリーニング

In [5]:
import collections
from sudachipy import tokenizer
from sudachipy import dictionary

tokenizer_obj = dictionary.Dictionary(dict="full").create()
mode = tokenizer.Tokenizer.SplitMode.C

clear_part_of_speech_list = [["助詞", "助動詞"],["数詞"]]

with open(tmppath + "stopwords.txt") as f:
    stopword_list = f.read().splitlines()

# 出現頻度が少ない単語をstopwordとする
def stopwords_occur(textlist, threshold):
    morphemelist = [tokenizer_obj.tokenize(text, mode) for text in textlist]
    words = []
    for morpheme in morphemelist:
        for word in morpheme:
            words.append(word.normalized_form())
    dic = collections.Counter(words)
    dic = {key:value for key, value in dic.items() if value<= threshold}
    return list(dic.keys())

stopwords_occur = stopwords_occur(train_df_origin["text"], 2)

stopword_list.extend(stopwords_occur)

print(stopword_list[:100])

['。', '、', '.', '為る', '成る', '居る', 'とこ', ':', '/', '_', '-', '〜', '(', ')', '私', '御', '」', '「', '人', '物', 'ー', '言う', 'こと', '見る', '行く', '・', 'さん', 'ちゃん', 'そう', 'よう', ';', '`', '分', '今', '今日', '日', '有る', '又', '来る', '思う', '此の', '時', 'あそこ', 'あたり', 'あちら', 'あっち', 'あと', 'あな', 'あなた', 'あれ', 'いくつ', 'いつ', 'いま', 'いや', 'いろいろ', 'うち', 'おおまか', 'おまえ', 'おれ', 'がい', 'かく', 'かたち', 'かやの', 'から', 'がら', 'きた', 'くせ', 'ここ', 'こっち', 'こと', 'ごと', 'こちら', 'ごっちゃ', 'これ', 'これら', 'ごろ', 'さまざま', 'さらい', 'さん', 'しかた', 'しよう', 'すか', 'ずつ', 'すね', 'すべて', 'ぜんぶ', 'そう', 'そこ', 'そちら', 'そっち', 'そで', 'それ', 'それぞれ', 'それなり', 'たくさん', 'たち', 'たび', 'ため', 'だめ', 'ちゃ']


In [6]:
def text_cleaning(text, mode, clear_part_of_speech_list, stopword_list):
    words = []
    for word in tokenizer_obj.tokenize(text, mode):
        if word.part_of_speech()[0] not in clear_part_of_speech_list[0] and word.part_of_speech()[1] not in clear_part_of_speech_list[1] and word.normalized_form() not in stopword_list:
            words.append(word.normalized_form())
    return " ".join(words)

def df_cleaning(df):
    result_df = df.copy()
    result_df['text'] = df['text'].map(lambda x: text_cleaning(x, mode, clear_part_of_speech_list, stopword_list))
    return result_df

train_df = df_cleaning(train_df_origin)
dev_df = df_cleaning(dev_df_origin)

In [7]:
print(train_df.head())

   label                                            text
0      0                             ぼけっと こんな ちゃり 食べる 出る
1      1  白い 明るい 昨日 雲 少ない 奇麗   立ち止まる 帰り道 ちゃり なし 生活 悪い 無い
2      0                   早寝 積もり 飲み物 なくなる コンビニ んっ 風 涼しい
3      0                                           眠い 眠る
4      0                     只今   ! 外食 満々 無い ! テレビ 離れる !


## 辞書作成

In [8]:
from collections import Counter
from torchtext.vocab import vocab

counter = Counter()
for text in train_df["text"]:
    counter.update(text.split())

voc = vocab(counter, specials=(['<unk>', '<pad>']))
voc.set_default_index(voc['<unk>'])

print(voc.get_itos()[:30])        

['<unk>', '<pad>', 'ぼけっと', 'こんな', 'ちゃり', '食べる', '出る', '白い', '明るい', '昨日', '雲', '少ない', '奇麗', '立ち止まる', '帰り道', 'なし', '生活', '悪い', '無い', '早寝', '積もり', '飲み物', 'なくなる', 'コンビニ', 'んっ', '風', '涼しい', '眠い', '眠る', '只今']


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

vectorizer = TfidfVectorizer(ngram_range=(1,1))
train_tfidf = vectorizer.fit_transform(train_df["text"])
dev_tfidf = vectorizer.transform(dev_df["text"])

train_tfidf_vec = train_tfidf.toarray()
print(train_tfidf_vec.shape)

dev_tfidf_vec = dev_tfidf.toarray()
print(dev_tfidf_vec.shape)

(30000, 9738)
(2500, 9738)


## Dataloader

In [47]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, df, vecs, transforms) -> None:
        super().__init__()
        self.transforms = transforms
        self.df = df
        self.vecs = vecs
   
    def __getitem__(self, index: int):
        vec = self.vecs[index]
        vec = torch.tensor(vec, dtype=torch.float, device=device)
        label = self.df["label"][index] + 2
        label = torch.tensor(label, dtype=torch.long, device=device)
        return label, vec
    
    def __len__(self) -> int:
        return len(self.df)

In [48]:
from torch.utils.data import DataLoader

BATCH_SIZE = 30


train_dataset = MyDataset(train_df, train_tfidf_vec, my_transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

dev_dataset = MyDataset(dev_df, dev_tfidf_vec, my_transform)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [49]:
for i, (vecs, labels) in enumerate(train_loader):
    if i >= 1:
        break
    print(i)
    for vec, label in zip(vecs, labels):
        print(vec, label)

0
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(2, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(3, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(4, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(4, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(2, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(3, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(3, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(4, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(2, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(2, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(0, device='cuda:0')
tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0') tensor(3, device='cuda:0')
te

# モデル定義

In [50]:
import torch.nn as nn
import pytorch_lightning as pl
import math
import torch.nn.functional as F

class SAN(pl.LightningModule):

    # 埋め込み層, 隠れ層, 全結合層の定義 
    def __init__(self, n_tokens, n_embed, n_heads, n_layers, n_output, dropout):
        super(SAN, self).__init__()
        self.n_embed = n_embed
        self.embed = nn.Embedding(num_embeddings=n_tokens, embedding_dim=n_embed, padding_idx=1)
        self.pos_encoder = PositionalEncoding(num_embeddings=n_tokens, embedding_dim=n_embed, dropout=dropout)
        enc_layer = nn.TransformerEncoderLayer(d_model=n_embed, nhead=n_heads, dim_feedforward=n_embed*4, dropout=dropout)
        self.san = nn.TransformerEncoder(encoder_layer=enc_layer, num_layers=n_layers)
        self.fc = nn.Linear(in_features=n_embed, out_features=n_output)
    
    # 順伝播
    def forward(self, x):
        e = self.pos_encoder(self.embed(x) * math.sqrt(self.n_embed))
        o = self.san(e)
        return self.fc(o.mean(dim=0))

    # 訓練用データのバッチを受け取って損失を計算
    def training_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("train_loss", loss)
        return loss
    
    # 検証用データのバッチを受け取って損失を計算
    def validation_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("val_loss", loss)

    # 評価用データのバッチを受け取って分類の正解率を計算
    def test_step(self, batch, batch_idx):
        x, t = batch
        y = self(x)
        y = torch.argmax(y, dim=1)
        accuracy = torch.sum(t == y).item() / (len(y) * 1.0)
        self.log("test_acc", accuracy)

    # 損失関数を設定
    def lossfun(self, y, t):
        return nn.MSELoss(y, t)

    # 最適化手法を設定
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.001)


class PositionalEncoding(nn.Module):

    def __init__(self, num_embeddings, embedding_dim, dropout):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe = torch.zeros(num_embeddings, 1, embedding_dim)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

## 学習

In [51]:
# モデルの保存用ディレクトリがすでにあり、新たに訓練する場合は、そのディレクトリを消す

n_tokens = train_tfidf_vec.shape[1]
print(n_tokens)
n_embed = 256
n_heads = 4
n_layers = 4
n_output = 1
dropout = 0.2

model = SAN(n_tokens, n_embed, n_heads, n_layers, n_output, dropout)

# 訓練中にモデルを保存するための設定
checkpoint = pl.callbacks.ModelCheckpoint(
    # 検証用データにおける損失が最も小さいモデルを保存する
    monitor="val_loss", mode="min", save_top_k=1,
    # モデルファイル（重みのみ）を "model" というディレクトリに保存する
    save_weights_only=True, dirpath="model/"
)

torch.set_float32_matmul_precision('high')

# 訓練
trainer = pl.Trainer(gpus=1, max_epochs=20, callbacks=[checkpoint])
trainer.fit(model, train_loader, dev_loader)

# ベストモデルの確認
print("ベストモデル: ", checkpoint.best_model_path)
print("ベストモデルの検証用データにおける損失: ", checkpoint.best_model_score)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type               | Params
---------------------------------------------------
0 | embed       | Embedding          | 2.5 M 
1 | pos_encoder | PositionalEncoding | 0     
2 | san         | TransformerEncoder | 3.2 M 
3 | fc          | Linear             | 257   
---------------------------------------------------
5.7 M     Trainable params
0         Non-trainable params
5.7 M     Total params
22.609    Total estimated model params size (MB)


9738


Sanity Checking: 0it [00:00, ?it/s]

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)