In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 34.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 38.5MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 40.2MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
import os
import codecs
import pandas as pd 
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# LSTM classifier (batched version)
class LSTMClassifier(nn.Module):
    """Score classifier: embedding -> LSTM -> linear, merged with extra features.

    The final LSTM hidden state is projected to ``linear_size`` units, squashed
    with a sigmoid, concatenated with ``feature_dim`` extra per-sample features,
    and mapped to ``target_size`` log-probabilities.

    Args:
        embedding_dim: Size of each word embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table (ID 0 is padding).
        target_size: Number of output classes.
        feature_dim: Number of extra features concatenated before the output layer.
        linear_size: Output size of the intermediate linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size, feature_dim, linear_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.feature_dim = feature_dim
        # The <pad> token has word ID 0, hence padding_idx=0.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # batch_first=True: inputs are (batch, sequence length, embedding_dim).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, linear_size)
        self.dropout = nn.Dropout(p=0.2)
        self.hidden2tag = nn.Linear(linear_size + feature_dim, target_size)
        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated and ambiguous.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence, feature):
        """Run the forward pass.

        Args:
            sentence: Word-ID tensor of shape (batch, sequence length).
            feature: Extra features with ``batch * feature_dim`` elements
                (e.g. shape (batch, feature_dim)); any numeric dtype.

        Returns:
            Log-probabilities of shape (batch, target_size). The batch axis is
            kept even when the batch holds a single sample.
        """
        # Convert every word ID into its embedding vector.
        embeds = self.word_embeddings(sentence)
        # The second LSTM output is (h_n, c_n); h_n is (num_layers, batch, hidden).
        _, (h_n, _) = self.lstm(embeds)
        last_hidden = h_n[-1]
        out1 = torch.sigmoid(self.linear(last_hidden))
        # Bring the features to (batch, feature_dim) in the activations' dtype
        # instead of assuming exactly 3 features.
        feature = feature.reshape(out1.size(0), self.feature_dim).to(out1.dtype)
        merge = torch.cat([out1, feature], dim=1)
        merge = self.dropout(merge)
        tag_space = self.hidden2tag(merge)
        return self.softmax(tag_space)

In [None]:
# The TSV files live on Google Drive, so make sure the drive is mounted before
# reading; mounting an already-mounted drive only prints a notice.
from google.colab import drive
drive.mount('/content/drive')

# read .tsv
data = pd.read_table("/content/drive/MyDrive/train.tsv/train.tsv")
data1 = pd.read_table("/content/drive/MyDrive/train_rel_2.tsv/train_rel_2.tsv")

# Stack both files into one frame; ignore_index gives every row a unique label
# instead of two overlapping 0..n ranges.
data = pd.concat([data, data1], ignore_index=True)
print(data)
print(type(data))
essay_text = data["EssayText"]
essay_score = data["Score1"]
essay_set = data["EssaySet"]

# List of answers
texts = essay_text.tolist()
# List of scores
labels = essay_score.tolist()

# Word -> ID dictionary; "<pad>" (ID 0) is reserved for padding sequences to a
# common length.
word2index = {"<pad>": 0}
for sentence in texts:
    for s in sentence.split():
        # A new word gets the next free ID; known words are left untouched.
        word2index.setdefault(s, len(word2index))
print("vocab size: ", len(word2index))


          Id  ...                                          EssayText
0          1  ...  Some additional information that we would need...
1          2  ...  After reading the expirement, I realized that ...
2          3  ...  What you need is more trials, a control set up...
3          4  ...  The student should list what rock is better an...
4          5  ...  For the students to be able to make a replicat...
...      ...  ...                                                ...
17038  27584  ...  white :: white becuase if you live in a hot pl...
17039  27585  ...  light gray :: This color will affect the dogho...
17040  27586  ...  light gray :: i think light gray would work th...
17041  27587  ...  dark gray ::  if they painted the doghouse dar...
17042  27588  ...  white :: the black would affect the doghouse b...

[34250 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
vocab size:  29404


In [None]:
# Convert a sentence into a sequence of word IDs
def sentence2index(sentence):
    """Return the word2index ID of every whitespace-separated token in ``sentence``."""
    return [word2index[token] for token in sentence.split()]


# Count the periods in a sentence
def count_period(sentence):
    """Return how many elements of ``sentence`` are equal to '.'."""
    return sum(1 for ch in sentence if ch == '.')

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): an earlier cell reads TSV files from /content/drive/MyDrive, so the
# drive must already be mounted when that read runs on a fresh runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# List of question (essay set) numbers
sets = essay_set.tolist()
# Number of whitespace-separated words in each answer
word_count = [len(s.split()) for s in essay_text]
# Number of periods in each answer, used as the sentence count
sent_count = [count_period(s) for s in essay_text]

# Build the frame in one step. Appending one row at a time copies the whole
# frame on every iteration, and DataFrame.append no longer exists in pandas 2.x.
datasets = pd.DataFrame({
    "text": texts,
    "score": labels,
    "sets": sets,
    "word_count": word_count,
    "sent_count": sent_count,
})

print(datasets)

                                                    text  ... sent_count
0      Some additional information that we would need...  ...          1
1      After reading the expirement, I realized that ...  ...          1
2      What you need is more trials, a control set up...  ...          2
3      The student should list what rock is better an...  ...          1
4      For the students to be able to make a replicat...  ...          1
...                                                  ...  ...        ...
34245  white :: white becuase if you live in a hot pl...  ...          1
34246  light gray :: This color will affect the dogho...  ...          3
34247  light gray :: i think light gray would work th...  ...          1
34248  dark gray ::  if they painted the doghouse dar...  ...          1
34249  white :: the black would affect the doghouse b...  ...          3

[34250 rows x 5 columns]


In [None]:
# Load the pretrained BERT tokenizer; the following cells use it to split text
# into tokens and convert them to IDs.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Sanity-check the tokenizer on the first answer: raw text, tokens, and token IDs.
sample_text = datasets["text"][0]
sample_tokens = tokenizer.tokenize(sample_text)
print(' Original: ', sample_text)
print('tokenized: ', sample_tokens)
print('tokenIDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  Some additional information that we would need to replicate the experiment is how much vinegar should be placed in each identical container, how or what tool to use to measure the mass of the four different samples and how much distilled water to use to rinse the four samples after taking them out of the vinegar.
tokenized:  ['some', 'additional', 'information', 'that', 'we', 'would', 'need', 'to', 'replicate', 'the', 'experiment', 'is', 'how', 'much', 'vinegar', 'should', 'be', 'placed', 'in', 'each', 'identical', 'container', ',', 'how', 'or', 'what', 'tool', 'to', 'use', 'to', 'measure', 'the', 'mass', 'of', 'the', 'four', 'different', 'samples', 'and', 'how', 'much', 'di', '##sti', '##lled', 'water', 'to', 'use', 'to', 'ri', '##nse', 'the', 'four', 'samples', 'after', 'taking', 'them', 'out', 'of', 'the', 'vinegar', '.']
tokenIDs:  [2070, 3176, 2592, 2008, 2057, 2052, 2342, 2000, 28024, 1996, 7551, 2003, 2129, 2172, 29387, 2323, 2022, 2872, 1999, 2169, 7235, 11661, 1010

In [None]:
# Empty lists; nothing in this cell fills them.
index_dataset_text_tmp = []
index_dataset_score = []
index_dataset_features = []

# Token count of every answer after splitting with the BERT tokenizer.
max_len = [len(tokenizer.tokenize(sent)) for sent in datasets["text"]]

# The longest answer plus 2 for the special tokens ([CLS], [SEP]) is the
# maximum sequence length.
print('最大単語数: ', max(max_len)+2)



最大単語数:  385


In [None]:
# Maximum sequence length in tokens, including [CLS] and [SEP]; 385 is the value
# printed by the token-length check earlier in the notebook.
MAX_SEQ_LEN = 385

input_ids = []
attention_masks = []

for sent in datasets["text"]:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN,
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        padding='max_length',
        # Request truncation explicitly instead of relying on the implicit
        # fallback that triggers a warning.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Token IDs, shape (1, MAX_SEQ_LEN)
    input_ids.append(encoded_dict['input_ids'])

    # Attention mask, same shape as the token IDs
    attention_masks.append(encoded_dict['attention_mask'])

# Stack the per-sentence rows into (number of sentences, MAX_SEQ_LEN) tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Convert the scores to an int64 tensor; the explicit dtype also covers a
# column that is stored with object dtype.
labels = torch.tensor(datasets["score"].to_numpy(dtype="int64"))
print(labels)

# One [sets, word_count, sent_count] row of plain ints per answer.
features = datasets[["sets", "word_count", "sent_count"]].astype("int64").values.tolist()

# Checks
print('original: ', datasets["text"][0])
print('token IDs :', input_ids[0])
# Print the number of feature rows rather than dumping the whole list.
print('features_size: ', len(features))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([1, 1, 1,  ..., 1, 1, 0])
original:  Some additional information that we would need to replicate the experiment is how much vinegar should be placed in each identical container, how or what tool to use to measure the mass of the four different samples and how much distilled water to use to rinse the four samples after taking them out of the vinegar.
token IDs : tensor([  101,  2070,  3176,  2592,  2008,  2057,  2052,  2342,  2000, 28024,
         1996,  7551,  2003,  2129,  2172, 29387,  2323,  2022,  2872,  1999,
         2169,  7235, 11661,  1010,  2129,  2030,  2054,  6994,  2000,  2224,
         2000,  5468,  1996,  3742,  1997,  1996,  2176,  2367,  8168,  1998,
         2129,  2172,  4487, 16643, 11001,  2300,  2000,  2224,  2000, 15544,
        12325,  1996,  2176,  8168,  2044,  2635,  2068,  2041,  1997,  1996,
        29387,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
     

In [None]:
# Show the shape of the concatenated token-ID tensor.
print(input_ids.shape)

torch.Size([34250, 385])


In [None]:
# Group the data into mini-batches
def train2batch(text, score, features, batch_size=8):
    """Shuffle the three inputs in unison and cut them into mini-batches.

    Args:
        text: Indexable collection of encoded texts.
        score: Scores aligned with ``text``.
        features: Extra features aligned with ``text``.
        batch_size: Maximum number of samples per batch.

    Returns:
        Three lists (text, score and feature batches) in which position ``k``
        of each list holds the same samples. The last batch may be shorter.
    """
    shuffled_text, shuffled_score, shuffled_features = shuffle(text, score, features)
    starts = range(0, len(text), batch_size)
    text_batch = [shuffled_text[start:start + batch_size] for start in starts]
    score_batch = [shuffled_score[start:start + batch_size] for start in starts]
    features_batch = [shuffled_features[start:start + batch_size] for start in starts]
    return text_batch, score_batch, features_batch


def score2tensor(score):
    """Wrap ``score`` in a one-element list and return it as a ``torch.long`` tensor."""
    wrapped = [score]
    return torch.tensor(wrapped, dtype=torch.long)

In [None]:
# Load a standalone pretrained BERT encoder and move it to the selected device.
from transformers import BertModel
bert_model = BertModel.from_pretrained("bert-base-uncased")

# Last expression of the cell, so the notebook also displays the returned module.
bert_model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Smoke-test the encoder on the first encoded answer (batch of one).
# The tensor is not called `input`, which would shadow the Python builtin.
sample_ids = input_ids[0].unsqueeze(0).to(device)
# This is only a shape check, so no gradients need to be tracked.
with torch.no_grad():
    outputs = bert_model(sample_ids)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape )

torch.Size([1, 385, 768])


In [None]:
# BERT-based classifier (batched version)
class BERT_LSTMClassifier(nn.Module):
    """Score classifier built on the first-token BERT output plus extra features.

    The hidden state of the first token of BERT's last layer is projected to
    ``linear_size`` units, squashed with a sigmoid, concatenated with
    ``feature_dim`` extra per-sample features, and mapped to ``target_size``
    log-probabilities.

    Args:
        embedding_dim: Input size of the (currently unused) LSTM layer.
        hidden_dim: Hidden size of the (currently unused) LSTM layer.
        vocab_size: Accepted for signature compatibility; not used.
        target_size: Number of output classes.
        feature_dim: Number of extra features concatenated before the output layer.
        linear_size: Output size of the intermediate linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, target_size, feature_dim, linear_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.feature_dim = feature_dim
        # BERT replaces the embedding layer.
        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        # Not used by forward(); kept so the set of parameters (and therefore
        # state_dict keys and initialisation order) stays as before.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Take the input width from the loaded BERT configuration instead of
        # hard-coding 768.
        self.linear = nn.Linear(self.bert_model.config.hidden_size, linear_size)
        self.dropout = nn.Dropout(p=0.2)
        self.hidden2tag = nn.Linear(linear_size + feature_dim, target_size)
        # Normalise over the class axis explicitly; relying on the implicit
        # dimension is deprecated and ambiguous.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sentence, feature, attention_mask=None):
        """Run the forward pass.

        Args:
            sentence: Token-ID tensor of shape (batch, sequence length).
            feature: Extra features with ``batch * feature_dim`` elements
                (e.g. shape (batch, feature_dim)); any numeric dtype.
            attention_mask: Optional mask forwarded to BERT so that padding
                positions can be excluded from attention. When omitted, BERT
                is called exactly as before, without a mask.

        Returns:
            Log-probabilities of shape (batch, target_size). The batch axis is
            kept even when the batch holds a single sample.
        """
        embeds = self.bert_model(sentence, attention_mask=attention_mask)
        # Hidden state of the first token: (batch, BERT hidden size).
        bert_out = embeds.last_hidden_state[:, 0, :]

        out1 = torch.sigmoid(self.linear(bert_out))
        # Bring the features to (batch, feature_dim) in the activations' dtype
        # before concatenating.
        feature = feature.reshape(out1.size(0), self.feature_dim).to(out1.dtype)
        merge = torch.cat([out1, feature], dim=1)
        merge = self.dropout(merge)
        tag_space = self.hidden2tag(merge)
        return self.softmax(tag_space)

In [None]:
# Vocabulary size (number of entries in word2index)
VOCAB_SIZE = len(word2index)
# Dimensionality of the word vectors
EMBEDDING_DIM = 768
# Dimensionality of the hidden layer
HIDDEN_DIM = 128
# Number of score classes
SCORE_SIZE = 4
# Number of extra features fed to the model next to the text
FEATURE_DIM = 3
# Output size of the intermediate linear layer
LINEARE_SIZE = 150

# Build the model and move it to the selected device with .to(device)
# model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, SCORE_SIZE, FEATURE_DIM, LINEARE_SIZE).to(device)
model =  BERT_LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, SCORE_SIZE, FEATURE_DIM, LINEARE_SIZE).to(device)
# # Split the data 70:30
# traindata, testdata = train_test_split(datasets, train_size=0.7)

# # Loss function
# loss_function = nn.NLLLoss()
# # Optimizer
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# losses = []

In [None]:
# Fine-tuning setup:
# gradients are computed only for the last BertLayer and the added
# classification layers.

# Freeze every parameter first.
for param in model.parameters():
    param.requires_grad = False

# Then re-enable updates for the last BERT encoder layer and the two linear
# layers of the classification head.
trainable_modules = (
    model.bert_model.encoder.layer[-1],
    model.linear,
    model.hidden2tag,
)
for module in trainable_modules:
    for param in module.parameters():
        param.requires_grad = True

# Smaller learning rate for the pretrained layer, larger one for the newly
# added fully connected layers.
optimizer = optim.Adam([
    {'params': model.bert_model.encoder.layer[-1].parameters(), 'lr': 5e-5},
    {'params': model.linear.parameters(), 'lr': 1e-4},
    {'params': model.hidden2tag.parameters(), 'lr': 1e-4},
])

# Loss function; NLLLoss takes log-probabilities as input.
loss_function = nn.NLLLoss()

In [None]:
# from torch.utils.data import TensorDataset, random_split
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# dataset = TensorDataset(input_ids, features, labels)

In [None]:
 # Split token IDs, labels and extra features 70:30 in one call so the three
 # stay aligned. random_state fixes the shuffle so that a re-run produces the
 # same train/test partition.
 train_text, test_text, train_score, test_score, train_features, test_features = train_test_split(input_ids, labels, features, train_size=0.7, random_state=42)

In [None]:
# Train for up to 8 epochs; every epoch reshuffles the training data into
# mini-batches and reports the summed batch loss and the training accuracy.
for epoch in range(8):
    all_loss = 0
    text_batch, score_batch, features_batch = train2batch(train_text, train_score, train_features)

    acu = 0
    for batch_text, batch_score, batch_features in zip(text_batch, score_batch, features_batch):
        model.zero_grad()
        # Tensors used in the forward pass must live on the same device as the model.
        text_tensor = batch_text.to(device)
        # Flatten to (batch,) without dropping the batch axis: unlike squeeze(),
        # this keeps a single-sample batch as a 1-D tensor.
        score_tensor = batch_score.to(device).reshape(-1)
        features_tensor = torch.tensor(batch_features, device=device)

        out = model(text_tensor, features_tensor)

        batch_loss = loss_function(out, score_tensor)
        batch_loss.backward()
        optimizer.step()
        all_loss += batch_loss.item()

        # Count the correct predictions of the batch in one vectorised step.
        _, predict = torch.max(out, 1)
        acu += (predict == score_tensor).sum().item()
    print("epoch", epoch, "\t", "loss", all_loss, "\t","accuracy",acu/len(train_text))
    # Stop early once the summed loss of an epoch is negligible.
    if all_loss < 0.1: break
print("done.")



epoch 0 	 loss 1560.7358588054776 	 accuracy 0.7906152241918665
epoch 1 	 loss 1108.3287073951215 	 accuracy 0.8586861313868613
epoch 2 	 loss 731.421497638803 	 accuracy 0.9137017726798748
epoch 3 	 loss 490.55454625049606 	 accuracy 0.941689259645464
epoch 4 	 loss 389.16306690324564 	 accuracy 0.9534932221063608
epoch 5 	 loss 315.5520096689579 	 accuracy 0.9617518248175182
epoch 6 	 loss 282.0930710042012 	 accuracy 0.9641710114702815
epoch 7 	 loss 261.60721901105717 	 accuracy 0.965881126173097
done.


In [None]:
# Measure the accuracy on the held-out test split.
test_num = len(test_text)
a = 0

# The model contains a Dropout layer, so it has to be evaluated in eval mode;
# otherwise activations are randomly zeroed during testing. Remember each
# module's mode so it can be restored exactly afterwards.
module_modes = [(module, module.training) for module in model.modules()]
model.eval()
try:
    with torch.no_grad():
        text_batch, score_batch, features_batch = train2batch(test_text, test_score, test_features)
        for batch_text, batch_score, batch_features in zip(text_batch, score_batch, features_batch):
            text_tensor = batch_text.to(device)
            # Flatten to (batch,) so the comparison below is element-wise.
            score_tensor = batch_score.to(device).reshape(-1)
            features_tensor = torch.tensor(batch_features, device=device)

            out = model(text_tensor, features_tensor)
            _, predicts = torch.max(out, 1)
            # Count the correct predictions of the batch in one vectorised step.
            a += (predicts == score_tensor).sum().item()
finally:
    # Put every module back into the mode it was in before evaluation, so that
    # re-running the training cell behaves as it did before.
    for module, was_training in module_modes:
        module.training = was_training
print("predict:", a/test_num)



predict: 0.8654014598540146
