In [10]:
import os
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
from sklearn.model_selection import train_test_split
import torch
import random
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers import BertModel
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Hugging Face checkpoint name used for the tokenizer below.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# NOTE(review): loading this checkpoint through BertTokenizer emits a
# class-mismatch warning (the checkpoint declares 'BertJapaneseTokenizer').
# Left unchanged because the fine-tuned classifier may have been trained with
# exactly this tokenization -- confirm before switching tokenizer classes.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Batch-size setting; the inference loops in this notebook feed one sentence
# at a time and do not read this value.
batch = 1

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [2]:
# Input CSVs: one covering the whole month and one covering a single day.
# Both are expected to provide a `text` column (read by the inference cells).
month_csv_path = '/home/is/shuntaro-o/dev/persons_move_analysis/data/08month.csv'
day_csv_path = '/home/is/shuntaro-o/dev/persons_move_analysis/data/0801.csv'

df_one_month = pd.read_csv(month_csv_path)
df_one_day = pd.read_csv(day_csv_path)

In [11]:
# Checkpoint file later passed to BertForSequenceClassifier_pl.load_from_checkpoint.
code_estimate_model_path = '/home/is/shuntaro-o/SharedTask_main/地理コード/Tokyo/model/epoch=2-step=2400000.ckpt'

# Tokenizer used to encode every sentence before it is fed to BERT.
# NOTE(review): this call emits a tokenizer-class mismatch warning (the
# checkpoint declares 'BertJapaneseTokenizer'); confirm which tokenizer the
# classifier checkpoint was trained with before changing it.
tokenizer = BertTokenizer.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking'
)

class BertForSequenceClassifier_pl(pl.LightningModule):
    """BERT encoder followed by a single linear classification layer.

    All BERT parameters are frozen except those of the last encoder layer;
    the linear head on top of the pooled output is trained as usual.
    """

    def __init__(self, model_name, lr, num_class):
        """Build the encoder, the classification head and the loss.

        Args:
            model_name: Name of the Transformers model given to
                ``BertModel.from_pretrained``.
            lr: Learning rate handed to Adam in ``configure_optimizers``.
            num_class: Number of outputs of the linear classification layer.
        """
        super().__init__()
        # Keep the constructor arguments: they become reachable as
        # ``self.hparams.<name>`` (e.g. ``self.hparams.lr``) and are stored
        # automatically whenever a checkpoint is written.
        self.save_hyperparameters()

        # Load BERT and put a linear classifier on top of its pooled output.
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_class)
        self.criterion = nn.CrossEntropyLoss()

        # Freeze the whole encoder, then re-enable gradients for the final
        # BertLayer only.
        self.bert.requires_grad_(False)
        self.bert.encoder.layer[-1].requires_grad_(True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, preds)`` for one batch.

        ``preds`` are the raw classifier outputs computed from BERT's pooled
        output. ``loss`` is the cross-entropy against ``labels``, or the
        integer 0 when no labels are supplied.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        loss = self.criterion(logits, labels) if labels is not None else 0
        return loss, logits

    def _run_batch(self, batch):
        """Run ``forward`` on a batch dict and package the step result."""
        loss, logits = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        return {
            'loss': loss,
            'batch_preds': logits,
            'batch_labels': batch["labels"],
        }

    def training_step(self, batch, batch_idx):
        """Process one training mini-batch and log its loss as ``train_loss``."""
        step_result = self._run_batch(batch)
        self.log('train_loss', step_result['loss'])
        return step_result

    def validation_step(self, batch, batch_idx):
        """Process one validation mini-batch (same computation as training)."""
        return self._run_batch(batch)

    def test_step(self, batch, batch_idx):
        """Process one test mini-batch (same computation as training)."""
        return self._run_batch(batch)

    def validation_epoch_end(self, outputs, mode="val"):
        """Log loss and accuracy over all step outputs of an epoch.

        Args:
            outputs: List of the dicts returned by the step methods.
            mode: Prefix of the logged metric names
                (``{mode}_loss`` and ``{mode}_accuracy``).
        """
        all_logits = torch.cat([step['batch_preds'] for step in outputs])
        all_labels = torch.cat([step['batch_labels'] for step in outputs])

        # Loss recomputed over the concatenated epoch predictions.
        self.log(f"{mode}_loss", self.criterion(all_logits, all_labels), logger=True)

        # Fraction of samples whose arg-max class equals the label.
        hits = (all_logits.argmax(dim=1) == all_labels).sum().item()
        self.log(f"{mode}_accuracy", hits / len(all_labels), logger=True)

    def test_epoch_end(self, outputs):
        """Log test loss and accuracy by reusing the validation routine."""
        return self.validation_epoch_end(outputs, "test")

    def configure_optimizers(self):
        """Return the Adam optimizer used for training."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [19]:
# Label values indexed by the classifier's output class: ans_labels[k] is the
# value reported when class k has the highest score.
# NOTE(review): the order presumably matches the label encoding used when the
# checkpoint was trained -- confirm against the training notebook.
ans_labels = [523871, 523872, 523873, 523874, 523875, 523876, 523877, 523970,
       523971, 523972, 523973, 523974, 523975, 523976, 523977, 533800,
       533802, 533803, 533804, 533805, 533806, 533807, 533811, 533812,
       533813, 533814, 533815, 533816, 533817, 533820, 533821, 533822,
       533823, 533824, 533825, 533826, 533827, 533830, 533831, 533832,
       533833, 533834, 533835, 533836, 533837, 533840, 533841, 533842,
       533843, 533844, 533845, 533846, 533847, 533850, 533851, 533852,
       533853, 533854, 533855, 533856, 533857, 533860, 533861, 533862,
       533863, 533864, 533865, 533866, 533867, 533870, 533871, 533872,
       533873, 533874, 533875, 533876, 533877, 533900, 533901, 533902,
       533903, 533904, 533905, 533906, 533907, 533910, 533911, 533912,
       533913, 533914, 533915, 533916, 533917, 533920, 533921, 533922,
       533923, 533924, 533925, 533926, 533927, 533930, 533931, 533932,
       533933, 533934, 533935, 533936, 533937, 533940, 533941, 533942,
       533943, 533944, 533945, 533946, 533947, 533950, 533951, 533952,
       533953, 533954, 533955, 533956, 533957, 533960, 533961, 533962,
       533963, 533964, 533965, 533966, 533967, 533970, 533971, 533972,
       533973, 533974, 533975, 533976, 533977, 543800, 543801, 543802,
       543803, 543804, 543805, 543806, 543807, 543810, 543811, 543812,
       543813, 543814, 543815, 543816, 543817, 543820, 543821, 543822,
       543823, 543824, 543825, 543826, 543827, 543837, 543900, 543901,
       543902, 543903, 543904, 543905, 543906, 543907, 543910, 543911,
       543912, 543913, 543914, 543915, 543916, 543917, 543920, 543921,
       543922, 543923, 543924, 543925, 543926, 543927, 544010, 544020]

# Cast to str so a missing / non-string cell cannot make the tokenizer raise
# (same treatment the one-month cell applies to its text column). The frame
# itself is left untouched.
sentences_text_test = df_one_day.text.astype(str).values
estimation = []

model = BertForSequenceClassifier_pl.load_from_checkpoint(code_estimate_model_path)
# Inference only: switch every sub-module to eval mode explicitly so the
# predictions never depend on dropout being active.
model.eval()
bert = model.bert.cuda()
classifier = model.classifier.cuda()

# No gradients are needed anywhere in the loop.
with torch.no_grad():
    for sentence in tqdm(sentences_text_test, desc='code estimation (one day)'):
        # One sentence per call: truncate to at most 107 tokens. The deprecated
        # `pad_to_max_length` flag was dropped because the explicit `padding`
        # argument already decides the padding strategy.
        encoding = tokenizer(
            sentence,
            max_length=107,
            truncation=True,
            padding='longest',
            return_tensors='pt')
        encoding = {k: v.cuda() for k, v in encoding.items()}

        output = bert(**encoding)
        logits = classifier(output.pooler_output)

        # Highest-scoring class index -> label value.
        class_index = int(np.argmax(logits.cpu().numpy()))
        estimation.append(ans_labels[class_index])

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Attach the per-row estimates to df_one_day as the 'code_estimation' column.
# Building the Series on the frame's own index pairs values with rows by
# position (a length mismatch raises instead of silently producing NaN), and
# assigning by column name keeps this cell idempotent: re-running it
# overwrites the column rather than appending a duplicate one.
estimation = pd.Series(estimation, index=df_one_day.index, name='code_estimation')
df_one_day = df_one_day.assign(code_estimation=estimation)

In [27]:
# df_one_day was read from 0801.csv, so its estimates are written to the
# matching 0801_* file (the previous target was the 08month_* file name).
df_one_day.to_csv('/home/is/shuntaro-o/dev/persons_move_analysis/data/0801_ad_estimation.csv')

In [29]:
# Label values indexed by the classifier's output class: ans_labels[k] is the
# value reported when class k has the highest score.
# NOTE(review): the order presumably matches the label encoding used when the
# checkpoint was trained -- confirm against the training notebook.
ans_labels = [523871, 523872, 523873, 523874, 523875, 523876, 523877, 523970,
       523971, 523972, 523973, 523974, 523975, 523976, 523977, 533800,
       533802, 533803, 533804, 533805, 533806, 533807, 533811, 533812,
       533813, 533814, 533815, 533816, 533817, 533820, 533821, 533822,
       533823, 533824, 533825, 533826, 533827, 533830, 533831, 533832,
       533833, 533834, 533835, 533836, 533837, 533840, 533841, 533842,
       533843, 533844, 533845, 533846, 533847, 533850, 533851, 533852,
       533853, 533854, 533855, 533856, 533857, 533860, 533861, 533862,
       533863, 533864, 533865, 533866, 533867, 533870, 533871, 533872,
       533873, 533874, 533875, 533876, 533877, 533900, 533901, 533902,
       533903, 533904, 533905, 533906, 533907, 533910, 533911, 533912,
       533913, 533914, 533915, 533916, 533917, 533920, 533921, 533922,
       533923, 533924, 533925, 533926, 533927, 533930, 533931, 533932,
       533933, 533934, 533935, 533936, 533937, 533940, 533941, 533942,
       533943, 533944, 533945, 533946, 533947, 533950, 533951, 533952,
       533953, 533954, 533955, 533956, 533957, 533960, 533961, 533962,
       533963, 533964, 533965, 533966, 533967, 533970, 533971, 533972,
       533973, 533974, 533975, 533976, 533977, 543800, 543801, 543802,
       543803, 543804, 543805, 543806, 543807, 543810, 543811, 543812,
       543813, 543814, 543815, 543816, 543817, 543820, 543821, 543822,
       543823, 543824, 543825, 543826, 543827, 543837, 543900, 543901,
       543902, 543903, 543904, 543905, 543906, 543907, 543910, 543911,
       543912, 543913, 543914, 543915, 543916, 543917, 543920, 543921,
       543922, 543923, 543924, 543925, 543926, 543927, 544010, 544020]

# The tokenizer only accepts strings; coerce the whole column up front.
df_one_month["text"] = df_one_month["text"].astype(str)
sentences_text_test = df_one_month.text.values
estimation = []

model = BertForSequenceClassifier_pl.load_from_checkpoint(code_estimate_model_path)
# Inference only: switch every sub-module to eval mode explicitly so the
# predictions never depend on dropout being active.
model.eval()
bert = model.bert.cuda()
classifier = model.classifier.cuda()

# No gradients are needed anywhere in the loop.
with torch.no_grad():
    for sentence in tqdm(sentences_text_test, desc='code estimation (one month)'):
        # One sentence per call: truncate to at most 107 tokens. The deprecated
        # `pad_to_max_length` flag was dropped because the explicit `padding`
        # argument already decides the padding strategy.
        encoding = tokenizer(
            sentence,
            max_length=107,
            truncation=True,
            padding='longest',
            return_tensors='pt')
        encoding = {k: v.cuda() for k, v in encoding.items()}

        output = bert(**encoding)
        logits = classifier(output.pooler_output)

        # Highest-scoring class index -> label value.
        class_index = int(np.argmax(logits.cpu().numpy()))
        estimation.append(ans_labels[class_index])

# Attach the estimates as the 'code_estimation' column. The Series is built on
# the frame's own index (a length mismatch raises instead of producing NaN)
# and assigned by name, so re-running the cell overwrites the column instead
# of appending a duplicate.
estimation = pd.Series(estimation, index=df_one_month.index, name='code_estimation')
df_one_month = df_one_month.assign(code_estimation=estimation)

# df_one_month was read from 08month.csv, so its estimates are written to the
# matching 08month_* file (the previous target was the 0801_* file name).
df_one_month.to_csv('/home/is/shuntaro-o/dev/persons_move_analysis/data/08month_ad_estimation.csv')

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
