In [1]:
import os
import numpy as np
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
from sklearn.model_selection import train_test_split
import torch
import random
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
import pytorch_lightning as pl
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers import BertModel
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Checkpoint name used for the tokenizer here and for the encoder further down.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# NOTE(review): transformers warns that this checkpoint declares
# 'BertJapaneseTokenizer' while it is loaded through 'BertTokenizer'.
# Confirm which tokenizer the fine-tuned checkpoints were trained with
# before switching classes, since inference must tokenize the same way.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

batch = 1

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [7]:
# Read the table whose `text` column is classified further down.
# engine='python' selects pandas' Python parsing engine instead of the default C one.
df_one_month = pd.read_csv('/home/is/shuntaro-o/dev/persons_move_analysis/data/2022101month_per_hour_noGeo.csv',engine='python')

In [2]:
# Re-creates the same tokenizer that the first cell already builds (same class, same checkpoint name).
tokenizer = BertTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
# Unused earlier checkpoint path, kept for reference only:
#code_estimate_model_path = '/home/is/shuntaro-o/SharedTask_main/地理コード/Tokyo/model/epoch=2-step=2400000.ckpt'

class BertForSequenceClassifier_pl(pl.LightningModule):
    """BERT encoder with a single linear classification head.

    Every parameter of the encoder is frozen except those of its last
    encoder layer; the linear head on top of the pooled output is trainable.
    """

    def __init__(self, model_name, lr, num_class):
        """Build the encoder, the head and the loss.

        Args:
            model_name: Name or path handed to ``BertModel.from_pretrained``.
            lr: Learning rate used by the Adam optimizer.
            num_class: Number of output classes of the linear head.
        """
        super().__init__()
        # Keeps model_name, lr and num_class under self.hparams
        # (e.g. self.hparams.lr) and stores them in checkpoints.
        self.save_hyperparameters()

        # Load the BERT encoder and put a linear classifier on top of it.
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_class)
        self.criterion = nn.CrossEntropyLoss()

        # Freeze the whole encoder, then re-enable gradients for the last
        # encoder layer only.
        for param in self.bert.parameters():
            param.requires_grad = False
        for param in self.bert.encoder.layer[-1].parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Returns:
            ``(loss, preds)`` where ``preds`` are the raw classifier outputs
            and ``loss`` is the cross-entropy against ``labels``, or the
            integer ``0`` when ``labels`` is None.
        """
        output = self.bert(input_ids, attention_mask=attention_mask)
        preds = self.classifier(output.pooler_output)
        loss = 0
        if labels is not None:
            loss = self.criterion(preds, labels)
        return loss, preds

    def _shared_step(self, batch):
        """Forward one mini-batch and package loss, outputs and labels."""
        # Call the module itself rather than .forward() so that registered
        # hooks are honoured.
        loss, preds = self(input_ids=batch["input_ids"],
                           attention_mask=batch["attention_mask"],
                           labels=batch["labels"])
        return {'loss': loss,
                'batch_preds': preds,
                'batch_labels': batch["labels"]}

    def training_step(self, batch, batch_idx):
        """Process one training mini-batch and log its loss as 'train_loss'."""
        step_output = self._shared_step(batch)
        self.log('train_loss', step_output['loss'])
        return step_output

    def validation_step(self, batch, batch_idx):
        """Process one validation mini-batch (same computation as training, no logging)."""
        return self._shared_step(batch)

    def test_step(self, batch, batch_idx):
        """Process one test mini-batch (same computation as training, no logging)."""
        return self._shared_step(batch)

    # NOTE(review): the `*_epoch_end` hooks belong to the Lightning 1.x API and
    # were removed in 2.0 -- confirm the installed version.
    def validation_epoch_end(self, outputs, mode="val"):
        """Log loss and accuracy over all step outputs of an epoch.

        Args:
            outputs: Sequence of the dicts returned by the step methods.
            mode: Prefix of the logged metric names ("{mode}_loss",
                "{mode}_accuracy").
        """
        # Loss over the concatenated outputs of the whole epoch.
        epoch_preds = torch.cat([x['batch_preds'] for x in outputs])
        epoch_labels = torch.cat([x['batch_labels'] for x in outputs])
        epoch_loss = self.criterion(epoch_preds, epoch_labels)
        self.log(f"{mode}_loss", epoch_loss, logger=True)

        # Accuracy: share of rows whose argmax matches the label.
        num_correct = (epoch_preds.argmax(dim=1) == epoch_labels).sum().item()
        epoch_accuracy = num_correct / len(epoch_labels)
        self.log(f"{mode}_accuracy", epoch_accuracy, logger=True)

    def test_epoch_end(self, outputs):
        """Log test loss and accuracy by reusing the validation logic."""
        return self.validation_epoch_end(outputs, "test")

    def configure_optimizers(self):
        """Return the Adam optimizer over this module's parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [3]:
# Checkpoint files of the two classifiers; both live in the same directory.
_MODEL_DIR = "/home/is/shuntaro-o/dev/SharedTask_FlaskApp/Flask/app/models"
code_estimate_model_path_japan = f"{_MODEL_DIR}/japan_model.ckpt"
code_estimate_model_path_Tokyo = f"{_MODEL_DIR}/tokyo.ckpt"

In [12]:
# Lookup from classifier output index to code: position i holds the code
# reported when the argmax of the classifier output is i.
# Rows are grouped by the leading two digits; order is unchanged (ascending).
# NOTE(review): presumably 4-digit primary grid-square (mesh) codes -- confirm.
arg2mesh_japan = [
    3927, 3928, 3933, 3942, 3945,
    4027, 4028, 4033, 4037, 4040, 4042, 4043,
    4128, 4129, 4130, 4133, 4134, 4135, 4140, 4142, 4143,
    4228, 4229, 4231, 4233, 4236, 4243,
    4330, 4331, 4332, 4340,
    4428, 4432, 4442,
    4530, 4531, 4532, 4539, 4540, 4541, 4542,
    4628, 4630, 4631, 4632, 4639, 4640, 4641,
    4729, 4730, 4731, 4732, 4739,
    4828, 4829, 4830, 4831, 4832, 4837, 4838, 4841, 4842, 4844, 4845,
    4928, 4929, 4930, 4931, 4932, 4933, 4934, 4937, 4938, 4939, 4940,
    5029, 5030, 5031, 5032, 5033, 5034, 5035, 5036, 5037, 5039, 5040, 5041, 5042,
    5128, 5129, 5130, 5131, 5132, 5133, 5134, 5135, 5136, 5137, 5138, 5139, 5140, 5141, 5142, 5144,
    5227, 5228, 5229, 5231, 5232, 5233, 5234, 5235, 5236, 5237, 5238, 5239, 5240, 5241, 5242, 5243, 5244,
    5332, 5333, 5334, 5335, 5336, 5337, 5338, 5339, 5340, 5341, 5342,
    5434, 5436, 5437, 5438, 5439, 5440, 5441, 5442, 5444, 5445,
    5536, 5537, 5538, 5539, 5540, 5541, 5542, 5543, 5544, 5545,
    5630, 5636, 5637, 5638, 5639, 5640, 5641, 5642, 5643, 5644, 5645,
    5736, 5738, 5739, 5740, 5741, 5742, 5743, 5744, 5745,
    5839, 5840, 5841, 5842, 5843, 5844,
    5935, 5937, 5939, 5940, 5941, 5942, 5943, 5944, 5945,
    6038, 6039, 6040, 6041, 6042, 6043, 6044,
    6139, 6140, 6141, 6142, 6143,
    6239, 6240, 6241, 6242, 6243, 6244,
    6339, 6340, 6341, 6342, 6343, 6344, 6345,
    6432, 6437, 6439, 6440, 6441, 6442, 6443, 6444, 6445, 6446,
    6537, 6540, 6541, 6542, 6543, 6544, 6545,
    6636, 6640, 6641, 6642, 6643, 6644, 6645,
    6741, 6742, 6743, 6745,
    6830, 6831, 6840, 6841, 6843,
]
# Predict one code per row of df_one_month["text"] with the Japan-wide checkpoint.
# Cast first so that every cell handed to the tokenizer is a str.
df_one_month["text"] = df_one_month["text"].astype(str)
sentences_text_test = df_one_month.text.values
estimation = []

model = BertForSequenceClassifier_pl.load_from_checkpoint(code_estimate_model_path_japan)
# Inference only: force evaluation mode so dropout is disabled regardless of
# the mode the checkpoint loader leaves the module in.
model.eval()
bert = model.bert.cuda()
classifier = model.classifier.cuda()

# One no_grad scope around the whole loop; no autograd graph is ever needed.
with torch.no_grad():
    for sentence in sentences_text_test:
        # A single sentence is encoded per call, so padding='longest' adds no
        # padding; inputs longer than 107 tokens are truncated. The deprecated
        # pad_to_max_length flag was dropped because it contradicted the
        # explicit `padding` argument.
        encoding = tokenizer(
            sentence,
            max_length=107,
            truncation=True,
            padding='longest',
            return_tensors='pt',
        )
        encoding = {k: v.cuda() for k, v in encoding.items()}
        output = bert(**encoding)
        logits = classifier(output.pooler_output)
        # Index of the highest-scoring class, mapped to its code.
        ans = int(logits.argmax(dim=-1).item())
        estimation.append(arg2mesh_japan[ans])

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Append the predictions as a new column. concat(axis=1) aligns on the index
# and names the unnamed Series column 0, which is then renamed.
estimation = pd.Series(estimation)
df_one_month = pd.concat([df_one_month, estimation], axis=1).rename(
    columns={0: 'code_estimation'}
)

In [4]:
# Replace the in-memory frame with a CSV that already contains a `code_estimation` column
# (see the rendered frame in the following cell).
df_one_month = pd.read_csv('/home/is/shuntaro-o/dev/persons_move_analysis/data/202210noGeo_ad_estimate.csv')

In [5]:
# Display the loaded frame (pandas abbreviates the rendering of long frames).
df_one_month

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,created_at,retweets,replies,likes,quote_count,author_id,username,author_followers,author_tweets,author_description,author_location,code_estimation
0,0,0,RT @midotakahaokazu: 10月になりました！サークル参加、お気に入りありが...,2022-10-01_00:59:58,22.0,0.0,0.0,0.0,1137201009669705728,yumehara3,33.0,3627,I'm an artist🇻🇳 and like Kuroko no basket🏀 i ...,,5339
1,1,1,一回だけ30代の主婦のドMさん来たんだけど、次の日には垢消えてた😇,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,433635408,namexlove,56.0,259,30代の男です。DMやカカオで痴態を見せてくれる女の子募集中。年齢、体型は問いません。学生さ...,,5339
2,2,2,みんなからの匿名質問を募集中！こんな質問に答えてるよ● は？● 修行僧になったきっかけは？●...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,824221871281295360,mattyan_san,173.0,2707,囲碁もやる,家,5339
3,3,3,みんなからの匿名質問を募集中！こんな質問に答えてるよ● 今一番欲しいものは何？● アニメキャ...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,713935985516490752,hassy6119,1521.0,5676,音ゲーやイベント、カフェやボランティア好きです。よろしくお願いします。音ゲーリザルトや日常な...,,5339
4,4,4,350連で４枚目出たので最後コインで交換して５凸完了です！！２回目の天井いかなくてよかった…...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,851766242016845824,icho5610,154.0,3425,舞台系のグッズ等の交換譲渡、お取引以外のこともつぶやきます。2.5中心に東宝ミュージカルなど...,,5339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360833,360833,486,RT @anime_toman: ／🔥聖夜決戦編🔥 PV公開！＼新キャラクターたちが続々登...,2022-10-31_23:59:58,46135.0,0.0,0.0,0.0,1497120190588682243,amagi_puka_,1.0,368,一目惚れしました @333akina @02xv_,海洋生物部🐳,5339
360834,360834,487,RT @Tomorrow01: ルウタ結婚式カチコミまとめ①（代替テキストに裏話） #今月描...,2022-10-31_23:59:58,363.0,0.0,0.0,0.0,911937886986964993,Nemo175yugioh,366.0,60833,白猫、DBHやってます。無言フォローすることあるかもしれませんがよろしくです。（バトスタ25...,日本,5339
360835,360835,488,RT @Syunsaeki: 10周年ということで久々に描いてみました！ https://t...,2022-10-31_23:59:58,2563.0,0.0,0.0,0.0,1154241397,eternalfire4s,366.0,276175,漫画、アニメ、ゲーム、絵描き、特撮、食べ歩き、娯楽大好きです‼︎人生は楽しんだモン勝ち‼︎フ...,日本,5339
360836,360836,489,「線は、僕を描く」水墨画に初めて触れたけど、エンドロールも含めて、終始キレイ。葛藤を抱えなが...,2022-10-31_23:59:58,0.0,0.0,5.0,0.0,1372139551410429953,hori_teinei,78.0,143,ていねい通販／社会人4年目／CRM／SNS運用・分析担当／Web解析士／文章力・伝える力を鍛える,,5339


In [14]:
# Keep only the rows whose predicted code is 5339 (one of the entries of arg2mesh_japan).
df_Tokyo = df_one_month.loc[df_one_month["code_estimation"].eq(5339)]

In [15]:
# Renumber rows 0..n-1 after filtering; the later pd.concat(axis=1) with a
# default-indexed Series of predictions aligns on this index.
df_Tokyo = df_Tokyo.reset_index(drop=True)

In [16]:
# Display the filtered frame (pandas abbreviates the rendering of long frames).
df_Tokyo

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,created_at,retweets,replies,likes,quote_count,author_id,username,author_followers,author_tweets,author_description,author_location,code_estimation
0,0,0,RT @midotakahaokazu: 10月になりました！サークル参加、お気に入りありが...,2022-10-01_00:59:58,22.0,0.0,0.0,0.0,1137201009669705728,yumehara3,33.0,3627,I'm an artist🇻🇳 and like Kuroko no basket🏀 i ...,,5339
1,1,1,一回だけ30代の主婦のドMさん来たんだけど、次の日には垢消えてた😇,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,433635408,namexlove,56.0,259,30代の男です。DMやカカオで痴態を見せてくれる女の子募集中。年齢、体型は問いません。学生さ...,,5339
2,2,2,みんなからの匿名質問を募集中！こんな質問に答えてるよ● は？● 修行僧になったきっかけは？●...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,824221871281295360,mattyan_san,173.0,2707,囲碁もやる,家,5339
3,3,3,みんなからの匿名質問を募集中！こんな質問に答えてるよ● 今一番欲しいものは何？● アニメキャ...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,713935985516490752,hassy6119,1521.0,5676,音ゲーやイベント、カフェやボランティア好きです。よろしくお願いします。音ゲーリザルトや日常な...,,5339
4,4,4,350連で４枚目出たので最後コインで交換して５凸完了です！！２回目の天井いかなくてよかった…...,2022-10-01_00:59:58,0.0,0.0,0.0,0.0,851766242016845824,icho5610,154.0,3425,舞台系のグッズ等の交換譲渡、お取引以外のこともつぶやきます。2.5中心に東宝ミュージカルなど...,,5339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337800,360833,486,RT @anime_toman: ／🔥聖夜決戦編🔥 PV公開！＼新キャラクターたちが続々登...,2022-10-31_23:59:58,46135.0,0.0,0.0,0.0,1497120190588682243,amagi_puka_,1.0,368,一目惚れしました @333akina @02xv_,海洋生物部🐳,5339
337801,360834,487,RT @Tomorrow01: ルウタ結婚式カチコミまとめ①（代替テキストに裏話） #今月描...,2022-10-31_23:59:58,363.0,0.0,0.0,0.0,911937886986964993,Nemo175yugioh,366.0,60833,白猫、DBHやってます。無言フォローすることあるかもしれませんがよろしくです。（バトスタ25...,日本,5339
337802,360835,488,RT @Syunsaeki: 10周年ということで久々に描いてみました！ https://t...,2022-10-31_23:59:58,2563.0,0.0,0.0,0.0,1154241397,eternalfire4s,366.0,276175,漫画、アニメ、ゲーム、絵描き、特撮、食べ歩き、娯楽大好きです‼︎人生は楽しんだモン勝ち‼︎フ...,日本,5339
337803,360836,489,「線は、僕を描く」水墨画に初めて触れたけど、エンドロールも含めて、終始キレイ。葛藤を抱えなが...,2022-10-31_23:59:58,0.0,0.0,5.0,0.0,1372139551410429953,hori_teinei,78.0,143,ていねい通販／社会人4年目／CRM／SNS運用・分析担当／Web解析士／文章力・伝える力を鍛える,,5339


In [17]:
# Lookup from classifier output index to code for the Tokyo model: position i
# holds the code reported when the argmax of the classifier output is i.
# Rows are grouped by the leading five digits; order is unchanged (ascending).
# NOTE(review): presumably 6-digit secondary grid-square (mesh) codes -- confirm.
arg2mesh_Tokyo = [
    523871, 523872, 523873, 523874, 523875, 523876, 523877,
    523970, 523971, 523972, 523973, 523974, 523975, 523976, 523977,
    533800, 533802, 533803, 533804, 533805, 533806, 533807,
    533811, 533812, 533813, 533814, 533815, 533816, 533817,
    533820, 533821, 533822, 533823, 533824, 533825, 533826, 533827,
    533830, 533831, 533832, 533833, 533834, 533835, 533836, 533837,
    533840, 533841, 533842, 533843, 533844, 533845, 533846, 533847,
    533850, 533851, 533852, 533853, 533854, 533855, 533856, 533857,
    533860, 533861, 533862, 533863, 533864, 533865, 533866, 533867,
    533870, 533871, 533872, 533873, 533874, 533875, 533876, 533877,
    533900, 533901, 533902, 533903, 533904, 533905, 533906, 533907,
    533910, 533911, 533912, 533913, 533914, 533915, 533916, 533917,
    533920, 533921, 533922, 533923, 533924, 533925, 533926, 533927,
    533930, 533931, 533932, 533933, 533934, 533935, 533936, 533937,
    533940, 533941, 533942, 533943, 533944, 533945, 533946, 533947,
    533950, 533951, 533952, 533953, 533954, 533955, 533956, 533957,
    533960, 533961, 533962, 533963, 533964, 533965, 533966, 533967,
    533970, 533971, 533972, 533973, 533974, 533975, 533976, 533977,
    543800, 543801, 543802, 543803, 543804, 543805, 543806, 543807,
    543810, 543811, 543812, 543813, 543814, 543815, 543816, 543817,
    543820, 543821, 543822, 543823, 543824, 543825, 543826, 543827,
    543837,
    543900, 543901, 543902, 543903, 543904, 543905, 543906, 543907,
    543910, 543911, 543912, 543913, 543914, 543915, 543916, 543917,
    543920, 543921, 543922, 543923, 543924, 543925, 543926, 543927,
    544010,
    544020,
]
# Predict one code per row of df_Tokyo["text"] with the Tokyo checkpoint.
# Cast first so that every cell handed to the tokenizer is a str.
df_Tokyo["text"] = df_Tokyo["text"].astype(str)
sentences_text_test = df_Tokyo.text.values
estimation = []

model = BertForSequenceClassifier_pl.load_from_checkpoint(code_estimate_model_path_Tokyo)
# Inference only: force evaluation mode so dropout is disabled regardless of
# the mode the checkpoint loader leaves the module in.
model.eval()
bert = model.bert.cuda()
classifier = model.classifier.cuda()

# One no_grad scope around the whole loop; no autograd graph is ever needed.
with torch.no_grad():
    for sentence in sentences_text_test:
        # A single sentence is encoded per call, so padding='longest' adds no
        # padding; inputs longer than 107 tokens are truncated. The deprecated
        # pad_to_max_length flag was dropped because it contradicted the
        # explicit `padding` argument.
        encoding = tokenizer(
            sentence,
            max_length=107,
            truncation=True,
            padding='longest',
            return_tensors='pt',
        )
        encoding = {k: v.cuda() for k, v in encoding.items()}
        output = bert(**encoding)
        logits = classifier(output.pooler_output)
        # Index of the highest-scoring class, mapped to its code.
        ans = int(logits.argmax(dim=-1).item())
        estimation.append(arg2mesh_Tokyo[ans])

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Wrap the predictions in a Series (default 0..n-1 index) so they can be concatenated as a column.
estimation=pd.Series(estimation)

In [20]:
# Attach the Tokyo-model predictions. The unnamed Series becomes column 0 in
# the concat, so give it the same explicit name that the later cell writing
# this file uses, instead of leaving an integer column label.
df = pd.concat([df_Tokyo, estimation], axis=1).rename(columns={0: 'code_estimation_Tokyo'})

In [22]:
# index=False: the row index is a plain 0..n-1 counter; writing it makes every
# reload add another "Unnamed: 0" column (already visible in the frames above).
df.to_csv("/home/is/shuntaro-o/dev/persons_move_analysis/data/202210noGeo_ad_estimate_Tokyo.csv", index=False)

In [24]:
# Renumber rows 0..n-1 again so the index matches the default index of the predictions Series.
# NOTE(review): an identical reset_index already runs right after the filter -- this one looks redundant; confirm.
df_Tokyo = df_Tokyo.reset_index(drop=True)

In [None]:
# Append the Tokyo-model predictions as a new column. concat(axis=1) aligns on
# the index and names the unnamed Series column 0, which is then renamed.
estimation = pd.Series(estimation)
df_Tokyo = pd.concat([df_Tokyo, estimation], axis=1).rename(
    columns={0: 'code_estimation_Tokyo'}
)

In [None]:
# index=False: the row index is a plain 0..n-1 counter; writing it makes every
# reload add another "Unnamed: 0" column (already visible in the frames above).
df_Tokyo.to_csv('/home/is/shuntaro-o/dev/persons_move_analysis/data/202210noGeo_ad_estimate_Tokyo.csv', index=False)