In [1]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import os
import shutil
from scripts import ebay
import argparse
import torch
import numpy as np
import pandas as pd
import time
import re
from PIL import Image
import swifter
import glob
import mmbt.train_sales_forecast as train
from mmbt.data import *
from mmbt.data.helpers import get_data_loaders
from mmbt.data.helpers import get_data_loaders_for_production
from sklearn.preprocessing import LabelEncoder
#最大表示列数の指定（ここでは50列を指定）
pd.set_option('display.max_columns', 55)

  from tqdm.autonotebook import tqdm


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


# 関数定義

In [2]:
# 開けない画像がないか調査
def error_imagefile_serch(fullpath):
    path = './datasets/'+ fullpath
    try:
        image = Image.open(path)
        return 0
    except:
        return 1
    
# pickleを保存
def save_pickle(obj, path):
    with open(path, mode='wb') as f:
        pickle.dump(obj, f)

# pickleをロード
def load_pickle(path):
    with open(path, mode='rb') as f:
        obj = pickle.load(f)
    return obj

# 学習データ用の前処理
def get_preprocessed_data(df,target_col):
    
    
    # 前処理
    df_tmp = df
    # カテゴリー分類の際は、IDを指定する
    if target_col == 'ctg':
        df_tmp.ctg = df_tmp.ctg.astype(str)
#             df_tmp = df_tmp[df_tmp['ctg'].isin(['69528','183454','180506','169291','15687'])]
        df_tmp = df_tmp[df_tmp['ctg'].isin(['45258','15709','31387','52357'])]
        
    df_tmp["fullpath"] = df_tmp.itemid.apply(lambda x: "images/" + str(x) + ".jpg")

    df_tmp['error_imgfile_flg'] = df_tmp['fullpath'].swifter.apply(lambda x:error_imagefile_serch(x))

    df_tmp = df_tmp[df_tmp['error_imgfile_flg']==0]
    df_tmp["label"] = df_tmp[target_col]
    df_tmp['img'] = './datasets/' + df_tmp['fullpath']
    df_tmp["text"] = df_tmp['title'].str.lower()
    df_tmp["text"] = df_tmp['text'].apply(lambda x: " ".join(re.findall(r"[ぁ-んァ-ン一-龥ー'\da-zA-Z\-]+", x)))


    df_preprocessed = df_tmp.loc[:,['ctg_name','label','text','img']].reset_index(drop=True)
    df_preprocessed['key']=df_preprocessed.index

        

    
    return df_preprocessed



# 全ての画像ファイルのパスを取得
def get_image_path_list(df):
    df['img'] = './datasets/'+target_dataset+'/' + df['fullpath']
    df['img_folda'] = df['img'].apply(lambda x:os.path.split(x)[0])
    df['img_file_path_list'] = df['img_folda'].apply(lambda x:(glob.glob(x)))
    return df

# train,val,testデータを取得
def get_dataset(df_preprocessed):
    # 学習データとテストデータに分ける
    df_train_dev,df_test = train_test_split(df_preprocessed, random_state=0,stratify=df_preprocessed.label)
    # 最小データ数でダウンサンプリング
    minimum_num = df_train_dev['label'].value_counts().min()
    # ラベル毎に最小データ数だけサンプリング
    dfs = [d.sample(minimum_num, random_state=0) for name, d in df_train_dev.groupby('label')]
    # 結合。ラベル順に並んでいるのでshuffleする
    under_resampled_df = pd.concat(dfs).sample(frac=1, random_state=0)
    df_train_dev = under_resampled_df
    df_train, df_dev = train_test_split(df_train_dev,stratify=df_train_dev.label)
    data_dict = {"train": df_train, "val": df_dev,'test':df_test}

    return data_dict


# aucを取得
def get_roc_auc_score(detail_list):
    y = detail_list[2]
    pred = detail_list[3]
    pred_0 = [x[1] for x in pred]
    cleaned_pred_0 = [0 if str(x) == 'nan' else x for x in pred_0]
    return roc_auc_score(y, cleaned_pred_0)

# モデルのメタデータと評価をdfにまとめる
def get_df_result(classification_report):
    df_classification_report= pd.DataFrame(classification_report).transpose()
    acc = df_classification_report.loc['accuracy']['support']
    macro_f1 = df_classification_report.loc['macro avg']['f1-score']
    model_name = 'model'+save_model_name+'.pt'
    result_dict={'model_name':model_name,'target_col':target_col,
                'batch_sz':batch_sz,'macro_f1':macro_f1,'accuracy':acc}
    df_result = pd.DataFrame.from_dict(result_dict, orient='index').T
    return df_result

def save_classification_report(classification_report):
    classification_report_dir = './classification_report/'
    result_dir = './result/'
    if not os.path.exists(classification_report_dir):
        print("ディレクトリを作成します")
        os.makedirs(classification_report_dir)
    # classification_reportの保存
    save_pickle(classification_report,classification_report_dir+'/report_'+save_model_name+'.pkl')

In [4]:
# 本番データでの予測の際に使用する関数
# 予測値の確率を出力
def get_preds_score(df,model_name):
    target_dataset = 'ebay_20210822_msg_prediction' # データセット名
    model = 'mmbt'
    bert_model = 'bert-base-uncased' # bertモデル　bert-large-uncased　bert-base-uncased
    image_model = 'resnet152' # imageモデル
    batch_sz = '32' # 不要
    max_epochs = '10' # 不要
    target_col = 'none' # 不要
    

    # データ読み込み
    df = df.loc[:][['mk_title','url','selling_price','category_id']]
    df['title'] = df['mk_title']

    # 前処理
    categoryid = 'category_id'
    price = 'selling_price'
    df_preprocessed = get_preprocessed_data_for_production(df,target_dataset,categoryid,price)
    # データセット作成＆保存
    data_dict = get_dataset_for_production(df_preprocessed)
    # モデルの各種設定
    args = train.get_args(target_col,bert_model,batch_sz,max_epochs,model)
    dataloaders_dict = get_data_loaders_for_production(args,data_dict)

    # 評価
    preds_score = train.production(args,model_name,dataloaders_dict,target_dataset)
    df_preprocessed['preds_score'] = preds_score
    df_preprocessed = df_preprocessed.reset_index(drop=True)
    df_preprocessed
    return df_preprocessed

def get_df_addcol(df):
    df_tmp = df
    df_tmp["fullpath"] = df_tmp.url.apply(lambda x: "images/" + x.split("/")[-2] + ".jpg")
    df_tmp['error_imgfile_flg'] = df_tmp['fullpath'].swifter.apply(lambda x:error_imagefile_serch(x,target_dataset))
    df_tmp['img'] = './datasets/'+target_dataset+'/' + df_tmp['fullpath']
    return df_tmp


def get_first_pred_score(x):
    try:
        code_regex = re.compile('[\'[\]]')
        cleaned_text = code_regex.sub('', x).split()[1]
        return cleaned_text
    except:
        return x
    
def get_pred(x):
    try:
        code_regex = re.compile('[\'[\]]')
        cleaned_x_list = code_regex.sub('', x).split()
        argmax_index = cleaned_x_list.index(max(cleaned_x_list))
        return argmax_index
    except:
        return x

In [3]:
def train_and_report(model):
    args = train.get_args(target_col,bert_model,batch_sz,max_epochs,model,n_classes)
    dataloaders_dict = get_data_loaders(args,data_dict)

    # 学習
    train.train(args,dataloaders_dict)

    # モデルを名前を付けて保存
    shutil.copyfile('./savedir/mmbt_model_run/model_best.pt','./savedir/mmbt_model_run/'+save_model_name+'.pt')

    # 評価
    classification_report,confusion_matrix,detail_list,attention_probs = train.test(args,save_model_name,dataloaders_dict)
    # print(pd.DataFrame(classification_report).transpose())
    # 保存
    save_classification_report(classification_report)

    # 結果を保存
    df_result = get_df_result(classification_report)
    result_dir = './result/'
    save_pickle(df_result,result_dir+'df_result_'+save_model_name+'.pkl')

In [5]:
# カテゴリー番号、カテゴリー名のユニークなテーブルを作成
df_sub = pd.read_table('./datasets/ebay_codes_20210815_message_prediction/data.tsv')
df_uniques = df_sub[["ctg", "ctg_name"]].drop_duplicates() 
# df_uniques.ctg = df_uniques.ctg.astype(str)


# 前処理
df = pd.read_table('./datasets/data.tsv')
df_tmp = df
df_tmp["fullpath"] = df_tmp.img.apply(lambda x: "images/" + str(x))
df_tmp = df_tmp[df_tmp['imgflg'] == True]

# 画像ファイルが存在する行に絞る
# df_tmp['error_imgfile_flg'] = df_tmp['fullpath'].swifter.apply(lambda x:error_imagefile_serch(x))
# df_tmp = df_tmp[df_tmp['error_imgfile_flg']==0]

df_tmp["label"] = df_tmp.category_id
df_tmp['img'] = './datasets/' + df_tmp['fullpath']
df_tmp["text"] = df_tmp['title'].str.lower()
df_tmp["text"] = df_tmp['text'].apply(lambda x: " ".join(re.findall(r"[ぁ-んァ-ン一-龥ー'\da-zA-Z\-]+", x)))
df_tmp = pd.merge(df_tmp,df_uniques,left_on='label',right_on='ctg',how='left')


# カテゴリー数上位10に絞る
df_category_cnt = df_tmp.category_id.value_counts().reset_index()
df_category_cnt_top10 = df_category_cnt.iloc[0:10]
df_category_cnt_top10 = df_category_cnt_top10.drop(columns='category_id')

# カテゴリー数1000件以上に絞る
# df_category_cnt_over1000 = df_tmp.category_id.value_counts().reset_index()
# df_category_cnt_over1000 = df_category_cnt_over1000[df_category_cnt_over1000['category_id']>1000]
# df_category_cnt_over1000 = df_category_cnt_over1000.drop(columns='category_id')

# カテゴリー名を結合
df_tmp = pd.merge(df_category_cnt_top10,df_tmp,left_on='index',right_on='category_id',how='left')
# df_tmp = pd.merge(df_category_cnt_over1000,df_tmp,left_on='index',right_on='category_id',how='left')

df_preprocessed = df_tmp.loc[:,['ctg_name','label','text','img']].reset_index(drop=True)
df_preprocessed['key'] = df_preprocessed.index
df_preprocessed.label= df_preprocessed.label.astype(str)
df_preprocessed = df_preprocessed.dropna(how='any')
df_preprocessed.to_csv('./datasets/data_preprocessed.csv')
df_preprocessed.isnull().any()

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp["label"] = df_tmp.category_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp['img'] = './datasets/' + df_tmp['fullpath']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tmp["text"] = df_tmp['title'].str.lower()
A value is trying to

ctg_name    False
label       False
text        False
img         False
key         False
dtype: bool

# 実行

In [None]:
# 変数設定
target_col = 'ctg' # 目的変数のカラム名
model = 'bert'# mmbt/img/bert/concatbert
bert_model = 'bert-base-uncased' # bertモデル　bert-large-uncased　bert-base-uncased
image_model = 'resnet152' # imageモデル ※将来的にresnet152以外のモデルを使う場合に使用
batch_sz = '4' # 32 バッチサイズ
max_epochs = '20' # 学習回数
n_classes = '10'

# データセット作成＆保存
df_preprocessed = pd.read_csv('./datasets/data_preprocessed.csv', index_col=0,dtype = {'ctg_name':'object', 'label':'object', 'text':'object','img':'object','key':'int'})
df_preprocessed = df_preprocessed.dropna(how='any')
df_preprocessed = df_preprocessed.sample(n=30000)
data_dict = get_dataset(df_preprocessed)
# data_dict = load_pickle('./tmp/data_dict.pkl')
save_pickle(data_dict,'./tmp/data_dict.pkl')
model_list = ['mmbt','bert','img','concatbert']
# model_list = ['concatbert','mmbt'] 
# model_list = ['mmbt']
for model in model_list:
    save_model_name = model+'_'+target_col +'_'+ n_classes+'classes'#保存するモデル名
    train_and_report(model)

INFO - 11/17/21 00:19:23 - 2:18:55 - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\atom\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO - 11/17/21 00:19:23 - 2:18:56 - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\atom\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO - 11/17/21 00:19:24 - 2:18:56 - loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\atom\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO - 11/17/21 00:19:24 - 2:18:56

self.all_head_size 768


INFO - 11/17/21 00:19:28 - 0:00:00 - batch_cnt: 1
                                     batch_sz: 4
                                     bert_model: bert-base-uncased
                                     data_path: ./datasets/food101
                                     drop_img_percent: 0.0
                                     dropout: 0.1
                                     embed_sz: 300
                                     freeze_img: 3
                                     freeze_txt: 5
                                     glove_path: ./glove_embeds/glove.840B.300d.txt
                                     gradient_accumulation_steps: 40
                                     hidden: []
                                     hidden_sz: 768
                                     i_epoch: 1
                                     img_embed_pool_type: avg
                                     img_hidden_sz: 2048
                                     include_bn: True
                               

使用デバイス： cuda:0


INFO - 11/17/21 00:19:28 - 0:00:00 - Training..


学習0回目


INFO - 11/17/21 00:23:54 - 0:04:26 - Train Loss: 0.0374
INFO - 11/17/21 00:23:54 - 0:04:26 - Val: Loss: 0.96349 | Acc: 0.66295


学習1回目


INFO - 11/17/21 00:28:22 - 0:08:54 - Train Loss: 0.0167
INFO - 11/17/21 00:28:22 - 0:08:54 - Val: Loss: 0.37451 | Acc: 0.87103


学習2回目


INFO - 11/17/21 00:33:02 - 0:13:34 - Train Loss: 0.0085
INFO - 11/17/21 00:33:02 - 0:13:34 - Val: Loss: 0.26158 | Acc: 0.91220


学習3回目


INFO - 11/17/21 00:40:23 - 0:20:55 - Train Loss: 0.0060
INFO - 11/17/21 00:40:23 - 0:20:55 - Val: Loss: 0.23473 | Acc: 0.92510


学習4回目


INFO - 11/17/21 00:47:24 - 0:27:56 - Train Loss: 0.0046
INFO - 11/17/21 00:47:24 - 0:27:56 - Val: Loss: 0.24535 | Acc: 0.91939


学習5回目


INFO - 11/17/21 00:54:52 - 0:35:24 - Train Loss: 0.0043
INFO - 11/17/21 00:54:52 - 0:35:24 - Val: Loss: 0.25764 | Acc: 0.91716


学習6回目


INFO - 11/17/21 01:02:21 - 0:42:53 - Train Loss: 0.0030
INFO - 11/17/21 01:02:21 - 0:42:53 - Val: Loss: 0.23941 | Acc: 0.93006


学習7回目


INFO - 11/17/21 01:09:47 - 0:50:19 - Train Loss: 0.0019
INFO - 11/17/21 01:09:47 - 0:50:19 - Val: Loss: 0.22983 | Acc: 0.93651


学習8回目


INFO - 11/17/21 01:17:16 - 0:57:48 - Train Loss: 0.0012
INFO - 11/17/21 01:17:16 - 0:57:48 - Val: Loss: 0.26248 | Acc: 0.93403


学習9回目


INFO - 11/17/21 01:24:42 - 1:05:14 - Train Loss: 0.0009
INFO - 11/17/21 01:24:42 - 1:05:14 - Val: Loss: 0.24551 | Acc: 0.93874


学習10回目


INFO - 11/17/21 01:32:13 - 1:12:45 - Train Loss: 0.0006
INFO - 11/17/21 01:32:13 - 1:12:45 - Val: Loss: 0.26373 | Acc: 0.94122


学習11回目


INFO - 11/17/21 01:39:48 - 1:20:20 - Train Loss: 0.0004
INFO - 11/17/21 01:39:48 - 1:20:20 - Val: Loss: 0.26560 | Acc: 0.93948


学習12回目


INFO - 11/17/21 01:47:33 - 1:28:05 - Train Loss: 0.0002
INFO - 11/17/21 01:47:33 - 1:28:05 - Val: Loss: 0.27715 | Acc: 0.93973


学習13回目


INFO - 11/17/21 01:54:56 - 1:35:28 - Train Loss: 0.0002
INFO - 11/17/21 01:54:56 - 1:35:28 - Val: Loss: 0.27333 | Acc: 0.94271


学習14回目


INFO - 11/17/21 02:02:26 - 1:42:58 - Train Loss: 0.0002
INFO - 11/17/21 02:02:26 - 1:42:58 - Val: Loss: 0.28671 | Acc: 0.94147


学習15回目


INFO - 11/17/21 02:09:48 - 1:50:20 - Train Loss: 0.0001
INFO - 11/17/21 02:09:48 - 1:50:20 - Val: Loss: 0.30033 | Acc: 0.94444


学習16回目


INFO - 11/17/21 02:17:15 - 1:57:47 - Train Loss: 0.0001
INFO - 11/17/21 02:17:15 - 1:57:47 - Val: Loss: 0.30658 | Acc: 0.94370


学習17回目


INFO - 11/17/21 02:24:43 - 2:05:15 - Train Loss: 0.0001
INFO - 11/17/21 02:24:43 - 2:05:15 - Val: Loss: 0.32511 | Acc: 0.94320


学習18回目


INFO - 11/17/21 02:32:06 - 2:12:38 - Train Loss: 0.0001
INFO - 11/17/21 02:32:06 - 2:12:38 - Val: Loss: 0.29964 | Acc: 0.94544


学習19回目


INFO - 11/17/21 02:39:32 - 2:20:04 - Train Loss: 0.0000
INFO - 11/17/21 02:39:32 - 2:20:04 - Val: Loss: 0.29544 | Acc: 0.94643


elapsed_time:8412.821453094482[sec]


INFO - 11/17/21 02:39:39 - 2:20:11 - loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\atom\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO - 11/17/21 02:39:39 - 2:20:11 - extracting archive file C:\Users\atom\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir C:\Users\atom\AppData\Local\Temp\tmpjmp7caet
INFO - 11/17/21 02:39:42 - 2:20:14 - Model config {
                                       "attention_probs_dropout_prob": 0.1,
                                       "hidden_act": "gelu",
                                       "hidden_dropout_prob": 0.1,
                                       "hidden_size": 768,
                                       "initializer_range": 0.02,
              

self.all_head_size 768


INFO - 11/17/21 02:41:27 - 2:21:59 - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\atom\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO - 11/17/21 02:41:28 - 2:22:00 - loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\atom\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO - 11/17/21 02:41:29 - 2:22:01 - loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\atom\.pytorch_pretrained_bert\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO - 11/17/21 02:41:29 - 2:22:01

self.all_head_size 768


INFO - 11/17/21 02:41:32 - 0:00:00 - batch_cnt: 1
                                     batch_sz: 4
                                     bert_model: bert-base-uncased
                                     data_path: ./datasets/food101
                                     drop_img_percent: 0.0
                                     dropout: 0.1
                                     embed_sz: 300
                                     freeze_img: 3
                                     freeze_txt: 5
                                     glove_path: ./glove_embeds/glove.840B.300d.txt
                                     gradient_accumulation_steps: 40
                                     hidden: []
                                     hidden_sz: 768
                                     i_epoch: 1
                                     img_embed_pool_type: avg
                                     img_hidden_sz: 2048
                                     include_bn: True
                               

使用デバイス： cuda:0
学習0回目


INFO - 11/17/21 02:44:36 - 0:03:04 - Train Loss: 0.0248
INFO - 11/17/21 02:44:36 - 0:03:04 - Val: Loss: 0.27729 | Acc: 0.91022


学習1回目


In [None]:
stop

In [None]:
model_list = ['mmbt']
for model in model_list:
    save_model_name = model+'_'+target_col +'_'+ n_classes+'classes'#保存するモデル名
    train_and_report(model)

In [None]:
df_result_bert_ctg_10classes = load_pickle('./result/df_result_bert_ctg_10classes.pkl')
df_result_bert_ctg_10classes

In [None]:
df_result_bert_ctg_34classes = load_pickle('./result/df_result_bert_ctg_34classes.pkl')
df_result_bert_ctg_34classes

In [None]:
df_result_img_ctg_34classes = load_pickle('./result/df_result_img_ctg_34classes.pkl')
df_result_img_ctg_34classes

In [None]:
df_result_bert_ctg_34classes = load_pickle('./result/df_result_bert_ctg_34classes.pkl')
df_result_img_ctg_34classes = load_pickle('./result/df_result_img_ctg_34classes.pkl')
df_result_concatbert_ctg_34classes = load_pickle('./result/df_result_concatbert_ctg_34classes.pkl')
df_result_mmbt_ctg_34classes = load_pickle('./result/df_result_mmbt_ctg_34classes.pkl')
df_result_total = pd.concat([df_result_bert_ctg_34classes,df_result_img_ctg_34classes,df_result_concatbert_ctg_34classes,df_result_mmbt_ctg_34classes])
df_result_total.sort_values('macro_f1',ascending=False)

In [9]:
df_result_bert_ctg_10classes = load_pickle('./result/df_result_bert_ctg_10classes.pkl')
df_result_img_ctg_10classes = load_pickle('./result/df_result_img_ctg_10classes.pkl')
df_result_concatbert_ctg_10classes = load_pickle('./result/df_result_concatbert_ctg_10classes.pkl')
df_result_mmbt_ctg_10classes = load_pickle('./result/df_result_mmbt_ctg_10classes.pkl')
df_result_total = pd.concat([df_result_bert_ctg_10classes,df_result_img_ctg_10classes,df_result_concatbert_ctg_10classes,df_result_mmbt_ctg_10classes])
df_result_total.sort_values('macro_f1',ascending=False)

Unnamed: 0,model_name,target_col,batch_sz,macro_f1,accuracy
0,modelmmbt_ctg_10classes.pt,ctg,4,0.900551,0.9016
0,modelbert_ctg_10classes.pt,ctg,4,0.897096,0.8984
0,modelconcatbert_ctg_10classes.pt,ctg,4,0.881301,0.8816
0,modelimg_ctg_10classes.pt,ctg,4,0.777782,0.7776


In [11]:
df_result_bert_ctg_10classes = load_pickle('./result/df_result_bert_ctg_10classes.pkl')
df_result_img_ctg_10classes = load_pickle('./result/df_result_img_ctg_10classes.pkl')
df_result_concatbert_ctg_10classes = load_pickle('./result/df_result_concatbert_ctg_10classes.pkl')
df_result_mmbt_ctg_10classes = load_pickle('./result/df_result_mmbt_ctg_10classes.pkl')
df_result_total = pd.concat([df_result_bert_ctg_10classes,df_result_img_ctg_10classes,df_result_concatbert_ctg_10classes,df_result_mmbt_ctg_10classes])
df_result_total.sort_values('macro_f1',ascending=False)

Unnamed: 0,model_name,target_col,batch_sz,macro_f1,accuracy
0,modelmmbt_ctg_10classes.pt,ctg,4,0.931266,0.9312
0,modelconcatbert_ctg_10classes.pt,ctg,4,0.927045,0.926667
0,modelbert_ctg_10classes.pt,ctg,4,0.922909,0.924533
0,modelimg_ctg_10classes.pt,ctg,4,0.828503,0.8312


In [4]:
df_result_bert_ctg_10classes = load_pickle('./result/df_result_bert_ctg_10classes.pkl')
df_result_img_ctg_10classes = load_pickle('./result/df_result_img_ctg_10classes.pkl')
df_result_concatbert_ctg_10classes = load_pickle('./result/df_result_concatbert_ctg_10classes.pkl')
df_result_mmbt_ctg_10classes = load_pickle('./result/df_result_mmbt_ctg_10classes.pkl')
df_result_total = pd.concat([df_result_bert_ctg_10classes,df_result_img_ctg_10classes,df_result_concatbert_ctg_10classes,df_result_mmbt_ctg_10classes])
df_result_total.sort_values('macro_f1',ascending=False)

Unnamed: 0,model_name,target_col,batch_sz,macro_f1,accuracy
0,modelmmbt_ctg_10classes.pt,ctg,4,0.939516,0.9396
0,modelconcatbert_ctg_10classes.pt,ctg,4,0.927045,0.926667
0,modelbert_ctg_10classes.pt,ctg,4,0.922909,0.924533
0,modelimg_ctg_10classes.pt,ctg,4,0.828503,0.8312


In [7]:
df_result_mmbt_ctg_10classes = load_pickle('./result/df_result_mmbt_ctg_10classes.pkl')
df_result_mmbt_ctg_10classes

AttributeError: Can't get attribute 'new_block' on <module 'pandas.core.internals.blocks' from 'C:\\Users\\atom\\anaconda3\\lib\\site-packages\\pandas\\core\\internals\\blocks.py'>

In [None]:
df_result_bert_ctg_4classes = load_pickle('./result/df_result_bert_ctg_4classes.pkl')
df_result_img_ctg_4classes = load_pickle('./result/df_result_img_ctg_4classes.pkl')
df_result_concatbert_ctg_4classes = load_pickle('./result/df_result_concatbert_ctg_4classes.pkl')
df_result_mmbt_ctg_4classes = load_pickle('./result/df_result_mmbt_ctg_4classes.pkl')
df_result_total = pd.concat([df_result_bert_ctg_4classes,df_result_img_ctg_4classes,df_result_concatbert_ctg_4classes,df_result_mmbt_ctg_4classes])
df_result_total.sort_values('macro_f1',ascending=False)

In [None]:
data_dict['train']

In [None]:
stop

# モデル単体で実行

In [None]:
# # データ読み込み
# df = pd.read_table('./datasets/data.tsv')
# # 前処理
# df_preprocessed = get_preprocessed_data(df,target_col)
# # データセット作成＆保存
data_dict = get_dataset(df_preprocessed)
# save_pickle(data_dict,'./tmp/data_dict.pkl')

# 変数設定
target_dataset = 'ebay_codes_20210815_message_prediction' # データセット名
target_col = 'ctg' #目的変数のカラム名
model = 'mmbt'# mmbt/img/bert/concatbert
bert_model = 'bert-base-uncased' # bertモデル　bert-large-uncased　bert-base-uncased
image_model = 'resnet152' # imageモデル ※将来的にresnet152以外のモデルを使う場合に使用
batch_sz = '32' #　32 バッチサイズ
max_epochs = '20' # 学習回数
n_classes = '20'
save_model_name = model+'_'+target_col +'_'+ n_classes+'classes'#保存するモデル名

# モデルの各種設定
args = train.get_args(target_col,bert_model,batch_sz,max_epochs,model,n_classes)
dataloaders_dict = get_data_loaders(args,data_dict)

# 学習
train.train(args,dataloaders_dict)

# モデルを名前を付けて保存
shutil.copyfile('./savedir/mmbt_model_run/model_best.pt','./savedir/mmbt_model_run/'+save_model_name+'.pt')

# 評価
classification_report,confusion_matrix,detail_list,attention_probs = train.test(args,save_model_name,dataloaders_dict)
# print(pd.DataFrame(classification_report).transpose())
# 保存
save_classification_report(classification_report)

# 結果を保存
df_result = get_df_result(classification_report)
result_dir = './result/'
save_pickle(df_result,result_dir+'df_result_'+save_model_name+'.pkl')





In [None]:
df_result = get_df_result(classification_report)
result_dir = './result/'
save_pickle(df_result,result_dir+'df_result_'+save_model_name+'.pkl')

In [None]:
classification_report

In [None]:
stop

In [None]:
# 評価
classification_report,confusion_matrix,detail_list = train.test(args,save_model_name,dataloaders_dict,target_dataset)
# print(pd.DataFrame(classification_report).transpose())
# 保存
save_classification_report(classification_report)

# 結果を保存
df_result = get_df_result()
result_dir = './result/'
save_pickle(df_result,result_dir+'result'+save_model_name+'.pkl')

In [None]:
stop

In [None]:
# パラメータ更新を確認
import glob
import os

files = glob.glob("./savedir/mmbt_model_run/*")
target_files =[]
for file in files:
    if 'epoch' in file:
        target_files.append(file)
dfs = []
for file in target_files:
    path = file
    state_dict = load_pickle(path)
    keys_list = list(state_dict.keys())
    d = []
    for k in keys_list:
        p = state_dict[k]
        
        d.append({
            "epoch":os.path.basename(path)[6],
#             "batch_cnt":os.path.basename(path)[6],
            "name": k,
            "dim": p.shape,
            "val": float(torch.sum(p)),
            "requires_grad": p.requires_grad
        })
    vec_df =  pd.DataFrame(d)
    dfs.append(vec_df)
df = pd.concat(dfs, ignore_index=True)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df_s = df.sort_values(['name', 'epoch'], ascending=[True, True])
df_s_gb = df_s.groupby(['name']).val.agg(['min', 'max'])
df_s_gb['equal'] =  (df_s_gb['min'] == df_s_gb['max'])
df_s_gb_not_equal = df_s_gb[df_s_gb['equal'] == False]
df_s_gb_not_equal

In [None]:
stop

In [None]:
df_s_gb_equal = df_s_gb[df_s_gb['equal'] == True]
df_s_gb_equal.to_csv("df_s_gb_equal.csv")
df_s_gb_equal_img_encoder = df_s_gb_equal.query('name.str.contains("img_encoder")')
df_s_gb_equal_img_encoder.to_csv("df_s_gb_equal_img_encoder.csv")

In [None]:
stop

In [None]:
# パラメータ更新を確認
import glob
import os

files = glob.glob("C:/Users/atom/git/mmbt-master/savedir/mmbt_model_run/*")
target_files =[]
for file in files:
    if 'batch' in file:
        target_files.append(file)
print(target_files)
cnt = 0
dfs = []
for file in target_files:
    path = file
    state_dict = load_pickle(path)
    keys_list = list(state_dict.keys())
    d = []
    for k in keys_list:
        p = state_dict[k]
        d.append({
            "epoch":os.path.basename(path)[8],
            "batch_cnt":os.path.basename(path)[6],
            "name": k,
            "dim": p.shape,
            "val": float(torch.sum(p)),
            "requires_grad": p.requires_grad
        })
    cnt += 1
    vec_df =  pd.DataFrame(d)
    dfs.append(vec_df)
df = pd.concat(dfs, ignore_index=True)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df_s = df.sort_values(['name', 'epoch'], ascending=[True, True])
df_s_gb = df_s.groupby(['name','epoch']).val.agg(['min', 'max'])
df_s_gb['equal'] =  (df_s_gb['min'] == df_s_gb['max'])
df_s_gb_not_equal = df_s_gb[df_s_gb['equal'] == False]
df_s_gb_not_equal

In [None]:
import numpy as np

def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
def get_mmbt_vec(i_epoch):

    input_txt = load_pickle('./tmp/'+str(i_epoch)+'/'+'input_txt.pkl')
    txt_embed_out = load_pickle('./tmp/'+str(i_epoch)+'/'+'txt_embed_out.pkl')
    encoder_input = load_pickle('./tmp/'+str(i_epoch)+'/'+'encoder_input.pkl')
    encoded_layers = load_pickle('./tmp/'+str(i_epoch)+'/'+'encoded_layers.pkl')
    pooler = load_pickle('./tmp/'+str(i_epoch)+'/'+'self.pooler(encoded_layers[-1]).pkl')
    return input_txt,txt_embed_out,encoder_input,encoded_layers,pooler

i_epoch = 0
input_txt_0,txt_embed_out_0,encoder_input_0,encoded_layers_0,pooler_0 = get_mmbt_vec(i_epoch)

i_epoch = 1
input_txt_1,txt_embed_out_1,encoder_input_1,encoded_layers_1,pooler_1 = get_mmbt_vec(i_epoch)

i_epoch = 2
input_txt_2,txt_embed_out_2,encoder_input_2,encoded_layers_2,pooler_2 = get_mmbt_vec(i_epoch)

i_epoch = 3
input_txt_3,txt_embed_out_3,encoder_input_3,encoded_layers_3,pooler_3 = get_mmbt_vec(i_epoch)

i_epoch = 4
input_txt_4,txt_embed_out_4,encoder_input_4,encoded_layers_4,pooler_4 = get_mmbt_vec(i_epoch)

i_epoch = 5
input_txt_5,txt_embed_out_5,encoder_input_5,encoded_layers_5,pooler_5 = get_mmbt_vec(i_epoch)

i_epoch = 6
input_txt_6,txt_embed_out_6,encoder_input_6,encoded_layers_6,pooler_6 = get_mmbt_vec(i_epoch)

i_epoch = 7
input_txt_7,txt_embed_out_7,encoder_input_7,encoded_layers_7,pooler_7 = get_mmbt_vec(i_epoch)

In [None]:
print(encoded_layers_7)

In [None]:
args.vocab.stoi["[CLS]"]
args.vocab.itos[3244]#6
args.vocab.itos[4524]#7　bag
args.vocab.itos[7829]# shipping

In [None]:
# 4524：bag
print(input_txt_0[0][4])# bagの単語id
print(input_txt_0[2][10])# bagの単語id
# print(txt_embed_out_1[1][3]) # bagの768次元ベクトル
v1 = txt_embed_out_0[2][10].to('cpu').detach().numpy().copy()
sum(v1)

In [None]:
# コサイン類似度を調べる bag
def get_i_epoch_vec_list():
    i_epoch = 0
    i_epoch_vec_list = []
    for i in range(8):
        epoch_num = i
        txt_embed_out = load_pickle('./tmp/'+str(epoch_num)+'/'+'txt_embed_out.pkl')
        v1 = txt_embed_out[0][4].to('cpu').detach().numpy().copy()
        i_epoch_vec = [epoch_num,'bag',sum(v1)]
        i_epoch_vec_list.append(i_epoch_vec)
    return i_epoch_vec_list
i_epoch_vec_list = get_i_epoch_vec_list()
df_i_epoch_vec = pd.DataFrame(i_epoch_vec_list,
                  columns=['i_epoch','token_name', 'sum_txt_embed_out' ])

df_i_epoch_vec.to_csv('df_i_epoch_vec_step_false.csv')
df_i_epoch_vec
# v2 = txt_embed_out_7[2][10].to('cpu').detach().numpy().copy()
# print(('sum(v1)',sum(v1)),('sum(v2)',sum(v2)))

In [None]:
# 2つ文章に登場するbagの単語類似度を算出
v1 = encoder_input_1[1][10].to('cpu').detach().numpy().copy() # encoder_input_1[3][1:3]はRGBだから画像部分のベクトル変化する
v2 = encoder_input_5[4][2].to('cpu').detach().numpy().copy()
# v2 = encoder_input_5[3][1].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:
# 2つ文章に登場するbagの単語類似度を算出
v1 = txt_embed_out_0[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_0[4][2].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:
# コサイン類似度を調べる bag
v1 = txt_embed_out_7[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_7[4][2].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:
# コサイン類似度を調べる bag
v1 = txt_embed_out_0[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_1[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_2[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_3[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_4[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_5[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_6[1][10].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_7[1][10].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:

# コサイン類似度を調べる bag
v1 = encoded_layers_0[1][10].to('cpu').detach().numpy().copy()
v2 = encoded_layers_1[1][10].to('cpu').detach().numpy().copy()
v2 = encoded_layers_2[1][10].to('cpu').detach().numpy().copy()
v2 = encoded_layers_3[1][10].to('cpu').detach().numpy().copy()
v2 = encoded_layers_4[1][10].to('cpu').detach().numpy().copy()
v2 = encoded_layers_5[1][10].to('cpu').detach().numpy().copy()
# v2 = encoded_layers_6[1][10].to('cpu').detach().numpy().copy()
# v2 = encoded_layers_7[1][10].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:
# コサイン類似度を調べる
v1 = txt_embed_out_1[1][3].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_1[3][4].to('cpu').detach().numpy().copy()
print(cos_sim(v1, v2))

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def get_2d_vector(vec):
    df = pd.DataFrame(vec)
    model_tsne = TSNE(n_components=2, perplexity=2)
    vecs_list = model_tsne.fit_transform(df)
    df_add_vecs_list =  pd.DataFrame(vecs_list)
    return df_add_vecs_list,vecs_list
# v1 = encoder_input[0][7].to('cpu').detach().numpy().copy()
# v2 = encoded_layers[0][7].to('cpu').detach().numpy().copy()

v1 = txt_embed_out_1[4][0].to('cpu').detach().numpy().copy()
v2 = txt_embed_out_2[4][0].to('cpu').detach().numpy().copy()

print(cos_sim(v1, v2))
df_add_vecs_list_v1,v1_2d = get_2d_vector(v1)
df_add_vecs_list_v2,v2_2d = get_2d_vector(v2)
df_add_vecs_list_v1['name'] = 'encoder_input[0][7]'
df_add_vecs_list_v2['name'] = 'encoded_layers[0][7]'
df_add_vecs_list_cat = pd.concat([df_add_vecs_list_v1, df_add_vecs_list_v2])

import seaborn as sns
import matplotlib.pyplot as plt
 
sns.set(font="Hiragino Maru Gothic Pro")
plt.figure(figsize=(8, 8))
plt.title('bag_vector')
sns.scatterplot(data=df_add_vecs_list_cat, x=0, y=1, hue='name')
# for i,(x_name,y_name) in enumerate(zip(X,Y)):
#     plt.annotate(df.index[i],(x_name,y_name))
plt.show()

In [None]:
import os

for i in range(32):
    try:
        new_dir_path = './tmp/'+str(i)
        os.mkdir(new_dir_path)
    except:
        continue

In [None]:
stop atomアカウント

# 本番データで予測

In [None]:
# 使用データセットを定義
target_dataset = 'ebay_20210822_msg_prediction'

# データ読み込み
df = pd.read_table('./datasets/'+target_dataset+'/data.tsv')

# カラム追加
df_addcol = get_df_addcol(df)

# 推論
model_name = 'model_msg_cnt_flag_variables3_ep20' # MSGの有無を予測するモデル
df_preprocessed_msg = get_preds_score(df,model_name)

# 推論
model_name = 'model_sold_flag_variables3_ep20' # soldの有無を予測するモデル
df_preprocessed_sold = get_preds_score(df,model_name)

# 推論結果を元データに追加
df_preprocessed_msg = df_preprocessed_msg.rename(columns={'preds_score': 'msg_preds_score'})
df_preprocessed_msg = df_preprocessed_msg.loc[:,['img','msg_preds_score']]
df_add_msg_preds_score =pd.merge(df_addcol, df_preprocessed_msg, how='left',on='img')

# 推論結果を元データに追加
df_preprocessed_sold = df_preprocessed_sold.rename(columns={'preds_score': 'sold_preds_score'})
df_preprocessed_sold = df_preprocessed_sold.loc[:,['img','sold_preds_score']]
df_add_msg_sold_preds_score = pd.merge(df_add_msg_preds_score, df_preprocessed_sold, how='left',on='img')
df_add_msg_sold_preds_score.head(1)

# 結果を保存
df_add_msg_sold_preds_score.to_csv('./tmp/ebay_20210822_msg_prediction/add_preds_score_data.tsv', sep="\t", index=False)

# データをロード
df_add_msg_sold_preds_score = pd.read_table('./tmp/ebay_20210822_msg_prediction/add_preds_score_data.tsv')

# 推論結果をいい感じに整形
df_add_msg_sold_preds_score['msg_first_pred_score'] = df_add_msg_sold_preds_score.msg_preds_score.apply(lambda x:get_first_pred_score(x))
df_add_msg_sold_preds_score['sold_first_pred_score'] = df_add_msg_sold_preds_score.sold_preds_score.apply(lambda x:get_first_pred_score(x))
df_add_msg_sold_preds_score['msg_pred'] = df_add_msg_sold_preds_score.msg_preds_score.apply(lambda x:get_pred(x))
df_add_msg_sold_preds_score['sold_pred'] = df_add_msg_sold_preds_score.sold_preds_score.apply(lambda x:get_pred(x))

In [None]:
from mmbt.models import get_model
from mmbt.utils.utils import *
target_dataset = 'ebay_20210822_msg_prediction'
model_name = '20210816_msgcntflag_no1_title_img_stdprice_ep20'
model = get_model(args)
load_checkpoint(model, './savedir/'+target_dataset+'/mmbt_model_run/'+model_name+'.pt')

In [None]:
param_optimizer = [('a','b'),('c','d'),('e','f')]
no_decay = ['e', 'b']
optimizer_grouped_parameters =[
    {'params':[p for n, p in param_optimizer if not any((nd in n for nd in no_decay))], 
    'weight_decay':0.01},
    {'params':[p for n, p in param_optimizer if any((nd in n for nd in no_decay))],'weight_decay':0.0}]
optimizer_grouped_parameters

In [None]:
[p for n, p in param_optimizer if not any((nd in n for nd in no_decay))]

In [None]:
df_ = pd.DataFrame(columns=df.columns)

# 価格をカテゴリごとに標準化
class_ = df[['ctg']]
class_names = df.groupby('ctg').groups.keys()
data = []
for name in class_names:
    df_tmp = df[(df['ctg'] == name)].drop(columns=['ctg'])
    df_tmp['std_price'] = (df_tmp['price_x'] - df_tmp['price_x'].mean())/df_tmp['price_x'].std()    
    df_ = pd.concat([df_, df_tmp])
df_['ctg'] = class_

# 前処理
df_tmp = df_
# カテゴリー分類の際は、IDを指定する
if target_col == 'ctg':
    df_tmp = df_tmp.astype({'ctg': str})
    df_tmp = df_tmp[df_tmp['ctg'].isin(['69528','183454','180506','169291','15687'])]
df_tmp["fullpath"] = df_tmp.itemid.apply(lambda x: "images/" + str(x) + ".jpg")
df_tmp['error_imgfile_flg'] = df_tmp['fullpath'].swifter.apply(lambda x:error_imagefile_serch(x))
df_tmp = df_tmp[df_tmp['error_imgfile_flg']==0]
df_tmp["label"] = df_tmp[target_col]
df_tmp['date'] = df_tmp['dt'].apply(lambda x: x[5:7])
df_tmp['img'] = './datasets/' + df_tmp['fullpath']
df_tmp["text"] = df_tmp['title'].str.lower()
df_tmp["text"] = df_tmp['text'].apply(lambda x: " ".join(re.findall(r"[ぁ-んァ-ン一-龥ー'\da-zA-Z\-]+", x)))

df_preprocessed = df_tmp.loc[:,['label','text','img','date','std_price']].reset_index(drop=True)
        

In [None]:
df_train_dev,df_test = train_test_split(df_preprocessed, random_state=0,stratify=df_preprocessed.label)
# 最小データ数でダウンサンプリング
minimum_num = df_train_dev['label'].value_counts().min()
# ラベル毎に最小データ数だけサンプリング
dfs = [d.sample(minimum_num, random_state=0) for name, d in df_train_dev.groupby('label')]
# 結合。ラベル順に並んでいるのでshuffleする
under_resampled_df = pd.concat(dfs).sample(frac=1, random_state=0)
df_train_dev = under_resampled_df
df_train, df_dev = train_test_split(df_train_dev,stratify=df_train_dev.label)
data_dict = {"train": df_train, "val": df_dev,'test':df_test}


In [None]:
df_.groupby('ctg').count().sort_values('itemid', ascending=False)

In [None]:
class_

In [None]:
from mmbt.models import get_model
from mmbt.utils.utils import *
model = get_model(args)
load_checkpoint(model, './savedir/ebay_codes_20210815_message_prediction/mmbt_model_run/model_best.pt')


In [None]:
# コサイン類似度を調べる bag
target_layers = 'encoded_layers'
def get_i_epoch_vec_list():
    i_epoch = 0
    i_epoch_vec_list = []
    for i in range(8):
        epoch_num = i
        encoded_layers = load_pickle('./tmp/'+str(epoch_num)+'/'+target_layers+'.pkl')
        v1 = encoded_layers[0][4].to('cpu').detach().numpy().copy()
        i_epoch_vec = [epoch_num,'bag',sum(v1)]
        i_epoch_vec_list.append(i_epoch_vec)
    return i_epoch_vec_list
i_epoch_vec_list = get_i_epoch_vec_list()
df_i_epoch_vec = pd.DataFrame(i_epoch_vec_list,
                  columns=['i_epoch','token_name', 'sum_txt_embed_out' ])
df_i_epoch_vec
# v2 = txt_embed_out_7[2][10].to('cpu').detach().numpy().copy()
# print(('sum(v1)',sum(v1)),('sum(v2)',sum(v2)))

In [None]:
# Grad-CAM
from gradcam.utils import visualize_cam
from gradcam import GradCAM, GradCAMpp

target_layer = model.module.features
gradcam = GradCAM(model, target_layer)
gradcam_pp = GradCAMpp(model, target_layer)


production_loader = dataloaders_dict['production']
model = get_model(args)
load_checkpoint(model, './savedir/mmbt_model_run/model_best.pt')
criterion = get_criterion(args)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# ネットワークをGPUへ
model.to(device)

# 評価
# preds_score = model_production(production_loader, model)


In [None]:
# 本番データでの予測の際に使用する関数
# 予測値の確率を出力

    target_dataset = 'ebay_codes_20210815_message_prediction' # データセット名
    model = 'mmbt'
    bert_model = 'bert-base-uncased' # bertモデル　bert-large-uncased　bert-base-uncased
    image_model = 'resnet152' # imageモデル
    batch_sz = '32' # 不要
    max_epochs = '10' # 不要
    target_col = 'none' # 不要
    
    # データ読み込み
    df = df.loc[:][['mk_title','url','selling_price','category_id']]
    df['title'] = df['mk_title']

    # 前処理
    categoryid = 'category_id'
    price = 'selling_price'
    df_preprocessed = get_preprocessed_data_for_production(df,categoryid,price)
    # データセット作成＆保存
    data_dict = get_dataset_for_production(df_preprocessed)
    # モデルの各種設定
    args = train.get_args(target_col,bert_model,batch_sz,max_epochs,model)
    dataloaders_dict = get_data_loaders_for_production(args,data_dict)

    # 評価
    preds_score = train.production(args,model_name,dataloaders_dict)
    df_preprocessed['preds_score'] = preds_score
    df_preprocessed = df_preprocessed.reset_index(drop=True)
    df_preprocessed


In [None]:
target_dataset = 'ebay_codes_20210815_message_prediction' # データセット名

# データ読み込み
df = pd.read_table('./datasets/'+target_dataset+'/data.tsv')

# カラム追加
df_addcol = get_df_addcol(df)

# 推論
model_name = 'model_best' # MSGの有無を予測するモデル


model = 'mmbt'
bert_model = 'bert-base-uncased' # bertモデル　bert-large-uncased　bert-base-uncased
image_model = 'resnet152' # imageモデル
batch_sz = '32' # 不要
max_epochs = '10' # 不要
target_col = 'none' # 不要

# データ読み込み
df = df.loc[:][['mk_title','url','selling_price','category_id']]
df['title'] = df['mk_title']

# 前処理
categoryid = 'category_id'
price = 'selling_price'
df_preprocessed = get_preprocessed_data_for_production(df,target_dataset,categoryid,price)
# データセット作成＆保存
data_dict = get_dataset_for_production(df_preprocessed)
# モデルの各種設定
args = train.get_args(target_col,bert_model,batch_sz,max_epochs,model)
dataloaders_dict = get_data_loaders_for_production(args,data_dict)

# 評価
preds_score = train.production(args,model_name,dataloaders_dict)
df_preprocessed['preds_score'] = preds_score
df_preprocessed = df_preprocessed.reset_index(drop=True)



In [None]:
# Basic Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch Modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets
import torchvision.transforms as transforms
from torch.utils.data.dataset import Subset
import torchvision.models as models
import torch.optim as optim
from torchvision.utils import make_grid, save_image

# Grad-CAM
from gradcam.utils import visualize_cam
from gradcam import GradCAM, GradCAMpp
from mmbt.models import get_model
from mmbt.utils.utils import *
class Resnet(nn.Module):
    def __init__(self):
        super(Resnet, self).__init__()
        self.img_encoder = model.enc.img_encoder
        self.img_embeddings = model.enc.img_embeddings
        self.num_image_embeds = 3
    def forword(self, img_input):
        img_tok = (
            torch.LongTensor(32, self.num_image_embeds)
            .fill_(0)
            .cuda()
        )
        img = self.img_encoder(input_img)  # BxNx3x224x224 -> BxNx2048
        img_embed_out = self.img_embeddings(img, img_tok)
        return img_embed_out

device = torch.device("cuda:0" if torch.cuda.is_available()  else "cpu")

# model.fc = nn.Linear(2048,5)
# model = torch.nn.DataParallel(model).to(device)
model.eval()

def get_criterion(args):

    criterion = nn.CrossEntropyLoss()
    return criterion

# Grad-CAM


# target_layer = model.enc.img_encoder
# resnet = Resnet()
model_resnet = models.resnet152(pretrained=True)
target_layer_resnet = model_resnet.layer4
# modules = list(model.children())[:-2]
# model = nn.Sequential(*modules)

gradcam = GradCAM(model_resnet, target_layer_resnet)
gradcam_pp = GradCAMpp(model_resnet, target_layer_resnet)

images = []
# img = Image.open('./datasets/ebay_codes_20210815_message_prediction/images/324727872736.jpg')
img = Image.open('./datasets/ebay_codes_20210815_message_prediction/images/255066960205.jpg')
torch_img = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])(img)
normed_torch_img = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(torch_img)[None]

# normed_torch_img = normed_torch_img.unsqueeze(0)
mask, _ = gradcam(normed_torch_img)
heatmap, result = visualize_cam(mask, torch_img)

mask_pp, _ = gradcam_pp(normed_torch_img)
heatmap_pp, result_pp = visualize_cam(mask_pp, torch_img)

images.extend([torch_img.cpu(), heatmap, heatmap_pp, result, result_pp])
grid_image = make_grid(images, nrow=5)

# 結果の表示
transforms.ToPILImage()(grid_image)

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
#貼り付け

plt.imshow(Image.fromarray(np.array(torch_img)))
#表示
plt.show()

# MMBT

In [None]:
# Basic Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch Modules
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms, datasets
import torchvision.transforms as transforms
from torch.utils.data.dataset import Subset
import torchvision.models as models
import torch.optim as optim
from torchvision.utils import make_grid, save_image

# Grad-CAM
from gradcam.utils import visualize_cam
from gradcam import GradCAM, GradCAMpp
from mmbt.models import get_model
from mmbt.utils.utils import *

device = torch.device("cuda:0" if torch.cuda.is_available()  else "cpu")


# model = torch.nn.DataParallel(model).to(device)

def get_criterion(args):

    criterion = nn.CrossEntropyLoss()
    return criterion

# Grad-CAM
class Resnet(nn.Module):
    def __init__(self):
        super(Resnet, self).__init__()
        self.img_encoder = model.enc.img_encoder
        self.img_embeddings = model.enc.img_embeddings
        self.num_image_embeds = 5
    def forward(self, img_input):
        
        img_tok = (
            torch.LongTensor(1, self.num_image_embeds)
            .fill_(0).to(device)
        )
        img = self.img_encoder(img_input).to(device) # BxNx3x224x224 -> BxNx2048
        img = self.img_embeddings(img, img_tok).to(device)
        return torch.flatten(img, start_dim=1).to(device)
       
class Resnet(nn.Module):
    def __init__(self):
        super(Resnet, self).__init__()
        self.img_encoder = model.enc.img_encoder
        self.img_embeddings = model.enc.img_embeddings
        
    def forward(self, img_input):

        img = self.img_encoder(img_input)  # BxNx3x224x224 -> BxNx2048
        return torch.flatten(img, start_dim=1)
    
model = get_model(args).to(device)
load_checkpoint(model, './savedir/mmbt_model_run/model_best.pt')
model.eval()

target_layer = model.enc.img_encoder.model[4]
# resnet = torchvision.models.resnet152(pretrained=True)
# model.fc = nn.Linear(2048,5)

resnet = Resnet()
gradcam = GradCAM(resnet, target_layer)
gradcam_pp = GradCAMpp(resnet, target_layer)

images = []
# img = Image.open('./datasets/ebay_codes_20210815_message_prediction/images/324727872736.jpg')
img = Image.open('./datasets/ebay_codes_20210815_message_prediction/images/255066960205.jpg')


# torch_img = transforms.Compose([
#     transforms.Resize((224, 224)),
#     transforms.ToTensor()
# ])(img)

# torch_img = transforms.Compose(
#         [
#             transforms.Resize(256),
#             transforms.CenterCrop(224),
#             transforms.ToTensor(),
#             transforms.Normalize(
#                 mean=[0.46777044, 0.44531429, 0.40661017],
#                 std=[0.12221994, 0.12145835, 0.14380469],
#             ),
#         ]
#     )(img).to(device)

torch_img = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])(img)
normed_torch_img = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(torch_img)[None].to(device)

# normed_torch_img = torch_img.unsqueeze(0)

# normed_torch_img = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(torch_img)[None]

# normed_torch_img = normed_torch_img.unsqueeze(0)
mask, _ = gradcam(normed_torch_img)
heatmap, result = visualize_cam(mask, torch_img)

mask_pp, _ = gradcam_pp(normed_torch_img)
heatmap_pp, result_pp = visualize_cam(mask_pp, torch_img)

images.extend([torch_img.cpu(), heatmap, heatmap_pp, result, result_pp])
grid_image = make_grid(images, nrow=5)

# 結果の表示
transforms.ToPILImage()(grid_image)