## A demo for training the dual-attn model 

In [1]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
from tqdm import tqdm
from collections import Counter
from tokenizer import Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score
from dual_attn import DualAttnModel

data = pd.read_csv('listed_web_train.csv')
data = data.dropna().drop_duplicates()
cleaned_content = ['|'.join(list(set(i.split("|")))) for i in data.cleaned_content]
data['cleaned_content'] = cleaned_content
data

Unnamed: 0,hojin_id,company_name,urls,cleaned_content,hightechflag,company_label
0,7010401009665,株式会社広済堂ホールディングス,http://www.kosaido.co.jp/service/communication...,施工|られ|硬質|管理|集計|写真|人材|安全|Copyright|こうした|SERVICE...,0,others
1,7010401009665,株式会社広済堂ホールディングス,http://www.kosaido.co.jp/service/human/,国籍|誇り|就職|誇る|管理|就労|人材|安全|Workin|Copyright|人数|SE...,0,others
2,7010401009665,株式会社広済堂ホールディングス,http://www.kosaido.co.jp/csr/business/,着実|一貫|管理|人材|安全|Copyright|確立|及ぼす|SERVICE|オリジナル|...,0,others
3,7010401009665,株式会社広済堂ホールディングス,http://www.kosaido.co.jp/service/human/#overseas,国籍|誇り|就職|誇る|管理|就労|人材|安全|Workin|Copyright|人数|SE...,0,others
4,7010401009665,株式会社広済堂ホールディングス,http://www.kosaido.co.jp/service/communication...,施工|られ|硬質|管理|集計|写真|人材|安全|Copyright|こうした|SERVICE...,0,others
...,...,...,...,...,...,...
119663,6010401027577,本田技研工業株式会社,http://www.honda.co.jp/guide/?from=navi_drawer,向かう|エンジン|よく|ニュースルームトップ|会社|採用|リリース|モビリティサービス|そし...,1,automobile
119664,6010401027577,本田技研工業株式会社,http://www.honda.co.jp/topics/?from=navi_drawer,エンジン|よく|ニュースルームトップ|会社|採用|リリース|モビリティサービス|モータースポ...,1,automobile
119665,6010401027577,本田技研工業株式会社,http://www.honda.co.jp/recall/?from=navi_header,エンジン|よく|ニュースルームトップ|取扱|心配|会社|採用|リリース|モビリティサービス|...,1,automobile
119666,6010401027577,本田技研工業株式会社,http://www.honda.co.jp/event/?from=navi_drawer,られ|レース|モビリティサービス|人材|安全|モビリティーリゾート|参加|シリーズ|大人|変...,1,automobile


### Data preprocessing

In [2]:
hojin_ids = list(set(data.hojin_id))

sample_data = pd.DataFrame({})
max_page = 32

for hojin_id in hojin_ids:
    temp = data[data.hojin_id == hojin_id]
    if temp.shape[0] <= max_page:
        sample_data = pd.concat([sample_data, temp], ignore_index=True)
    else:
        sample_data = pd.concat([sample_data, temp.iloc[:max_page, :]], ignore_index=True)

num_words = [len(i.split('|')) for i in sample_data.cleaned_content]
sample_data['num_words'] = num_words
sample_data = sample_data[sample_data.num_words > 5]

#### Load in pretrained word vectors

In [3]:
with open('wv_dict_listed.pkl', 'rb') as fp:
    wv_dict = pickle.load(fp)

vectors = np.array(list(wv_dict.values()))
words = list(wv_dict.keys())
vectors_all = np.vstack([np.zeros(300), vectors])
vectors_all = torch.tensor(vectors_all)

#### Tokenization

In [4]:
hojin_ids = list(set(sample_data.hojin_id))
hojin_ids = [int(i) for i in hojin_ids]

tokenizer = Tokenizer(words, max_len=864, data = sample_data)

web_vectors = [tokenizer.encode_webportfolio(company_id=idx, max_page=max_page) for idx in tqdm(hojin_ids)]

seq_ids = torch.tensor([i[1] for i in web_vectors])
num_pages = torch.tensor([i[0] for i in web_vectors])
seq_lengths = tokenizer.max_len - torch.sum(seq_ids == 0, axis=-1)

labels = torch.tensor([tokenizer.get_label(i) for i in hojin_ids])
hojin_ids = torch.tensor(hojin_ids)

100%|██████████| 3829/3829 [00:12<00:00, 316.31it/s]


In [5]:
batch_size = 8
dataset = TensorDataset(seq_ids, num_pages, seq_lengths, labels, hojin_ids)
train_dataloader = DataLoader(dataset, batch_size=batch_size)

In [6]:
def evaluate(data_loader, model):
    model.eval()
    count = 0
    i = 0
    gold_labels = []
    pred_labels = []
    for ind, batch in tqdm(enumerate(data_loader), ncols=80):
                
        seq_ids, num_pages, seq_lengths, label_list, hojin = batch        
        outputs, _, _, _, _, _, _ = model(seq_ids.to(device), num_pages.to(device), seq_lengths.to(device))
        preds = (outputs>0.5).squeeze()

        gold_labels += list(label_list.cpu().numpy())
        pred_labels += list(preds.cpu().numpy())
        num = (preds.cpu() == label_list.bool()).sum().cpu().item()
        count += num
        i += 1
    accuracy = count*1.0/(i * batch_size)
    print('Evaluation accuracy:', accuracy)
    return accuracy

### Model

In [7]:
vectors = np.array(list(wv_dict.values()))
words = list(wv_dict.keys())
vectors_all = np.vstack([np.zeros(300), vectors])

torch.manual_seed(1218)
loss_function = nn.BCELoss()
scale = 10

model = DualAttnModel(vocab_size=len(words)+1, embed_dim=300, hidden_dim=300, 
                             label_dim=1, scale=10, page_scale=10)
model.load_vector(pretrained_vectors=vectors_all, trainable=True)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.02, weight_decay=0.0000, lr_decay=0.001)

embeddings loaded


### Training

In [8]:
best_model = None
best_acc = 0
for i in range(12):
    print('Epoch:', i)
    print('#'*20)
    total_loss = 0
    count = 0
    model.train()
    for ind, batch in tqdm(enumerate(train_dataloader), ncols=80):
        seq_ids, num_pages, seq_lengths, label_list, hojin = batch
        model.zero_grad()
        preds, _, _, _, _, _, _ = model(seq_ids.to(device), num_pages.to(device), seq_lengths.to(device))
        loss = loss_function(preds.squeeze(), label_list.to(device).float())
        loss.backward()
        optimizer.step()
        total_loss += loss.cpu().item()*len(seq_ids)
        count += len(seq_ids)
    print('total_loss:  ', total_loss/count)
    print('Training Accuracy')
    evaluate(train_dataloader, model)

0it [00:00, ?it/s]

Epoch: 0
####################


479it [00:25, 19.05it/s]
5it [00:00, 42.49it/s]

total_loss:   0.51237556945409
Training Accuracy


479it [00:10, 44.91it/s]
2it [00:00, 18.57it/s]

Evaluation accuracy: 0.8215031315240083
Epoch: 1
####################


479it [00:24, 19.32it/s]
5it [00:00, 42.85it/s]

total_loss:   0.41831847727999233
Training Accuracy


479it [00:10, 45.28it/s]
2it [00:00, 18.57it/s]

Evaluation accuracy: 0.8413361169102297
Epoch: 2
####################


479it [00:24, 19.22it/s]
5it [00:00, 42.85it/s]

total_loss:   0.3786822805386917
Training Accuracy


479it [00:10, 45.29it/s]
4it [00:00, 19.28it/s]

Evaluation accuracy: 0.8593423799582464
Epoch: 3
####################


479it [00:24, 19.35it/s]
5it [00:00, 44.37it/s]

total_loss:   0.3550381890983829
Training Accuracy


479it [00:10, 44.77it/s]
4it [00:00, 18.71it/s]

Evaluation accuracy: 0.8679540709812108
Epoch: 4
####################


479it [00:24, 19.35it/s]
5it [00:00, 44.37it/s]

total_loss:   0.3328929867281183
Training Accuracy


479it [00:10, 45.18it/s]
2it [00:00, 18.57it/s]

Evaluation accuracy: 0.877348643006263
Epoch: 5
####################


479it [00:24, 19.34it/s]
5it [00:00, 43.22it/s]

total_loss:   0.31649616631523814
Training Accuracy


479it [00:10, 45.12it/s]
2it [00:00, 18.57it/s]

Evaluation accuracy: 0.8825678496868476
Epoch: 6
####################


479it [00:24, 19.36it/s]
5it [00:00, 42.48it/s]

total_loss:   0.3023038289975983
Training Accuracy


479it [00:10, 45.17it/s]
2it [00:00, 18.68it/s]

Evaluation accuracy: 0.8875260960334029
Epoch: 7
####################


479it [00:24, 19.35it/s]
5it [00:00, 44.36it/s]

total_loss:   0.29181726925419466
Training Accuracy


479it [00:10, 45.35it/s]
2it [00:00, 18.57it/s]

Evaluation accuracy: 0.8903966597077244
Epoch: 8
####################


479it [00:24, 19.40it/s]
5it [00:00, 43.59it/s]

total_loss:   0.28064770434225905
Training Accuracy


479it [00:10, 45.37it/s]
2it [00:00, 18.92it/s]

Evaluation accuracy: 0.8950939457202505
Epoch: 9
####################


479it [00:24, 19.37it/s]
4it [00:00, 39.32it/s]

total_loss:   0.2722041474255889
Training Accuracy


479it [00:11, 43.42it/s]
4it [00:00, 19.26it/s]

Evaluation accuracy: 0.8974425887265136
Epoch: 10
####################


479it [00:24, 19.35it/s]
5it [00:00, 43.59it/s]

total_loss:   0.26562123869629184
Training Accuracy


479it [00:10, 45.39it/s]
2it [00:00, 18.74it/s]

Evaluation accuracy: 0.9010960334029228
Epoch: 11
####################


479it [00:24, 19.32it/s]
5it [00:00, 42.49it/s]

total_loss:   0.25670159771186185
Training Accuracy


479it [00:10, 45.21it/s]

Evaluation accuracy: 0.9034446764091858





### Keyword extraction

In [9]:
import pickle
import numpy as np

with open('wv_dict_listed.pkl', 'rb') as fp:
    wv_dict = pickle.load(fp)

import torch
vectors = np.array(list(wv_dict.values()))
words = list(wv_dict.keys())
vectors_all = np.vstack([np.zeros(300), vectors])
vectors_all = torch.tensor(vectors_all)

from tqdm import tqdm

hojin_ids = list(set(sample_data.hojin_id))
hojin_ids = [int(i) for i in hojin_ids]

tokenizer = Tokenizer(words, max_len=864, data = sample_data)

web_vectors = [tokenizer.encode_webportfolio(company_id=idx, max_page=max_page) for idx in tqdm(hojin_ids)]

seq_ids = torch.tensor([i[1] for i in web_vectors])
num_pages = torch.tensor([i[0] for i in web_vectors])
seq_lengths = tokenizer.max_len - torch.sum(seq_ids == 0, axis=-1)
labels = torch.tensor([tokenizer.get_label(i) for i in hojin_ids])
hojin_ids = torch.tensor(hojin_ids)

100%|██████████| 3829/3829 [00:12<00:00, 317.94it/s]


#### An example

In [10]:
model.eval()
t = 568
probs, senti_scores, attn, page_attn, final_vec, page_score, _ = model(seq_ids[t:(t+1)].to(device), num_pages[t:(t+1)].to(device), seq_lengths[t:(t+1)].to(device))
id_to_token = tokenizer.id_to_token
id_to_token[0] = '#'


sents = []
for i in range(num_pages[t:(t+1)].tolist()[0]):
    sents.append(' '.join([id_to_token[w] for w in seq_ids[t:(t+1)][0][i].tolist()]))
    
df = pd.DataFrame({'url': list(sample_data[sample_data.hojin_id == int(hojin_ids[t].tolist())].urls), 
                   'weight': page_attn.view(-1)[:num_pages[t:(t+1)].tolist()[0]].tolist(),
                   'page_score': page_score.view(-1)[page_score.view(-1) > -9999].tolist()
                   })
print(df)

import numpy as np
import matplotlib
import matplotlib.pyplot as plt


def colorize(words, color_array):
    cmap=matplotlib.cm.Blues
    template = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        #print(color)
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string


word_col = []
color_arrays = []
for i in list(df.index):
    
    sent = sents[i]
    attn1 = attn.squeeze()[i]
    
    words = sent.split()
    color_array = attn1.view(-1).tolist()
    
    word_col = word_col + [str(page_attn.view(-1).tolist()[i])]
    color_array = color_array + [1]
    
    word_col.extend(words)
    color_arrays.extend(color_array)
    
color_arrays = np.array(color_arrays)
s = colorize(word_col, color_arrays * 100)

with open('colorize_all.html', 'w', encoding="utf-8") as f:
    f.write(s)


                                                  url    weight  page_score
0                     http://www.hakuto.co.jp/irinfo/  0.000000    0.065175
1            http://www.hakuto.co.jp/irinfo/announce/  0.000000    0.082474
2                            http://www.g5-hakuto.jp/  0.072509    0.160616
3              http://www.hakuto.co.jp/profile/ethic/  0.000000    0.005596
4     http://www.hakuto.co.jp/news/2022/20221024.html  0.000000   -0.131699
5     http://www.hakuto.co.jp/news/2022/20221111.html  0.034292    0.122399
6         http://www.hakuto.co.jp/products/equipment/  0.244028    0.332135
7   http://www.hakuto.co.jp/profile/outline/strate...  0.000000    0.013428
8                    http://www.hakuto.co.jp/profile/  0.010578    0.098686
9        http://www.hakuto.co.jp/products/components/  0.000000    0.075061
10  http://www.hakuto.co.jp/profile/outline/embedd...  0.000000   -0.090604
11    http://www.hakuto.co.jp/news/2022/20221102.html  0.000000    0.021570
12    http:/

#### Batch extraction

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


def colorize(words, color_array):
    cmap=matplotlib.cm.Blues
    template = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        print(color)
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string

def select_keywords(attn_w, words, n=10):
    combo = [(i, j) for i, j in zip(attn_w, words) if i != 0]
    attn_w = np.array([i[0] for i in combo])
    words = [i[1] for i in combo]
    attn_diff = attn_w.max() - attn_w
    attn_thres = np.percentile(attn_diff, n)
    selected_keywords = [i for i, j in zip(words, attn_diff) if j <= attn_thres]
    selected_keywords_show = [0.6 if j <= attn_thres else 0 for i, j in zip(words, attn_diff)]
    return selected_keywords, selected_keywords_show

url_col = []
text_col = []
sents_selected = []
weight_col = []
hojin_id_col = []
hightechflag_col = []
model.eval()

final_vecs = []
web_vecs = []
page_attns = []
urls = []


for t in range(len(hojin_ids)):
    
    probs, senti_scores, attn, page_attn, final_vec, page_score, web_vec = model(seq_ids[t:(t+1)].to(device), num_pages[t:(t+1)].to(device), seq_lengths[t:(t+1)].to(device))
    id_to_token = tokenizer.id_to_token
    id_to_token[0] = '#'


    sents = []
    for i in range(num_pages[t:(t+1)].tolist()[0]):
        sents.append(' '.join([id_to_token[w] for w in seq_ids[t:(t+1)][0][i].tolist()]))

    final_vecs.append(final_vec.detach().cpu().numpy())
    df = pd.DataFrame({'url': list(sample_data[sample_data.hojin_id == int(hojin_ids[t].tolist())].urls), 
                   'hojin_id': list(sample_data[sample_data.hojin_id == int(hojin_ids[t].tolist())].hojin_id),
                   'hightechflag': list(sample_data[sample_data.hojin_id == int(hojin_ids[t].tolist())].hightechflag),
                   'text':list(sample_data[sample_data.hojin_id == int(hojin_ids[t].tolist())].cleaned_content),             
                   'weight': page_attn.view(-1)[:num_pages[t:(t+1)].tolist()[0]].tolist(),
                   'page_score': page_score.view(-1)[page_score.view(-1) > -9999].tolist(),
                   #'web_vecs': list(web_vec[0])[:num_pages[t:(t+1)].tolist()[0]]
                   })
    df = df[df.weight > 0].reset_index()


    hojin_id_col.extend(df.hojin_id)
    hightechflag_col.extend(df.hightechflag)
    url_col.extend(df.url)
    text_col.extend(df.text)
    weight_col.extend(df.weight)
    
    for i in list(df['index']):
        sent = sents[i]
        attn1 = attn.squeeze()[i]
    
        words = sent.split()
        color_array = np.array(attn1.view(-1).tolist())
    
        selected_keywords, selected_keywords_show = select_keywords(color_array, words, n=20)

        sents_selected.append([j for j, k in zip(words, selected_keywords_show) if k != 0])
    

sents_selected = ['|'.join(i) for i in sents_selected]
selected_df = pd.DataFrame({
    'hojin_id': hojin_id_col, 
    'url': url_col,
    'weight': weight_col,
    'text':text_col,
    'sents': sents_selected, 'hightechflag': hightechflag_col,
    })

In [None]:
selected_df.head()