In [39]:
# Install the third-party packages imported by later cells:
# `transformers` (BERT tokenizer/model) and `beautifulsoup4` (the `bs4` HTML parser).
# NOTE(review): versions are not pinned, so a re-run may pull newer releases.
!pip install transformers
!pip install beautifulsoup4



In [2]:
# Mount Google Drive at /content/drive; later cells read CSVs and model weights
# from paths under "drive/MyDrive/".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# ブックマークサイトからデータを集める
import requests, datetime, json, time
from urllib.parse import urlparse
from bs4 import BeautifulSoup
def collect_entries(html):
    """Extract bookmark entries from the HTML of one hot-entry listing page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of dicts, one per ".entrylist-contents" element that has both
        an entry link and a bookmark-count element, with the keys:
        "b_count" (text of the bookmark counter), "title" (the link's
        ``title`` attribute, may be None), "desc" (description text, None when
        the element is absent) and "domain" (netloc of the link's ``href``,
        "" when the link has no ``href``).
    """
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(html, "html.parser")

    entries = []
    for elem in soup.select(".entrylist-contents"):
        link = elem.select_one("a.js-keyboard-openable")
        counter = elem.select_one("a.js-keyboard-entry-page-openable span")
        if link is None or counter is None:
            # An entry without a link or a counter cannot be used downstream;
            # skip it instead of aborting the whole scrape with an IndexError.
            continue

        description = elem.select_one("p.entrylist-contents-description")
        href = link.get("href")
        entries.append({
            "b_count": counter.text,
            "title": link.get("title"),
            "desc": description.text if description is not None else None,
            "domain": urlparse(href).netloc if href else "",
        })
    return entries

def collect_hatebu_articles(genre="all", limit=1000):
    """Scrape daily hot-entry pages for one genre and save them as a CSV.

    Starting three days before the current time and walking one day further
    back per iteration, fetches ``limit`` daily listing pages, parses each with
    ``collect_entries`` and writes all rows to
    "all_hatebu_articles_<genre>.csv" in the current directory.

    Args:
        genre: Genre path segment inserted into the hot-entry URL.
        limit: Number of days (pages) to fetch.

    Returns:
        The list of entry dicts. Each dict carries the keys produced by
        ``collect_entries`` plus "date" (int, YYYYMMDD of the listing page) and
        "timestamp" (the Unix time used to derive that date).

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            request exceeds the timeout.
    """
    # Imported locally because the notebook only imports pandas in a later
    # cell; without this the function raises NameError on a fresh kernel.
    import pandas as pd

    base_url = "https://b.hatena.ne.jp/hotentry/{}/".format(genre)
    # Begin two days back; the loop steps back one more day before each
    # request, so the first page fetched is three days old.
    now = int(time.time()) - 2 * 86400

    entries = []
    for _ in range(limit):
        now -= 86400
        date_str = datetime.datetime.fromtimestamp(now).strftime("%Y%m%d")
        print(date_str)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(base_url + date_str, timeout=30)
        day_entries = collect_entries(res.text)
        for entry in day_entries:
            # The loading cell reads 'date' and 'timestamp' columns from this
            # CSV and compares 'date' against integers such as 20210300.
            entry["date"] = int(date_str)
            # NOTE(review): the loader only requires that this column exists;
            # the listing day's Unix time is used here — confirm it matches
            # the meaning of 'timestamp' in previously saved CSVs.
            entry["timestamp"] = now
        entries.extend(day_entries)
    pd.DataFrame(entries).to_csv("all_hatebu_articles_{}.csv".format(genre), header=True, index=True)
    return entries

genres = ["all", "general", "social", "economics", "life", "knowledge", "it", "fun", "entertainment", "game"]
# Only the first genre is scraped (10 days of pages); widen the slice to
# crawl the remaining genres as well.
for genre in genres[:1]:
    collect_hatebu_articles(genre, limit=10)

20210326
20210325
20210324
20210323
20210322
20210321
20210320
20210319
20210318
20210317


In [17]:
# Load the scraped articles from the per-genre CSV files
import pandas as pd
import numpy  as np

genres = ["general", "social", "economics", "life", "knowledge", "it", "fun", "entertainment", "game"]
# genres = ["all"]

data_arr = []
for genre in genres:
    file_name = "drive/MyDrive/all_hatebu_articles_{}.csv".format(genre)
    genre_df = pd.read_csv(file_name, index_col=0, header=0, encoding="utf-8")
    # Drop rows lacking a description or a title. dropna() catches every NaN,
    # whereas an `is np.nan` identity test only matches the np.nan singleton.
    usable_rows = genre_df.dropna(subset=["desc", "title"])
    for _, row in usable_rows.iterrows():
        data_arr.append({
            "b_count": row['b_count'],
            "date": row['date'],
            "timestamp": row['timestamp'],
            "domain": row['domain'],
            # Model input text: genre, domain, title and description joined by spaces.
            "text": "{} {} {} {}".format(genre, row['domain'], row['title'], row['desc']),
        })

df = pd.DataFrame(data_arr)
# Regression target: natural log of the bookmark count
df['b_count_log'] = np.log(df['b_count'])


In [19]:
# Split chronologically on the integer YYYYMMDD `date` column:
#   train: 20201000 < date < 20210200
#   valid: 20210200 < date < 20210300
#   test : date > 20210300
# Only rows with b_count > 10 are kept, reflecting the intended use where
# predictions are made for entries that already passed that bookmark count.
# Each split is then shuffled with a fixed seed.
enough_bookmarks = df.b_count > 10
in_test = df.date > 20210300
in_valid = (df.date < 20210300) & (df.date > 20210200)
in_train = (df.date < 20210200) & (df.date > 20201000)

X_test = df.loc[in_test & enough_bookmarks].sample(frac=1.0, random_state=42)
X_valid = df.loc[in_valid & enough_bookmarks].sample(frac=1.0, random_state=42)
X_train = df.loc[in_train & enough_bookmarks].sample(frac=1.0, random_state=42)
X_train.shape, X_valid.shape, X_test.shape

((34919, 6), (7954, 6), (6880, 6))

In [4]:
import os
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Guard against OOM: turn on memory growth for every visible GPU and print
# the resulting setting for each device.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("Not enough GPU hardware devices available")
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)
    growth_enabled = tf.config.experimental.get_memory_growth(device)
    print('{} memory growth: {}'.format(device, growth_enabled))

# Upper bound on ids per encoded sentence ([CLS] included); read by bert_encode.
max_seq_length = 256

# Name of the pretrained checkpoint the tokenizer vocabulary is loaded from.
bert_folder = "cl-tohoku/bert-base-japanese"
# NOTE(review): this checkpoint may be intended for BertJapaneseTokenizer rather
# than the generic BertTokenizer — confirm before changing, since the saved
# weights were trained on ids produced by this tokenizer.
tokenizer = BertTokenizer.from_pretrained(bert_folder)

# Tokenize a sentence with the tokenizer
def encode_sentence(s, tokenizer):
    """Return the token ids of ``str(s)`` followed by the id of '[SEP]'.

    Args:
        s: Value to encode; it is converted with ``str`` first.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_ids`` returns for the token
        list with '[SEP]' appended.
    """
    pieces = [*tokenizer.tokenize(str(s)), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

# Build input_ids, attention_mask and token_type_ids
def bert_encode(sentences, tokenizer):
    """Encode sentences into padded BERT input tensors.

    Each sentence is encoded as ``[CLS] tokens... [SEP]`` and capped at the
    module-level ``max_seq_length`` ids. A sentence that is too long is cut so
    that its last id is still [SEP]; plain slicing would drop the separator.

    Args:
        sentences: Iterable of values to encode (each is passed to
            ``encode_sentence``).
        tokenizer: Tokenizer providing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        A dict with the dense tensors 'input_ids', 'token_type_ids' (all
        zeros) and 'attention_mask' (1 for real tokens, 0 for padding). Rows
        are padded with 0 to the length of the longest encoded sentence.
    """
    sep_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
    # One of the max_seq_length slots is taken by the leading [CLS].
    body_limit = max_seq_length - 1

    def _encode_with_sep(s):
        ids = encode_sentence(s, tokenizer)
        if len(ids) > body_limit:
            # Keep the separator as the final id of a truncated sentence.
            ids = list(ids[:body_limit - 1]) + [sep_id]
        return ids

    tokenized_sentences = tf.ragged.constant(
        [_encode_with_sep(s) for s in sentences])

    # One [CLS] id per sentence, prepended along the token axis.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*tokenized_sentences.shape[0]

    input_word_ids = tf.concat([cls, tokenized_sentences], axis=-1)
    # Ones over the ragged ids; to_tensor() then fills the padding with 0.
    attention_mask = tf.ones_like(input_word_ids).to_tensor()
    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(tokenized_sentences)
    token_type_ids = tf.concat(
        [type_cls, type_s1], axis=-1).to_tensor()

    inputs = {
        'input_ids': input_word_ids.to_tensor(),
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask}

    return inputs


PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') memory growth: True


In [20]:
# Encode the input text of each split and build the matching label arrays.
train_dict, valid_dict, test_dict = (
    bert_encode(split['text'].values, tokenizer)
    for split in (X_train, X_valid, X_test)
)

# Labels are the log bookmark counts, stored as float32.
train_log_labels, valid_log_labels, test_log_labels = (
    np.array(split['b_count_log'], dtype=np.float32)
    for split in (X_train, X_valid, X_test)
)

batch_size = 20

# Only the training set is shuffled (buffer of 10000) before batching;
# validation and test keep the row order of X_valid / X_test.
train_log_dataset_batched = (
    tf.data.Dataset.from_tensor_slices((train_dict, train_log_labels))
    .shuffle(10000)
    .batch(batch_size)
)
valid_log_dataset_batched = (
    tf.data.Dataset.from_tensor_slices((valid_dict, valid_log_labels))
    .batch(batch_size)
)
test_log_dataset_batched = (
    tf.data.Dataset.from_tensor_slices((test_dict, test_log_labels))
    .batch(batch_size)
)


In [6]:
# Build the model and train it
from transformers import TFBertForSequenceClassification

bert_folder = "cl-tohoku/bert-base-japanese"

# num_labels=1 gives a single output, which is fitted with mean squared
# error against the log bookmark counts in the training dataset.
reg_model = TFBertForSequenceClassification.from_pretrained(
    bert_folder,
    from_pt=True,
    num_labels=1,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.MeanSquaredError()
reg_model.compile(loss=loss, optimizer=optimizer)

hist = reg_model.fit(
    train_log_dataset_batched,
    epochs=2,
    validation_data=valid_log_dataset_batched,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/2


In [12]:
# Write the fine-tuned weights to an HDF5 file in the current working directory.
# NOTE(review): the reload cell further down reads
# "drive/MyDrive/hatebu_regressor.h5", not this path — confirm the file is
# copied/renamed manually between the two steps.
reg_model.save_weights("pretrained_weights.h5")

In [33]:
# Check how well the model does on the test data (first batch only)
iterator = iter(test_log_dataset_batched)
test_batch = next(iterator)  # (features dict, log-count labels) for one batch
# NOTE(review): predict() receives the whole (features, labels) tuple here;
# passing test_batch[0] would make the intent explicit — confirm the labels
# are not being consumed as a model input.
res = reg_model.predict(test_batch)
# Row range of X_test covered by this batch. The test dataset is batched
# without shuffling, so batch 0 corresponds to the first `batch_size` rows.
num1 = batch_size * 0
num2 = num1 + batch_size
# Labels are log counts, so exp() maps them back to bookmark counts.
# True values are rounded; predictions (taken from res[0]) are floored.
pd.DataFrame({"true" : np.round(np.exp(test_batch[1].numpy())).astype(int),
              "predict" : np.floor(np.exp(np.squeeze(res[0]))).astype(int),
              "text": X_test[["b_count", "text"]].iloc[num1:num2]["text"].values})


Unnamed: 0,true,predict,text
0,16,91,it uit-inside.linecorp.com ep.79 「フォントのなかの人」と見...
1,13,43,game news.denfaminicogamer.jp 発売中止となったホラーゲーム『還...
2,813,297,general www.ajimatics.com 「2乗してはじめて0になる数」とかあった...
3,123,168,life card-media.money.rakuten.co.jp ラクに＆ちょっといい...
4,18,42,"game twitter.com ㍃ﾊｶｾ on Twitter: ""それはマジでやめろ 「..."
5,304,74,game anond.hatelabo.jp ブスが活躍する漫画が見たい 最初非モテ(男)が...
6,83,213,general www.itmedia.co.jp アイリスオーヤマ初のノートPC登場　税別...
7,24,114,game togetter.com 「初心者には明確な答えを、上級者には多彩な選択肢を」ゲー...
8,823,409,general anond.hatelabo.jp 家賃保証会社の問題と解決策の検討 〜 天...
9,166,88,game anond.hatelabo.jp やっぱりウマ娘の記事がステマにしか見えん ht...


In [6]:
# From here on: rebuild the model and load previously saved weights
import tensorflow as tf
from transformers import TFBertForSequenceClassification

bert_folder = "cl-tohoku/bert-base-japanese"

# Recreate the same single-output architecture and compile settings as in
# training, then restore the fine-tuned weights from the HDF5 file on Drive.
reg_model = TFBertForSequenceClassification.from_pretrained(
    bert_folder,
    from_pt=True,
    num_labels=1,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.MeanSquaredError()
reg_model.compile(loss=loss, optimizer=optimizer)
reg_model.load_weights("drive/MyDrive/hatebu_regressor.h5")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
