In [None]:
# Install the extra packages this notebook needs into the running kernel's environment.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
%pip install sentence-transformers tqdm pyarrow

In [None]:
import warnings
warnings.filterwarnings("ignore")

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from transformers import BertTokenizerFast, TFBertModel, BertConfig
from sentence_transformers import util
from tqdm import tqdm

import tensorflow as tf
import pyarrow as pa
import pandas as pd
import numpy as np

In [None]:
# Hub identifier used for both the tokenizer files and the BERT configuration.
BASE = "hfl/chinese-macbert-base"
# Checkpoint file handed to TFBertModel.from_pretrained (presumably on a mounted Drive).
MODEL = "/content/drive/MyDrive/Colab Notebooks/bert_model_base.h5"

tokenizer = BertTokenizerFast.from_pretrained(BASE)

# NOTE(review): the recorded output of this cell reports that layer 'bert' was
# "not initialized from the model checkpoint" and is "newly initialized". If that
# still holds, the embeddings below come from untrained weights -- confirm that the
# layer names inside the .h5 file match what TFBertModel expects.
bert_config = BertConfig.from_pretrained(BASE)
model = TFBertModel.from_pretrained(MODEL, config=bert_config)

Downloading (…)okenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/bert_model_base.h5 and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Used below for both the executor count and the cap on total cores.
parallelism = 4

# Spark settings, applied to the builder in the order listed here.
spark_settings = {
    "spark.dynamicAllocation.enabled": False,
    "spark.driver.memory": "4g",
    "spark.cores.max": parallelism,
    "spark.executor.instances": parallelism,
    "spark.executor.cores": 1,
    "spark.executor.memory": "8g",
}

session_builder = SparkSession.builder.appName('Roberta Sentence Embedding')
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Build (or reuse) the session with Hive support enabled.
spark = session_builder.enableHiveSupport().getOrCreate()

# Keep the driver log quiet: only ERROR-level messages are shown.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [5]:
# Read the `ruten.items` table and collect all of it to the driver as a pandas DataFrame.
items_table = spark.read.table("ruten.items")
item_names_df = items_table.toPandas()
item_names_df

Unnamed: 0,item_id,item_name
0,11090615440439,收藏類vcd 日本明星寫真收藏 美少女的單人床 小野今日子 珍藏片 ar 077 僅一片
1,21109047110994,製作所 空仰雲 山本和枝原畫 rpg
2,21205215652012,宇慶s舖 台灣精品 aistrong專業工具 pmr 125 迷你無牙尖嘴鉗
3,21207237358651,30w 外熱式 刀頭 烙鐵頭 電焊頭
4,21210079138903,7 11超商ibon turbobit 升級碼 1個月 290元 高級premium會員 代...
...,...,...
539995,22139341097135,crucial m500 2 5 ssd 240gb sata 6gb s ct240m50...
539996,22141497553733,kk ptcg 寶可夢 中文 080 070 亞莎 sr 全圖
539997,22141569064369,mu s 同人誌代購 五十六 ambs 俺私僕私粘膜接触大交謎時空 咒術迴戰
539998,22148309974234,艾露 代購 日版 預購22年2月 名偵探柯南 q版壓克力立牌 貓咪ver 分售


In [None]:
# Maximum number of item names placed in one batch.
batch_size = 1000
item_names = item_names_df["item_name"].tolist()

# Consecutive slices of at most `batch_size` names; the last slice may be shorter.
batches = [
    item_names[start : start + batch_size]
    for start in range(0, len(item_names), batch_size)
]
num_batches = len(batches)

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over unmasked positions, then L2-normalize.

    Args:
        model_output: Model result; element 0 is taken as the per-token embeddings.
        attention_mask: Mask that weights each token position; positions whose
            mask value is 0 contribute nothing to the average.
            Assumed to be (batch, seq) -- TODO confirm against the tokenizer output.

    Returns:
        The pooled embeddings, L2-normalized along axis 1.
    """
    token_vectors = model_output[0]

    # Add a trailing axis so the mask broadcasts across the embedding dimension.
    mask = tf.expand_dims(attention_mask, -1)
    mask = tf.cast(mask, tf.float32)

    masked_total = tf.reduce_sum(token_vectors * mask, axis=1)
    # Clamp the per-row mask sum so an all-zero mask cannot divide by zero.
    token_count = tf.math.maximum(tf.math.reduce_sum(mask, axis=1), 1e-9)

    return tf.math.l2_normalize(masked_total / token_count, axis=1)
    
# One pooled vector per item name, accumulated in batch order.
embeddings = []

for batch in tqdm(batches, total=num_batches):
    # Pad to the longest name in the batch; truncate anything beyond 128 tokens.
    tokens = tokenizer(batch, max_length=128, padding=True, truncation=True, return_tensors="tf")
    outputs = model(**tokens)
    pooled = mean_pooling(outputs, tokens['attention_mask'])
    # Extending with the NumPy array appends one row (one vector) per item.
    embeddings.extend(pooled.numpy())

item_names_df['embedding'] = embeddings
item_names_df

100%|██████████| 540/540 [09:39<00:00,  1.07s/it]


Unnamed: 0,item_id,item_name,embedding
0,11090615440439,收藏類vcd 日本明星寫真收藏 美少女的單人床 小野今日子 珍藏片 ar 077 僅一片,"[-0.020382637, -0.022923939, 0.022253165, 0.00..."
1,21109047110994,製作所 空仰雲 山本和枝原畫 rpg,"[-0.014215255, 0.002541248, 0.023496788, -0.00..."
2,21205215652012,宇慶s舖 台灣精品 aistrong專業工具 pmr 125 迷你無牙尖嘴鉗,"[-0.0027007463, 0.00042553843, 0.019098254, 0...."
3,21207237358651,30w 外熱式 刀頭 烙鐵頭 電焊頭,"[-0.004207788, 0.006130133, 0.020490682, 0.009..."
4,21210079138903,7 11超商ibon turbobit 升級碼 1個月 290元 高級premium會員 代...,"[-0.013036992, 0.0012372807, 0.023598712, 0.00..."
...,...,...,...
539995,22139341097135,crucial m500 2 5 ssd 240gb sata 6gb s ct240m50...,"[-0.008894075, 0.01466513, 0.005143025, -0.008..."
539996,22141497553733,kk ptcg 寶可夢 中文 080 070 亞莎 sr 全圖,"[-0.011072577, 0.0028415245, -0.0059396243, 0...."
539997,22141569064369,mu s 同人誌代購 五十六 ambs 俺私僕私粘膜接触大交謎時空 咒術迴戰,"[-0.012061499, -0.0151356915, 0.00684122, -0.0..."
539998,22148309974234,艾露 代購 日版 預購22年2月 名偵探柯南 q版壓克力立牌 貓咪ver 分售,"[-0.014642366, -0.011144949, 0.01542837, -0.00..."


In [None]:
# Persist only the item id and its embedding as a snappy-compressed Parquet file.
embedding_table = item_names_df[['item_id', 'embedding']]
embedding_table.to_parquet("/tmp/bert_embeddings.parquet", engine='pyarrow', compression='snappy')

In [None]:
!hdfs dfs -copyFromLocal /tmp/roberta_embeddings.parquet "/ruten/bert_embeddings.parquet"