# Make article embeddings using Hugging Face

In [1]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
from tqdm import tqdm
import polars as pl
import numpy as np
import torch

from ebrec.utils._nlp import generate_embeddings_with_transformers
from ebrec.utils._python import batch_items_generator
from ebrec.utils._polars import concat_str_columns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-large"

Path for loading data:

In [3]:
DATA_PATH = Path("/Users/zhouchuanqi/Desktop/ebnerd_data/ebnerd_small").expanduser()
DUMP_DIR = DATA_PATH.joinpath("artifacts", TRANSFORMER_MODEL_NAME.replace("/", "_"))
DUMP_DIR.mkdir(parents=True, exist_ok=True)
print(f"Embeddings will be stored at: {DUMP_DIR}")

Embeddings will be stored at: /Users/zhouchuanqi/Desktop/ebnerd_data/ebnerd_small/artifacts/FacebookAI_xlm-roberta-large


In [4]:
df_articles = pl.read_parquet(DATA_PATH.joinpath("articles.parquet"))
df_articles.head(5)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spill…","""ISHOCKEY: Isho…",2023-06-29 06:20:57,False,"""Ambitionerne o…",2003-08-28 08:55:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tv…","""Hoffet tvang P…",2023-06-29 06:21:16,False,"""Den britiske t…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_defaul…","""https://ekstra…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""
3057622,"""Rådden kørsel …","""Kan ikke straf…",2023-06-29 06:21:24,False,"""Slingrende spr…",2005-10-10 07:20:00,[3047102],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Transportmiddel"", ""Bil""]",118,[133],"""nyheder""",,,,0.9236,"""Negative"""
3073151,"""Mærsk-arvinger…","""FANGET I FLODB…",2023-06-29 06:21:38,False,"""To oldebørn af…",2005-01-04 06:59:00,"[3067474, 3067478, 3153705]","""article_defaul…","""https://ekstra…",[],[],"[""Erhverv"", ""Privat virksomhed"", … ""Rejse""]",118,[133],"""nyheder""",,,,0.9945,"""Negative"""
3193383,"""Skød svigersøn…","""44-årig kvinde…",2023-06-29 06:22:57,False,"""En 44-årig mor…",2003-09-15 15:30:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9966,"""Negative"""


We're just going to demo, set to False to run all articles.

In [5]:
DEMO = True # orignal_text True
if DEMO:
    df_articles = df_articles[:10]

In [6]:
# 修改部分
df_articles = df_articles.with_columns(
    df_articles['published_time'].dt.strftime('%Y-%m-%d %H:%M:%S')
)
concat_columns = ["title", "subtitle", "body", "published_time"]

# concat_columns = ["title", "subtitle", "body"]

Select the Transformer model and the batch-size of which it will iterate the articles

In [7]:
model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)



Make the column with text data you want to embed:

In [8]:
df_articles, col_name = concat_str_columns(df_articles, concat_columns)

df_articles.select(col_name).head(5)

title-subtitle-body-published_time
str
"""Ishockey-spill…"
"""Prins Harry tv…"
"""Rådden kørsel …"
"""Mærsk-arvinger…"
"""Skød svigersøn…"


Embed text:

In [9]:
BATCH_SIZE = 32
n_batches = int(np.ceil(df_articles.height / BATCH_SIZE))

chunked_text_list = batch_items_generator(df_articles[col_name].to_list(), BATCH_SIZE)
embeddings = (
    generate_embeddings_with_transformers(
        model=model,
        tokenizer=tokenizer,
        text_list=text_list,
        batch_size=BATCH_SIZE,
        disable_tqdm=True,
    )
    for text_list in tqdm(
        chunked_text_list, desc="Encoding", total=n_batches, unit="text"
    )
)
embeddings = torch.vstack(list(embeddings))

Encoding: 100%|██████████| 1/1 [00:05<00:00,  5.12s/text]


Make the dataframe

In [10]:
# embeddings_name = f"{col_name}-{TRANSFORMER_MODEL_NAME}"
embeddings_name = "article_embeddings"
series_emb = pl.Series(embeddings_name, embeddings.to("cpu").numpy())
df_emb = df_articles.select("article_id").with_columns(series_emb)

Dump the embeddings:

In [11]:
file_path = DUMP_DIR.joinpath(f"{embeddings_name.replace('/', '_')}.parquet")
df_emb.write_parquet(file_path)
print(f"Embeddings saved to: {file_path}")

Embeddings saved to: /Users/zhouchuanqi/Desktop/ebnerd_data/ebnerd_small/artifacts/FacebookAI_xlm-roberta-large/article_embeddings.parquet


In [12]:
import polars as pl

# 加载 Parquet 文件
df = pl.read_parquet("/Users/zhouchuanqi/Desktop/ebnerd_data/ebnerd_small/artifacts/FacebookAI_xlm-roberta-large/title-subtitle-body-formatted_time-FacebookAI_xlm-roberta-large.parquet")

# 查看 DataFrame 的前几行
print(df.head())

# 查看 DataFrame 的基本信息（列名、数据类型等）
print(df.schema)

shape: (5, 2)
┌────────────┬───────────────────────────────────┐
│ article_id ┆ title-subtitle-body-formatted_ti… │
│ ---        ┆ ---                               │
│ i32        ┆ list[f32]                         │
╞════════════╪═══════════════════════════════════╡
│ 3037230    ┆ [0.116953, 0.019421, … -0.248299… │
│ 3044020    ┆ [0.161663, 0.040329, … -0.24165]  │
│ 3057622    ┆ [0.077545, 0.048769, … -0.305967… │
│ 3073151    ┆ [0.060434, 0.005382, … -0.251524… │
│ 3193383    ┆ [0.111904, 0.0116, … -0.244351]   │
└────────────┴───────────────────────────────────┘
OrderedDict([('article_id', Int32), ('title-subtitle-body-formatted_time-FacebookAI/xlm-roberta-large', List(Float32))])


# DONE 🚀