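# Embedding Preprocessing

Reduces each article embedding column to `dim` principal components with PCA, joins the reduced columns on `article_id`, and saves the result as a parquet file.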


In [1]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive/MyDrive/RecSys2024/
base_path = '/content/drive/MyDrive/RecSys2024/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Exp_Recsys2024_LGBM_train.ipynb  output				     Recsys2024_ensemble.ipynb
Exp_Recsys2024_preprocess.ipynb  preprocess2_embed_similarity.ipynb  Recsys2024_LGBM_test.ipynb
feature_output			 preprocess2.ipynb		     Recsys2024_LGBM_train.ipynb
input				 preprocess_article.ipynb	     Recsys2024_preprocess.ipynb
models				 preprocess_create_embed.ipynb
old_Recsys2024_LGBM_train.ipynb  Recsys2024_EDA.ipynb


In [2]:
import pandas as pd
import polars as pl
import numpy as np
import pickle
import gc
from sklearn.decomposition import PCA

In [3]:
# Folder the reduced embeddings are written to (no trailing slash; the save
# cell inserts the separator itself).
out_path = f'{base_path}feature_output'
# Folder the raw embedding parquet files are read from.
input_path = f'{base_path}input/'

# True  -> use the three files under recsys24_share/ (subtitle, title,
#          title+subtitle embeddings)
# False -> use the five official embedding files
yyama_flag = True

# Number of PCA components kept per embedding column.
# Alternatives previously listed here: 32, 8, 4, 2.
dim = 16

In [4]:
# Pick the set of embedding files to process (locations relative to input_path):
# the yyama embeddings when yyama_flag is set, otherwise the official embeddings.
embed_path_list = (
    [
        'recsys24_share/article_subtitle_embeddings_distiluse-base-multilingual-cased-v2.parquet',
        'recsys24_share/article_title_embeddings_distiluse-base-multilingual-cased-v2.parquet',
        'recsys24_share/article_title_subtitle_embeddings_distiluse-base-multilingual-cased-v2.parquet',
    ]
    if yyama_flag
    else [
        'Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet',
        'Ekstra_Bladet_image_embeddings/image_embeddings.parquet',
        'Ekstra_Bladet_word2vec/document_vector.parquet',
        'FacebookAI_xlm_roberta_base/xlm_roberta_base.parquet',
        'google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet',
    ]
)

# Number of embedding sources in the selected set (3 for yyama, 5 for official).
N = len(embed_path_list)

# Turn the relative locations into absolute paths under input_path.
embed_path_list = [f'{input_path}{relative}' for relative in embed_path_list]

In [5]:
# Display the resolved absolute paths to confirm which embedding files will be loaded.
embed_path_list

['/content/drive/MyDrive/RecSys2024/input/recsys24_share/article_subtitle_embeddings_distiluse-base-multilingual-cased-v2.parquet',
 '/content/drive/MyDrive/RecSys2024/input/recsys24_share/article_title_embeddings_distiluse-base-multilingual-cased-v2.parquet',
 '/content/drive/MyDrive/RecSys2024/input/recsys24_share/article_title_subtitle_embeddings_distiluse-base-multilingual-cased-v2.parquet']

In [6]:
# Read every embedding parquet file into its own polars DataFrame.
df_list = [pl.read_parquet(path) for path in embed_path_list]

if yyama_flag:
    # Each yyama file keeps its vectors in a column called 'document_vector';
    # give every frame its own column name so they stay distinguishable.
    col_name_list = ['sub_vector', 'title_vector', 'title_sub_vector']
    df_list = [
        frame.rename({'document_vector': column})
        for frame, column in zip(df_list, col_name_list)
    ]
else:
    # Vector column name of each official file, in the same order as the paths.
    col_name_list = ['contrastive_vector', 'image_embedding', 'document_vector', 'FacebookAI/xlm-roberta-base', 'google-bert/bert-base-multilingual-cased']

# Sanity check: print each column's mean vector length, truncated to an int.
for i, name in enumerate(col_name_list):
    print(i, name)
    mean_len = df_list[i][name].list.len().mean()
    print(int(mean_len))

0 sub_vector
512
1 title_vector
512
2 title_sub_vector
512


In [7]:
def apply_pca_to_row(row, pca):
    """Project a single embedding vector with an already fitted PCA model.

    Args:
        row: One embedding as a flat sequence of numbers.
        pca: A fitted estimator exposing ``transform`` (for example a
            ``sklearn.decomposition.PCA`` that has been fitted on the full
            embedding matrix).

    Returns:
        The reduced vector as a flat Python list of floats.
    """
    array = np.array(row).reshape(1, -1)  # shape (1, n_features): a one-row matrix
    # Use transform, not fit_transform: fitting on a single row would discard
    # the model learned from the full data and cannot yield more than one
    # component from one sample.
    reduced = pca.transform(array)
    return reduced.flatten().tolist()  # back to a flat 1-D list

# Reduce every embedding column to `dim` principal components, fitting one PCA
# per column.
for i, name in enumerate(col_name_list):
    print(i, name)

    # Stack the per-row vectors of this column into one 2-D matrix.
    array_list = np.vstack(df_list[i][name].to_list())

    # Fit the PCA on all rows and project them in one step. random_state pins
    # the result when scikit-learn selects its randomized SVD solver, so
    # re-running the notebook reproduces the same features.
    pca = PCA(n_components=dim, random_state=0)
    reduced_array = pca.fit_transform(array_list)
    reduced_lists = reduced_array.tolist()

    # Swap the raw column for the reduced one, stored as list[f32] and named
    # "<original>_pca<dim>".
    df_list[i] = df_list[i].with_columns(
        pl.Series(f'{name}_pca{dim}', reduced_lists, dtype=pl.List(pl.Float32))
    ).drop(name)

0 sub_vector
1 title_vector
2 title_sub_vector


In [8]:
# Merge all reduced embedding frames into one table keyed by article_id.
# The first frame is the left side of every join, so only its article_ids are
# kept. Looping over the list handles both the 3-file and the 5-file set
# without hard-coded indices.
df = df_list[0]
for idx in range(1, len(df_list)):
    df = df.join(df_list[idx], on='article_id', how='left')

# The per-source frames are no longer needed; release them.
del df_list
gc.collect()

0

In [9]:
# Preview the joined frame: article_id plus one reduced-embedding column per source.
df

article_id,sub_vector_pca16,title_vector_pca16,title_sub_vector_pca16
i32,list[f32],list[f32],list[f32]
3000022,"[-0.108365, 0.123932, … -0.013907]","[-0.136341, -0.090135, … -0.047948]","[0.19828, 0.0197, … -0.065229]"
3000063,"[0.057012, 0.030963, … 0.094973]","[-0.181426, -0.175537, … -0.089643]","[-0.046479, -0.044283, … 0.061443]"
3000613,"[-0.125592, -0.15548, … 0.005403]","[-0.078775, -0.070515, … -0.068617]","[-0.065991, -0.156383, … -0.148105]"
3000700,"[-0.057773, 0.034485, … -0.050501]","[-0.170705, -0.036604, … -0.104798]","[-0.030511, 0.236575, … 0.010521]"
3000840,"[-0.13284, 0.053988, … 0.142838]","[-0.137563, -0.171242, … -0.039644]","[0.095699, -0.014137, … -0.11007]"
3001278,"[-0.089386, -0.180812, … 0.073659]","[-0.059936, -0.044763, … 0.089978]","[-0.165859, -0.096803, … 0.060559]"
3001299,"[-0.109626, -0.041902, … -0.039331]","[-0.165775, -0.135221, … 0.016708]","[0.008296, -0.078844, … 0.098792]"
3001353,"[-0.082099, 0.242142, … 0.063743]","[-0.184137, 0.152754, … 0.05981]","[0.201102, 0.087, … 0.106815]"
3001457,"[-0.101444, -0.164577, … -0.059862]","[0.030568, 0.017315, … 0.052029]","[-0.086248, -0.192523, … -0.010905]"
3001459,"[-0.096603, 0.112267, … -0.033047]","[-0.185588, -0.083424, … -0.076919]","[0.103519, -0.072743, … 0.001674]"


# Save

In [10]:
# Choose the output file name by embedding set, then write the joined frame once.
save_path = (
    f"{out_path}/yyama_embed_pca{dim}.parquet"
    if yyama_flag
    else f"{out_path}/embed_pca{dim}.parquet"
)
df.write_parquet(save_path)

# Similarity Calculation