In [1]:
# Imports
import pandas as pd
import numpy as np
from googletrans import Translator, LANGUAGES
from sentence_transformers import SentenceTransformer
import sentencepiece as spm
import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Dataset used: https://www.kaggle.com/datasets/subinium/emojiimage-dataset/data

full_emoji = pd.read_csv("../Data/full_emoji.csv")

# Drop the columns listed below and keep the rest of the frame;
# `columns=` already names the axis, so no separate `axis` argument is needed.
emojis_data = full_emoji.drop(
    columns=[
        "Apple",
        "Google",
        "Facebook",
        "Windows",
        "Twitter",
        "JoyPixels",
        "Samsung",
        "Gmail",
        "SoftBank",
        "DoCoMo",
        "KDDI",
    ]
)

emojis_data.head()

Unnamed: 0,#,emoji,unicode,name
0,1,😀,U+1F600,grinning face
1,2,😃,U+1F603,grinning face with big eyes
2,3,😄,U+1F604,grinning face with smiling eyes
3,4,😁,U+1F601,beaming face with smiling eyes
4,5,😆,U+1F606,grinning squinting face


In [6]:
# Translate the emoji names into Japanese.

# Single googletrans client, reused by translate_to_japanese for every call.
translator = Translator()


def translate_to_japanese(text):
    """Translate ``text`` from English ('en') to Japanese ('ja').

    Uses the module-level ``translator``. This is best-effort: if the
    translation raises any exception, the error is printed and the
    original ``text`` is returned unchanged.
    """
    try:
        result = translator.translate(text, src='en', dest='ja').text
    except Exception as err:
        # Fall back to the untranslated input so one failure does not abort the batch.
        print("error:", err)
        result = text
    return result

# Build the Japanese-name column by translating each English name element-wise.
emojis_data['name_ja'] = emojis_data['name'].map(translate_to_japanese)

emojis_data.head()


Unnamed: 0,#,emoji,unicode,name,name_ja
0,1,😀,U+1F600,grinning face,ニヤリとした顔
1,2,😃,U+1F603,grinning face with big eyes,大きな目で笑っている顔
2,3,😄,U+1F604,grinning face with smiling eyes,笑顔の目でニヤリとした顔
3,4,😁,U+1F601,beaming face with smiling eyes,笑顔の目で晴れやかな顔
4,5,😆,U+1F606,grinning squinting face,ニヤニヤと目を細めた顔


In [37]:
# Persist the translated frame. The row index is written as well (pandas default).
# NOTE(review): the frame already carries an "Unnamed: 0" column, so this adds a
# second unnamed column to the file — consider index=False if no reader relies on it.
emojis_data.to_csv("../Data/emojis_data.csv", index=True)

In [7]:
# Load the Japanese text-embedding model (GLuCoSE).
# https://huggingface.co/pkshatech/GLuCoSE-base-ja
model = SentenceTransformer('pkshatech/GLuCoSE-base-ja')
# Embed the Japanese emoji names, one embedding per row, in row order.
emoji_embeddings = model.encode(list(emojis_data["name_ja"]))




In [46]:
# Read the search text (input() already returns a str).
input_word = input("検索: ")
input_embedding = model.encode([input_word])

# Scale every emoji embedding (row-wise) and the query embedding to unit length,
# so that the dot product below is the cosine similarity.
emoji_embeddings_norm = emoji_embeddings / np.linalg.norm(emoji_embeddings, axis=1, keepdims=True)
input_embedding_norm = input_embedding / np.linalg.norm(input_embedding)

# Cosine similarity between each emoji name and the query; one row per emoji.
cosine_similarities = np.dot(emoji_embeddings_norm, input_embedding_norm.T)

# Row positions of the 5 most similar emojis, best match first
# (negating the scores turns argsort's ascending order into descending).
closest_emoji_indices = np.argsort(-cosine_similarities, axis=0)[:5].flatten()

closest_emojis = emojis_data.iloc[closest_emoji_indices]

# Print the results.
print(f"入力単語「{input_word}」に近い絵文字:")
# Iterate over the positional indices themselves: iterrows() would yield index
# *labels*, which only coincide with positions while the frame has a default
# RangeIndex, and both .iloc and the similarity array are position-based.
for position in closest_emoji_indices:
    row = emojis_data.iloc[position]
    print(f"{row['emoji']} :{row['name']}: (類似度: {cosine_similarities[position][0]:.3f})")

入力単語「辞書」に近い絵文字:
📝 :memo: (類似度: 0.661)
📓 :notebook: (類似度: 0.649)
🗒 :spiral notepad: (類似度: 0.592)
📇 :card index: (類似度: 0.576)
🔖 :bookmark: (類似度: 0.566)
