In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import os

  from tqdm.autonotebook import tqdm, trange


In [1]:
from apihelper.dataloaders import get_finnish_data

# Fetch the Finnish dataset through the project helper (called with split=False).
df = get_finnish_data(split=False)


def _first_50_tokens(text):
    """Return ``text`` reduced to its first 50 space-delimited tokens."""
    return " ".join(text.split(" ")[:50])


# Cap every description at 50 tokens before it is used further down.
df["Description"] = df["Description"].apply(_first_50_tokens)


In [2]:
# Load the cleaned ticket export, restricted to the four columns used below.
df = pd.read_parquet(
    "../data_exploration/data/dwp_cleaned_tickets.parquet",
    columns=["Title", "Description", "DbName", "ObjectID"],
    engine="pyarrow",
)
# Keep a single source database and drop rows missing either text field.
df = df[df['DbName'].isin(['M42Production_211'])]
df = df[df["Title"].notna()]
df = df[df["Description"].notna()]
# Drop placeholder titles: anything of at most one character after stripping,
# and the literal title "test" (case-insensitive). The earlier form compared
# len(...) -- an int -- with the string "test"; that comparison is always
# unequal, so "test" titles were never filtered out.
df = df[df["Title"].apply(lambda x: len(str(x).strip()) > 1 and str(x).strip().lower() != "test")]
# Truncate every description to its first 50 space-separated tokens.
df["Description"] = df["Description"].apply(lambda x: " ".join(x.split(" ")[:50]))
# Optional combined text column (the Jina embedding cell reads df["FullText"]).
# Note that the `del` below also removes the Title/Description columns that the
# sentence-transformer cells use.
# df["FullText"] = df["Title"] + " " + df["Description"]
# del df["Title"],df["Description"]
len(df), df["DbName"].value_counts()

(271758,
 DbName
 M42Production_211    271758
 Name: count, dtype: int64)

### Custom Embeddings (jina-embeddings-v3)

In [2]:
data_dir = "./data/jina_embeddings"
os.makedirs(data_dir, exist_ok=True)
import torch
from transformers import AutoModel

# Prefer Apple's MPS backend (the device this cell was originally pinned to),
# then CUDA, and fall back to the CPU so the cell also runs on other machines.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    jina_device = "mps"
elif torch.cuda.is_available():
    jina_device = "cuda"
else:
    jina_device = "cpu"

# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True).to(jina_device)

# When calling the `encode` function, you can choose a `task` based on the use case:
# 'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'
# Alternatively, you can choose not to pass a `task`, and no specific LoRA adapter will be used.

# Use df["FullText"] when the loading cell created it; otherwise derive it here
# as "<Title> <Description>" on a copy, so this cell no longer fails with a
# KeyError and `df` keeps its original columns.
if "FullText" in df.columns:
    foo = df.drop_duplicates("FullText")
else:
    foo = df.assign(FullText=df["Title"] + " " + df["Description"]).drop_duplicates("FullText")
# `df` is intentionally NOT deleted: the sentence-transformer cells further
# down still read it, and deleting it made a top-to-bottom run fail.
embeddings = model.encode(foo["FullText"].values.tolist(), task="classification",batch_size=8,verbose=True)
# Store the vectors and the texts they were computed from as two .npy files.
with open(f"{data_dir}/desc_embeddings.npy", "wb") as f:
    np.save(f, embeddings)
with open(f"{data_dir}/desc_texts.npy", "wb") as f:
    np.save(f, foo["FullText"].values)
# Drop the reference to the embedding array once it is on disk.
del embeddings

### Sentence transformer embeddings

In [3]:
# Encoder used for the two embedding cells below: dunzhang/stella_en_400M_v5,
# loaded with trust_remote_code on the CPU; config_kwargs sets
# use_memory_efficient_attention and unpad_inputs to False.
#
# Other checkpoints previously listed here as commented-out alternatives:
#   paraphrase-multilingual-mpnet-base-v2
#   intfloat/multilingual-e5-large-instruct
#   sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
#   distilbert-base-multilingual-cased
#   google-bert/bert-base-multilingual-cased
stella_config = {"use_memory_efficient_attention": False, "unpad_inputs": False}
sentence_encoder = SentenceTransformer(
    "dunzhang/stella_en_400M_v5",
    device="cpu",
    trust_remote_code=True,
    config_kwargs=stella_config,
)

# Folder that receives the .npy files written by the embedding cells.
data_dir = "./data/stella_en_400M_v5"
os.makedirs(data_dir, exist_ok=True)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Encode each distinct description once (first occurrence of a duplicate wins).
foo = df.drop_duplicates(subset="Description")
desc_embeddings = sentence_encoder.encode(
    foo["Description"].values.tolist(),
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Write the vectors and the texts taken from the same de-duplicated frame.
with open(f"{data_dir}/desc_embeddings.npy", "wb") as f:
    np.save(f, desc_embeddings)
with open(f"{data_dir}/desc_texts.npy", "wb") as f:
    np.save(f, foo["Description"].values)

# The array is on disk now; drop the in-memory reference.
del desc_embeddings

Batches: 100%|██████████| 14308/14308 [4:15:52<00:00,  1.07s/it] 


In [5]:
# Encode each distinct title once (first occurrence of a duplicate wins).
foo = df.drop_duplicates(subset="Title")
title_embeddings = sentence_encoder.encode(
    foo["Title"].values.tolist(),
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Write the vectors and the texts taken from the same de-duplicated frame.
with open(f"{data_dir}/title_embeddings.npy", "wb") as f:
    np.save(f, title_embeddings)
with open(f"{data_dir}/title_texts.npy", "wb") as f:
    np.save(f, foo["Title"].values)

# The array is on disk now; drop the in-memory reference.
del title_embeddings

Batches: 100%|██████████| 10614/10614 [59:21<00:00,  2.98it/s] 


### Ada3 Embeddings

In [3]:
%load_ext autoreload
%autoreload 2
from dotenv import load_dotenv

load_dotenv(override=True)
from apihelper.openai_embeddings import OpenAIAzureEmbeddings

# use redis to store the embeddings
import redis
import numpy as np
from tqdm import tqdm
import pandas as pd

# Reload the cleaned ticket export with the four columns used in this section.
df = pd.read_parquet(
    "../data_exploration/data/dwp_cleaned_tickets.parquet",
    columns=["Title", "Description", "DbName", "ObjectID"],
    engine="pyarrow",
)
# Drop rows that are missing either text field.
df = df[df["Title"].notna()]
df = df[df["Description"].notna()]
# Remove tickets whose title is just "test" (ignoring case and surrounding
# whitespace). The earlier form compared len(...) -- an int -- with the string
# "test", which is always unequal, so this filter never removed anything.
df = df[df["Title"].apply(lambda x: str(x).strip().lower() != "test")]
# Restrict to a single source database.
df = df[df["DbName"] == "M42Production_646"]
# df.sort_values(['ObjectID','DbName']).drop_duplicates(subset=['Description'],inplace=True,keep='first')
# assert len(df) == len(df.drop_duplicates(subset=['ObjectID','DbName']))

# Redis (local instance, database 4) is the cache for computed embeddings;
# the loops below key each entry by its text.
redis_conn = redis.Redis("localhost", 6379, db=4)
get_emb = OpenAIAzureEmbeddings()
# delete all keys
# redis_conn.flushdb()
# Show how many rows remain and how many keys the cache currently holds.
len(df),len(redis_conn.keys())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


(119252, 0)

In [4]:
# Fill the Redis cache: request an embedding for every title that is not yet
# stored, keyed by the title text itself. Titles already cached are skipped.
for title in tqdm(df["Title"].values, total=len(df)):
    if title in redis_conn:
        continue
    vectors = get_emb.get_embedding([title])
    # Only one text was sent, so the first result is the one to store (raw bytes).
    redis_conn.set(title, vectors[0].tobytes())

100%|██████████| 119252/119252 [4:34:09<00:00,  7.25it/s]  


In [5]:
# Keep only the first 50 space-separated tokens of each description.
df["Description"] = df["Description"].apply(
    lambda desc: " ".join(desc.split(" ")[:50])
)

In [6]:
# Fill the Redis cache: request an embedding for every description that is not
# yet stored, keyed by the description text. Cached descriptions are skipped.
for description in tqdm(df["Description"].values, total=len(df)):
    if description in redis_conn:
        continue
    vectors = get_emb.get_embedding([description])
    # Only one text was sent, so the first result is the one to store (raw bytes).
    redis_conn.set(description, vectors[0].tobytes())

100%|██████████| 119252/119252 [4:47:20<00:00,  6.92it/s]  


In [7]:
# Read the cached bytes for every description back from Redis and reinterpret
# them as float32 vectors.
# NOTE(review): assumes the vectors were stored as float32 -- confirm against
# what get_embedding returns.
recover = [
    (description, np.frombuffer(redis_conn.get(description), dtype=np.float32))
    for description in tqdm(df["Description"].values, total=len(df))
]
# Split the (text, vector) pairs into two parallel tuples.
texts, embeddings = zip(*recover)

100%|██████████| 119252/119252 [00:03<00:00, 33592.93it/s]


In [8]:
# Output folder for the Ada3 arrays of this database; created when missing.
data_dir = "./data/M42Production_646_ada3_small"
os.makedirs(data_dir, exist_ok=True)

# Description vectors, converted from the tuple of arrays to one ndarray.
with open(f"{data_dir}/desc_embeddings.npy", "wb") as f:
    np.save(f, np.array(embeddings))

# Description texts, in the same order as the vectors above.
with open(f"{data_dir}/desc_texts.npy", "wb") as f:
    np.save(f, np.array(texts))

In [9]:
# Read the cached bytes for every title back from Redis and reinterpret them
# as float32 vectors.
# NOTE(review): assumes the vectors were stored as float32 -- confirm against
# what get_embedding returns.
recover = [
    (title, np.frombuffer(redis_conn.get(title), dtype=np.float32))
    for title in tqdm(df["Title"].values, total=len(df))
]
# Split the (text, vector) pairs into two parallel tuples.
texts, embeddings = zip(*recover)

100%|██████████| 119252/119252 [00:03<00:00, 34410.74it/s]


In [10]:
# Title vectors, converted from the tuple of arrays to one ndarray.
with open(f"{data_dir}/title_embeddings.npy", "wb") as f:
    np.save(f, np.array(embeddings))

# Title texts, in the same order as the vectors above.
with open(f"{data_dir}/title_texts.npy", "wb") as f:
    np.save(f, np.array(texts))