In [11]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import os
import ast 

In [3]:
from datasets import load_dataset

# Load the "train" split from the local simplified-nq-train JSONL file and
# keep only its first 1000 examples so the experiment stays small.
ds = load_dataset(
    "json",
    data_files="v1.0-simplified_simplified-nq-train.jsonl/simplified-nq-train.jsonl",
    split="train",
).select(range(1000))

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
# Read only the first 1000 lines of the JSONL file for experimentation.
# With `chunksize`, read_json returns a lazy reader instead of a DataFrame;
# the `with` block closes the underlying file as soon as the first chunk has
# been taken, rather than leaving the handle open for the kernel's lifetime.
with pd.read_json(
    "v1.0-simplified_simplified-nq-train.jsonl/simplified-nq-train.jsonl",
    lines=True,
    chunksize=1000,
) as chunks:
    # use the first chunk for experiment
    df = next(chunks).reset_index(drop=True)

In [2]:
# parse the html and remove the html tags
def clear_tag(text):
    """Return the text content of ``text`` with all HTML markup removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend. Each
    text fragment is whitespace-stripped and the fragments are joined with a
    single space.
    """
    parsed = BeautifulSoup(text, "html.parser")
    plain_text = parsed.get_text(separator=" ", strip=True)
    return plain_text

# The document texts are read from a CSV file here. The commented-out line is
# the in-memory alternative that runs clear_tag over df['document_text'].
# NOTE(review): documents.csv is presumably a saved copy of that result — confirm.
# documents = df['document_text'].apply(clear_tag)
documents = pd.read_csv("./data/documents.csv").loc[:, 'document_text']

In [3]:
# separate the text into chunks
def split_into_chunks(text, chunk_size=100):
    """Split ``text`` into space-joined chunks of at most ``chunk_size`` words.

    A "word" is a maximal run of regex word characters, so punctuation and
    whitespace are dropped and only act as separators.

    Args:
        text: The string to split. ``None`` and float NaN (what pandas yields
            for an empty CSV cell) are treated as an empty document.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in document order. Every chunk holds exactly
        ``chunk_size`` words except possibly the last; the list is empty when
        ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative size would
            otherwise silently produce no chunks at all.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # `text != text` is true only for NaN, so this matches missing values
    # without rejecting other inputs that re.findall already reports itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = re.findall(r'\w+', text)
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# Chunk every document, then flatten the per-document lists into one flat
# list that keeps the documents (and the chunks inside them) in order.
chunk_lists = documents.apply(split_into_chunks)
all_chunks = [
    piece
    for per_document in chunk_lists
    for piece in per_document
]

In [12]:
# embed the chunk
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and encoder, move the encoder to the chosen
# device and switch it to evaluation mode (eval() returns the module itself).
tokenizer = AutoTokenizer.from_pretrained("facebook/contriever")
model = AutoModel.from_pretrained("facebook/contriever").to(device).eval()

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked tokens.

    Args:
        model_output: Object exposing ``last_hidden_state``.
            NOTE(review): assumed to be (batch, seq_len, hidden) — confirm.
        attention_mask: Mask with one entry per token (non-zero = keep). It is
            broadcast over the last axis of ``last_hidden_state``.

    Returns:
        A tensor with the token axis (dim 1) reduced: the masked sum of the
        token embeddings divided by the masked token count. A sequence whose
        mask is entirely zero pools to a zero vector instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * input_mask_expanded).sum(dim=1)
    # Clamp the denominator so an all-zero mask yields 0 / 1e-9 = 0 rather
    # than 0 / 0 = NaN; real counts are >= 1 and are left untouched.
    token_counts = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    return summed / token_counts


In [5]:
def process_chunks_to_single_csv(all_chunks, batch_size=50, output_csv="contriever_embeddings.csv", overwrite=True):
    """Embed text chunks batch by batch and store them in a single CSV file.

    Relies on the module-level ``tokenizer``, ``model``, ``device`` and
    ``mean_pooling``. Each batch is tokenized (padded, truncated to 512
    tokens), run through the model without gradient tracking, mean-pooled and
    written out immediately, so only one batch of embeddings is held in
    memory at a time.

    Args:
        all_chunks: Sequence of chunk strings to embed.
        batch_size: Number of chunks per forward pass; must be at least 1.
        output_csv: Path of the CSV file. It has two columns: ``chunk`` (the
            text) and ``embedding`` (the vector written as a list).
        overwrite: When true (default) any existing file is replaced, so
            re-running the cell does not append duplicate rows or a second
            header line in the middle of the file. When false, rows are
            appended and the header is written only if the file is missing
            or empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Write the header exactly once, up front. This also leaves a valid,
    # header-only CSV behind when there are no chunks to embed.
    write_header = (
        overwrite
        or not os.path.exists(output_csv)
        or os.path.getsize(output_csv) == 0
    )
    if write_header:
        pd.DataFrame(columns=["chunk", "embedding"]).to_csv(output_csv, mode='w', index=False)

    for start in tqdm(range(0, len(all_chunks), batch_size), desc="Embedding chunks"):
        batch_chunks = all_chunks[start:start + batch_size]

        # tokenizer
        inputs = tokenizer(batch_chunks, padding=True, truncation=True, return_tensors='pt', max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # forward
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = mean_pooling(outputs, inputs["attention_mask"]).cpu().tolist()

        # collect the batch into a DataFrame and append it below the header
        batch_df = pd.DataFrame({
            "chunk": batch_chunks,
            "embedding": embeddings
        })
        batch_df.to_csv(output_csv, mode='a', header=False, index=False)


In [6]:
# Embed at most the first 1000 chunks, 50 per batch, and store the result in
# contriever_embeddings.csv.
process_chunks_to_single_csv(
    all_chunks[:1000],
    batch_size=50,
    output_csv="contriever_embeddings.csv",
)

Embedding chunks: 100%|██████████| 20/20 [00:06<00:00,  3.28it/s]


In [9]:
import numpy as np
# Reload the stored chunks. The CSV holds every embedding as the text of a
# Python list, so each cell is parsed back into a real list first.
df = pd.read_csv("contriever_embeddings.csv")
df['embedding'] = df['embedding'].map(ast.literal_eval)

# Stack the per-row lists into one float32 matrix  # shape: (N, 768)
embedding_matrix = np.asarray(df['embedding'].tolist(), dtype='float32')

In [12]:
import faiss

# L2-normalize the rows in place so that the inner product computed by the
# index equals cosine similarity.
faiss.normalize_L2(embedding_matrix)

# IP = Inner Product; the index dimension is the embedding width.
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embedding_matrix)


In [14]:
# Save the FAISS index to disk as "contriever_index.faiss" so it can be
# loaded again later with faiss.read_index instead of being rebuilt.
faiss.write_index(index, "contriever_index.faiss")

In [16]:
# Use the question text of the first example as the search query, and end
# the cell with it so the notebook displays the value.
query = ds[0]['question_text']
query

'which is the most common use of opt-in e-mail marketing'

In [13]:
# Read the saved FAISS index back from "contriever_index.faiss".
# faiss is imported in this cell as well, so the cell does not depend on the
# index-building cell having been run in the current kernel.
import faiss

index = faiss.read_index("contriever_index.faiss")

In [17]:
# Tokenize the query and move the encoded batch to the model's device.
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs = inputs.to(device)

# Encode and pool without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)
    query_embedding = mean_pooling(outputs, inputs['attention_mask'])

# normalize for cosine similarity (normalize_L2 works in place on the array)
query_embedding = query_embedding.cpu().numpy()
faiss.normalize_L2(query_embedding)

# D holds the similarity scores and I the row ids of the 5 nearest chunks.
D, I = index.search(query_embedding, k=5)

In [None]:
# find the top-5
# Reload the chunk table; row i of the CSV is looked up by FAISS id i below.
df = pd.read_csv("contriever_embeddings.csv")
df['embedding'] = df['embedding'].apply(ast.literal_eval)

for idx in I[0]:
    # FAISS fills the result with -1 when the index holds fewer vectors than
    # k. iloc[-1] would silently print the LAST chunk for such a slot, so
    # skip it instead of showing a wrong hit.
    if idx < 0:
        continue
    # chunk 14 is the answer for the example query
    print(idx, ':', df['chunk'].iloc[idx])
    print()


14 : marketing is a method of advertising via email whereby the recipient of the advertisement has consented to receive it This method is one of several developed by marketers to eliminate the disadvantages of email marketing Opt in email marketing may evolve into a technology that uses a handshake protocol between the sender and receiver This system is intended to eventually result in a high degree of satisfaction between consumers and marketers If opt in email advertising is used the material that is emailed to consumers will be anticipated It is assumed that the recipient wants to receive it which makes

11 : behavior of the recipients The insights provided by consumer response to email marketing help businesses and organizations understand and make use of consumer behavior Email provides a cost effective method to test different marketing content including visual creative marketing copy and multimedia assets The data gathered by testing in the email channel can then be used across 