# Project: Semantic search with sentence embedding
Search OneNote page and paragraph best matching a given query

In [1]:
%reset -f
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch

from sem_search_pdf.utils.embedding_generator import generate_embeddings_for_dataframe
from sem_search_pdf.utils.pdf_reader import extract_information



# Load data

In [2]:
file_path = 'data/test.pdf'
file_name = 'test.pdf'
tmp = extract_information(file_path, file_name)

# Create embeddings

Instructions: https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search

### Preprocessing

In [3]:
model_ckpt = 'Alibaba-NLP/gte-multilingual-base'  # "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

device = torch.device("cuda")
model.to(device)

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification 

NewModel(
  (embeddings): NewEmbeddings(
    (word_embeddings): Embedding(250048, 768, padding_idx=1)
    (rotary_emb): NTKScalingRotaryEmbedding()
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): NewEncoder(
    (layer): ModuleList(
      (0-11): 12 x NewLayer(
        (attention): NewAttention(
          (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): NewGatedMLP(
          (up_gate_proj): Linear(in_features=768, out_features=6144, bias=False)
          (down_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act_fn): GELUActivation()
          (hidden_dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [4]:
tt = generate_embeddings_for_dataframe(tmp, tokenizer, model, device)

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [5]:
# !pip install faiss-cpu
tt.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['page_in_on', 'title', 'page_in_pdf', 'paragraph', 'text', 'file_name', 'data', 'embeddings'],
    num_rows: 262
})

In [10]:
from sem_search_pdf.utils.embedding_generator import compute_embeddings
question = "How to use prompt to create a wiki"
question_embedding = compute_embeddings([question], tokenizer, model, device).cpu().detach().numpy()

In [11]:
scores, samples = tt.get_nearest_examples(
    "embeddings", question_embedding, k=5
)
import pandas as pd
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=True, inplace=True)

In [12]:
for _, row in samples_df.iterrows():
    print(f"PAGE: {row.page_in_on}")
    print(f"PAGE IN PDF: {row.page_in_pdf}")
    print(f"PARAGRAGH IN PDF: {row.paragraph}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"TEXT: {row.text}")
    print("=" * 50)
    print()

PAGE: 9
PAGE IN PDF: 19
PARAGRAGH IN PDF: 4
SCORE: 292.341796875
TITLE: Prompt Engineering
TEXT:  We will continue this iterative process with me 
providing additional information to you and you updating 
the prompt in the Revised prompt section until it's 
complete.
A Chinse 
version
我想请你xxxxx，请问我应该如何向你提问才能得到最满意的答
案，请提供全面、详细的建议，针对每一个建议请你提供具体
的提问范例，注意这些范例都是关于如何向你提问xxxxx，最后
根据你所有的建议，再综合提供一个总的提问范例，注意这个
范例必须同时体现你所有的建议。
Another 
English version 
in case of 
misunderstandi
ng 
Rephrase and expand the question, and respond
Wiki 获取信息,学习领
域,术语等
I want you to act as a Wikipedia page. I will give you the 
name of a topic, and you will provide a summary of that 
topic in the format of a Wikipedia page. Your summary 
should be informative and factual, covering the most 
important aspects of the topic. Start your summary with an 
introductory paragraph that gives an overview of the topic. 
My first topic is “The Great Barrier Reef.”
General 定义角色和任务


PAGE: 9
PAGE IN PDF: 19
PARAGRAGH IN PDF: 1
SCOR

In [149]:
# for i in range(9, 18):
#     # print(df[(df.page_in_pdf == 17) & (df.paragraph == i)].text.values[0], '-----\n')

预训练阶段，模型使用和 GPT-3 相同的数据集进行无监督学习，学习语言的基本知识和规律。 -----

微调阶段，模型使用一些人工标注的数据进行强化学习，学习如何根据指令生成合适的输出。
收集人类反馈:使用初始化模型生成多个不同摘要人工进行排序，得到一批排好序的摘要样本;
人工标注的数据包括两部分：指令和反馈。指令是一些用自然语言描述的任务，如 “写一首关于春天的诗” 或 “给我一个关于狗的笑话”。反馈是一些用数字表示的评分，如 “1” 表示很差，
“5” 表示很好。反馈是由人类标注者根据模型的输出给出的，反映了模型输出的质量和合理性。
1)
 -----

训练奖励模型:使用第1步得到的样本集，训练模型.该模型输入为一篇文章和对应的一个摘要，模型输出为该摘要的得分ii.
训练策略模型:使用初始化的策略模型生成一篇文章的摘要，然后使用奖励模型对该摘要打分，再使用打分值借助PPO算法重新优化策略模型;iii.
 -----

(在微调阶段，模型使用一个叫做 Actor-Critic 的算法进行强化学习。Actor-Critic 算法包括两个部分：Actor 和 Critic。Actor 是一个生成器，它根据指令生成输出。Critic 是一个评估器，它根据反馈评估
输出的奖励值。Actor 和 Critic 之间相互协作和竞争，不断地更新自己的参数，以提高奖励值。)
 -----

ii.
技术细节iii.
Step The Supervised Fine-Tuning (SFT) policyThe reward model (RM) Reinforcement Learning
Goal Collect demonstration data to train the 
SFT model to learn the "proper" 
response to a prompt
Learn an objective function directly from the data. Build an automatic 
system to mimic human preferences.
The purpose of this function is togive a score to the SFT model outputs 
to reflect human p

# Llama 2
Refer to https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Pipeline

In [None]:
# Not enough memory on my Legion Y9000P. Need to set pagefile to system managed
from transformers import pipeline
checkpoint = 'meta-llama/Llama-2-7b-chat-hf'
generator = pipeline("text-generation", model=checkpoint, device_map='auto')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
prompt = ['How much memory is needed to run Llama2 ']
%time response = generator(prompt, max_new_tokens=50, num_beams=2, do_sample=True, top_k=5, top_p=0.95)

CPU times: total: 3min 4s
Wall time: 8min 26s


[[{'generated_text': "How much memory is needed to run Llama2 \n\nAnswer: Llama2 is a relatively lightweight library, and it doesn't require a lot of memory to run. In fact, Llama2 is designed Limited spin faut pap Fürulen mű efectспе Welcome以 politique domin"}]]

## With config

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(checkpoint)

NameError: name 'checkpoint' is not defined

In [None]:
inputs = tokenizer(raw_inputs, 
                   # padding='longest', truncation=True, max_length=128, 
                   return_tensors="pt").input_ids
outputs = model.generate(inputs, max_new_tokens=20, num_beams=2, do_sample=True, top_k=5, top_p=0.95)

In [None]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["Hello. Who are you?\n\nComment: Hello! I'm just an AI designed to assist and communicate with users"]

## Code Llama

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
checkpoint = 'codellama/CodeLlama-7b-Python-hf'
# generator = pipeline("text-generation", model=checkpoint, device_map='auto')


model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    # load_in_8bit=True,
    # torch_dtype=torch.float16,
    offload_folder="./save_folder",  # Need to create this folder anyway
    device_map="auto",
    # device_map={"": 0},  # not enough GPU memory
)

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"

prompt = 'Write a piece of Python code to order a list of numbers'
inputs = tokenizer(prompt, 
                   # padding='longest', truncation=True, max_length=128, 
                   return_tensors="pt"
                  )

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=100)[0], skip_special_tokens=True))

In [None]:
prompt = ['Write a piece of Python code to order a list of numbers']
%time generator(prompt, max_new_tokens=50, num_beams=2, do_sample=True, top_k=5, top_p=0.95)