In [11]:
#!pip install accelerate transformers sentence-transformers faiss-cpu -qq

!mkdir readmes
!wget https://huggingface.co/datasets/pedrogengo/readmes/raw/main/transformer.md -O readmes/transformer.md
!wget https://huggingface.co/datasets/pedrogengo/readmes/raw/main/pymatting.md -O readmes/pymatting.md
!wget https://huggingface.co/datasets/pedrogengo/readmes/raw/main/yolo.md -O readmes/yolo.md

## Imports

In [12]:
import os
import markdown
import faiss
import torch
from bs4 import BeautifulSoup
from huggingface_hub import login
from sentence_transformers import SentenceTransformer

In [13]:
key_path = '/Users/jaesolshin/key/HF_TOKEN.txt'
os.environ["HF_TOKEN"] = open(key_path, 'r', encoding='utf-8').read()
login(os.environ["HF_TOKEN"])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/jaesolshin/.cache/huggingface/token
Login successful


In [14]:
device = torch.device("mps" if torch.backends.mps.is_available else "cpu")

## Splitting Texts

In [15]:
def md_to_text(md):
    html = markdown.markdown(md)
    soup = BeautifulSoup(html, features='html.parser')
    return soup.get_text()

def split_text(document_path, chunk_size, overlap):
  with open(document_path, 'r', encoding='utf-8') as file:
        content = file.read()

  content = md_to_text(content)
  chunks = []
  for i in range(0, len(content), chunk_size):
    chunk = document_path + content[max(0, i-overlap):min(len(content), i+chunk_size)] #pymatting.md content[0:100]. -> pymatting.md content[100:200] ...
    chunks.append(chunk)
  return chunks

In [16]:
readmes_dir = "readmes"
chunk_size = 700
overlap = 100

all_chunks = []
for f in os.listdir("corrected_text"):
  if f[-2:] == "md":
    all_chunks += split_text(os.path.join(readmes_dir, f), chunk_size, overlap)

In [17]:
len(all_chunks)

130

## Creating the embeddings

In [18]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
_ = embedding_model.to(device)
embeddings = embedding_model.encode(all_chunks)
embeddings.shape



(130, 384)

## Indexing embeddings using FAISS

In [19]:
d = embeddings.shape[1] # 384
index = faiss.IndexFlatL2(d)
index.add(embeddings)

## Downloading Gemma 2B instruction tuned

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers

model_id = "google/gemma-2b-it"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="mps",
    torch_dtype=dtype,
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Functions to generate the prompts (RAG and no RAG)

In [21]:
def create_simple_prompt(user_input):
  chat = [
      { "role": "user", "content": user_input },
  ]
  prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
  return prompt


def create_rag_prompt(user_input, embedding_model, docs, db, k=5):

  embedding = embedding_model.encode([user_input])
  _, I = db.search(embedding, k)

  context = "Relevant documents:\n"
  for i in I[0]:
    context += f"Doc {i+1}: {docs[i]}\n"

  chat = [
      { "role": "user", "content": f"Use the documents to answer the user.\n{context}\n{user_input}"},
  ]
  prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
  return prompt

## Running the pipeline

### No RAG

In [22]:
query = "who are the authors of pymatting?"

prompt = create_simple_prompt(query)
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)
print(tokenizer.decode(outputs[0]).split("<start_of_turn>model")[1])


The authors of Pymatting are:

* **David MacDiarmid**
* **Mark P. Allen**
* **David R. Reichman**
* **Daniel R. Reichman**<eos>


In [23]:
print(prompt)

<bos><start_of_turn>user
who are the authors of pymatting?<end_of_turn>
<start_of_turn>model



### Using RAG

In [24]:
query = "who are the authors of pymatting?"

prompt = create_rag_prompt(query, embedding_model, all_chunks, index, k=10)
inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
outputs = model.generate(input_ids=inputs.to(model.device), max_new_tokens=150)
print(tokenizer.decode(outputs[0]).split("<start_of_turn>model")[1])


The authors of pymatting are:

- Thomas Germer
- Tobias Uelwer
- Stefan Conrad
- Stefan Harmeling<eos>


In [25]:
print(prompt)

<bos><start_of_turn>user
Use the documents to answer the user.
Relevant documents:
Doc 127: readmes/pymatting.mdermer
Tobias Uelwer
Stefan Conrad
Stefan Harmeling

See also the list of contributors who participated in this project.
Projects using PyMatting

Rembg - an excellent tool for removing image backgrounds.
PaddleSeg - a library for a wide range of image segmentation tasks.
chaiNNer - a node-based image processing GUI.
LSA-Matting - improving deep image matting via local smoothness assumption.

License
This project is licensed under the MIT License - see the LICENSE.md file for details
Citing
If you found PyMatting to be useful for your work, please consider citing our paper:
@article{Germer2020,
  doi = {10.21105/joss.02481},
  url = {https://doi.org/10.21105/joss.02481},
  year = {2020},
  publisher = {The Open Journal},
  volume = {5},
  number = {54},
  pages = {2481},
  author = {Thomas 
Doc 125: readmes/pymatting.md=0.47.0
* scipy>=1.1.0
Additional requirements for GPU sup