# Semantic search with sentence embedding
Search for the comment of github issues best matching a given query using text including the issue title, issue body text and the comment

## Load data

In [13]:
from datasets import load_dataset, Dataset

In [14]:
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [15]:
issues_dataset = issues_dataset.filter(
    lambda x: ((not x['is_pull_request']) and len(x['comments'])) > 0
)

In [16]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
# same as minus here since set 1 is a subset of set 2
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

## Explode rows

### Use pandas

In [18]:
# df = issues_dataset.to_pandas()

# comments_df = df.explode('comments', ignore_index=True)
# comments_df.head()

# comments_dataset = Dataset.from_pandas(comments_df)
# comments_dataset

### Use .map()

In [17]:
def map_to_explode(examples):
    result = {k: [] for k in examples}
    comments = examples.pop('comments')
    for i, comment_i in enumerate(comments):
        n_rows_to_explode = len(comment_i)
        for k, v in examples.items():
            result[k] += [v[i]] * n_rows_to_explode
        result['comments'] += comment_i
    return result

comments_dataset = issues_dataset.map(map_to_explode, batched=True)
comments_dataset

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

## Create some features

In [19]:
comments_dataset = comments_dataset.map(lambda x: {'comment_length': len(x['comments'].split())})
comments_dataset = comments_dataset.filter(lambda x: x['comment_length'] > 1)
comments_dataset

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2934
})

In [20]:
comments_dataset = comments_dataset.map(
    lambda x: {'text': " \n ".join([x['title'], x['body'], x['comments']])}
)
comments_dataset[0]

Map:   0%|          | 0/2934 [00:00<?, ? examples/s]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': 'Cool, I think we can do both :)',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pull Requests using GitHub, allow only squash merging, so that only a single commit per Pull Request is merged into the master branch\r\n  - Currently, simple merge commits are already disabled\r\n  - I propose to disable rebase merging as well\r\n- ~~Protect the master branch from direct pushes (to avoid accidentally pushing of merge commits)~~\r\n  - ~~This protection would rejec

## Create embeddings

Instructions: https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search

### Preprocessing

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
model_ckpt = 'Alibaba-NLP/gte-multilingual-base'  # "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'
Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification 

In [143]:
# Test the Ali GTE model
if model_ckpt == 'Alibaba-NLP/gte-multilingual-base':
    input_texts = [
        "what is the capital of China?",
        "中国的首都是哪儿?",
        "how to implement quick sort in python?",
        "北京",
        "快排算法介绍",
    ]

    max_length = 8192  # max 8192
    batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt')
    model.to('cpu')
    outputs = model(**batch_dict)

    dimension = 256 # Truncate the output dimension of the output embedding, should be in [128, 768]
    embeddings = outputs.last_hidden_state[:, 0][:, :dimension]  

    embeddings = F.normalize(embeddings, p=2, dim=1)
    scores = (embeddings[:1] @ embeddings[1:].T)  # Matrix mul to check the similarity between the normalized embeddings of the first and the rest
    print(scores.tolist())

[[0.8816621899604797, 0.3257666230201721, 0.7938992977142334, 0.3752664625644684]]


In [52]:
import torch

device = torch.device("cuda")
model.to(device)

NewModel(
  (embeddings): NewEmbeddings(
    (word_embeddings): Embedding(250048, 768, padding_idx=1)
    (rotary_emb): NTKScalingRotaryEmbedding()
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): NewEncoder(
    (layer): ModuleList(
      (0-11): 12 x NewLayer(
        (attention): NewAttention(
          (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (mlp): NewGatedMLP(
          (up_gate_proj): Linear(in_features=768, out_features=6144, bias=False)
          (down_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act_fn): GELUActivation()
          (hidden_dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

#### CLS pooling
Pooling is the process of converting a sequence of embeddings into a sentence embedding is called “pooling”.  
One way is using CLS pooling: to collect the last hidden state for the special [CLS] token  
 - CLS token: Append a special <CLS> token to the start of every sequence. This special token is meant to capture the sequence-level information. 
 - During the training process, some sentence-level classification (like next sewntence prediction) task based on this CLS embedding will tune the CLS token representation via backpropagation.  
  
From [article of pooling methods](https://blog.ml6.eu/the-art-of-pooling-embeddings-c56575114cf8)


In [22]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

### Embedding

In [61]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Put to device
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Test one case
text_input = comments_dataset['text'][0]
embedding = get_embeddings(text_input)
# Detach from the computational graph, copy it to host memory, and then convert to numpy array
embedding = embedding.detach().cpu().numpy()

print(text_input, '\n', embedding.shape)

In [None]:
# Compute everything; if to select a few:  comments_dataset.select(range(10))
embeddings_dataset = comments_dataset.map(
    lambda x: {'embeddings': get_embeddings(x['text']).detach().cpu().numpy()[0]}
)

## FAISS similarity search
Using [FAISS](https://faiss.ai/) for efficient similarity search  
Here it uses method from hugging face which doesn't have good document or support. # TODO: change to generic FAISS function calls


In [54]:
# !pip install faiss-cpu
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 10
})

In [55]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [None]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [58]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: I just merged a fix, let me know if you're still having this kind of issues :)

We'll do a release soon to make this fix available
SCORE: 460.86029052734375
TITLE: Backwards compatibility broken for cached datasets that use `.filter()`
URL: https://github.com/huggingface/datasets/issues/2943

COMMENT: Definitely works on several manual cases with our dummy datasets, thank you @lhoestq !
SCORE: 459.71575927734375
TITLE: Backwards compatibility broken for cached datasets that use `.filter()`
URL: https://github.com/huggingface/datasets/issues/2943

COMMENT: Fixed by #2947.
SCORE: 457.87445068359375
TITLE: Backwards compatibility broken for cached datasets that use `.filter()`
URL: https://github.com/huggingface/datasets/issues/2943

COMMENT: I tried `unshuffled_original_da` and it is also not working
SCORE: 415.90667724609375
TITLE: OSCAR unshuffled_original_ko: NonMatchingSplitsSizesError
URL: https://github.com/huggingface/datasets/issues/2941

COMMENT: Hi @daqieq, thanks for 

# Llama 2
Refer to https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Pipeline

In [None]:
# Not enough memory on my Legion Y9000P. Need to set pagefile to system managed
from transformers import pipeline
checkpoint = 'meta-llama/Llama-2-7b-chat-hf'
generator = pipeline("text-generation", model=checkpoint, device_map='auto')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
prompt = ['How much memory is needed to run Llama2 ']
%time response = generator(prompt, max_new_tokens=50, num_beams=2, do_sample=True, top_k=5, top_p=0.95)

CPU times: total: 3min 4s
Wall time: 8min 26s


[[{'generated_text': "How much memory is needed to run Llama2 \n\nAnswer: Llama2 is a relatively lightweight library, and it doesn't require a lot of memory to run. In fact, Llama2 is designed Limited spin faut pap Fürulen mű efectспе Welcome以 politique domin"}]]

## With config

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(checkpoint)

NameError: name 'checkpoint' is not defined

In [None]:
inputs = tokenizer(raw_inputs, 
                   # padding='longest', truncation=True, max_length=128, 
                   return_tensors="pt").input_ids
outputs = model.generate(inputs, max_new_tokens=20, num_beams=2, do_sample=True, top_k=5, top_p=0.95)

In [None]:
tokenizer.batch_decode(outputs, skip_special_tokens=True)

["Hello. Who are you?\n\nComment: Hello! I'm just an AI designed to assist and communicate with users"]

## Code Llama

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
checkpoint = 'codellama/CodeLlama-7b-Python-hf'
# generator = pipeline("text-generation", model=checkpoint, device_map='auto')


model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    # load_in_8bit=True,
    # torch_dtype=torch.float16,
    offload_folder="./save_folder",  # Need to create this folder anyway
    device_map="auto",
    # device_map={"": 0},  # not enough GPU memory
)

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.add_eos_token = True
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"

prompt = 'Write a piece of Python code to order a list of numbers'
inputs = tokenizer(prompt, 
                   # padding='longest', truncation=True, max_length=128, 
                   return_tensors="pt"
                  )

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=100)[0], skip_special_tokens=True))

In [None]:
prompt = ['Write a piece of Python code to order a list of numbers']
%time generator(prompt, max_new_tokens=50, num_beams=2, do_sample=True, top_k=5, top_p=0.95)