# NER-Powered Semantic Search

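Combine named entity recognition (NER) with semantic search to improve retrieval results: the entities extracted from each article are stored as metadata, and at query time the entities found in the query are used to filter the vector-search matches.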

### Set Up Pinecone

In [1]:
from pinecone import Pinecone
import os

# Single source of truth for the index name used throughout this cell.
INDEX_NAME = "medium-data"

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Start from a clean index. Deleting an index that does not exist raises,
# which would break a first run (or a Restart & Run All after a manual
# delete), so only delete when the index is actually listed.
if INDEX_NAME in pc.list_indexes().names():
    pc.delete_index(INDEX_NAME)

# NOTE(review): dimension=768 presumably matches the embedding size of the
# sentence-transformer loaded later in the notebook — confirm if the model changes.
pc.create_index(
    INDEX_NAME,
    dimension=768,
    spec={"serverless": {"cloud": "aws", "region": "us-east-1"}},
)

index = pc.Index(INDEX_NAME)

### Set Up NER

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from sentence_transformers import SentenceTransformer
import torch

# Hugging Face checkpoint used for both the tokenizer and the token-classification model.
model_id = "dslim/bert-base-NER"

# Use the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# Plain NER pipeline with the library's default device and aggregation settings.
# NOTE(review): no later cell visible here references `ner_pipeline` — confirm it is still needed.
ner_pipeline = pipeline(task="ner", tokenizer=tokenizer, model=model)

# NER pipeline actually used by the rest of the notebook: runs on `device`
# and uses the "max" aggregation strategy.
nlp = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="max",
    device=device,
)

# Sentence-embedding model used to encode both the articles and the queries.
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0
Device set to use cpu


In [3]:
# Smoke-test the aggregated NER pipeline on a single sample sentence;
# the cell output lists the entities it detects.
nlp("Bill Gates is a software engineer and founder of Microsoft")

[{'entity_group': 'PER',
  'score': 0.999742,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9983804,
  'word': 'Microsoft',
  'start': 49,
  'end': 58}]

### Prepare the Dataframe

In [4]:
from datasets import load_dataset

# Load the "train" split of the Medium articles CSV from the Hugging Face Hub.
articles = load_dataset(
    "fabiochiu/medium-articles",
    split="train",
    data_files="medium_articles.csv",
)

# Drop rows with missing values, take a reproducible 10,000-row sample, and
# add `text_extended` (title + space + body), the text that is later
# embedded and scanned for entities.
df = (
    articles.to_pandas()
    .dropna()
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
    .assign(text_extended=lambda frame: frame["title"] + " " + frame["text"].astype(str))
)

df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
0,Konsep Perdagangan Adil (Fair Trade),"Sumber:\n\nJournal\n\nTaylor, Jason E, and Boa...",https://medium.com/hipotesa-indonesia/konsep-p...,['Kim Litelnoni'],2019-06-16 01:17:44.009000+00:00,"['Trade', 'Fair Trade', 'International Relatio...",Konsep Perdagangan Adil (Fair Trade) Sumber:\n...
1,Palantir Apollo: Powering SaaS where no SaaS h...,"At Palantir, our approach to software has unde...",https://blog.palantir.com/palantir-apollo-powe...,[],2020-10-08 13:30:34.138000+00:00,"['Palantirtech', 'Continuous Delivery', 'Palan...",Palantir Apollo: Powering SaaS where no SaaS h...
2,ZEROBANK announces the most feasible ICO proje...,"June 8th, 2018, Singapore — ZeroBank, the inno...",https://medium.com/zerobank-cash/zerobank-the-...,['Zerobank - Your Local Currency'],2018-07-17 03:23:38.526000+00:00,"['Sharingeconomy', 'Bitcoin', 'Blockchain', 'Z...",ZEROBANK announces the most feasible ICO proje...
3,7 Reasons Your Pitch Got Rejected,7 Reasons Your Pitch Got Rejected\n\nCommon pi...,https://medium.com/the-lucky-freelancer/7-reas...,['Alicia Wilcox'],2020-09-10 00:51:31.372000+00:00,"['Writing', 'Pitch', 'Freelance', 'Freelance W...",7 Reasons Your Pitch Got Rejected 7 Reasons Yo...
4,Why Money Mindset Is Important For Writers,"Writing, Writer, Money Mindset, Abundance\n\nI...",https://medium.com/books-and-midlife-adventure...,['Christie Adams - Writer'],2021-03-25 19:57:25.881000+00:00,"['Writers Life', 'Writers On Medium', 'Money M...",Why Money Mindset Is Important For Writers Wri...


### Batch Processing

In [5]:
def extract_entities(doc_list):
    """Run the module-level `nlp` NER pipeline over each document.

    Args:
        doc_list: iterable of text documents.

    Returns:
        A list with one entry per document, in input order; each entry is the
        list of "word" values from the entities `nlp` reported for that document.
    """
    return [[hit["word"] for hit in nlp(text)] for text in doc_list]

# Number of articles embedded, tagged and upserted per request.
batch_size = 10

for i in range(0, len(df), batch_size):
    i_end = min(i + batch_size, len(df))

    # Work on a copy so the added/dropped columns never touch `df`.
    df_batch = df.iloc[i:i_end].copy()

    # Build the text list once; it feeds both the encoder and the NER pass.
    texts = df_batch["text_extended"].astype(str).tolist()
    embeddings = retriever.encode(texts).tolist()
    entities = extract_entities(texts)

    # Deduplicate the entity mentions found in each document.
    df_batch["named_entities"] = [list(set(entity)) for entity in entities]

    # Everything left after this drop is stored as vector metadata.
    df_batch = df_batch.drop(columns=["text", "url", "timestamp", "text_extended"])
    metadata = df_batch.to_dict(orient="records")

    # Vector ids are the row positions in `df`, as strings.
    ids = [str(idx) for idx in range(i, i_end)]

    vectors_to_upsert = list(zip(ids, embeddings, metadata))
    index.upsert(vectors=vectors_to_upsert)

# Query the index stats once, after all batches: inside the loop the result
# was discarded and cost one extra API call per batch. As the cell's final
# expression it is now displayed.
index.describe_index_stats()

ProtocolError: Failed to connect; did you specify the correct index name?

### Query Information using NER

In [14]:
query = "How to learn Bitcoin?"

# Embed the query and extract its named entities with the same models used at indexing time.
query_embedding = retriever.encode(query).tolist()
query_entities = extract_entities([query])[0]

# Restrict matches to records sharing at least one entity with the query.
# When the query contains no entities, an empty `$in` list cannot match any
# record, so fall back to an unfiltered semantic search instead.
query_filter = {"named_entities": {"$in": query_entities}} if query_entities else None

results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
    filter=query_filter,
)

results

{'matches': [{'id': '52',
              'metadata': {'authors': "['Jiwa Ragaku']",
                           'named_entities': ['Ethereum',
                                              'Way of',
                                              'Litecoin',
                                              'SHA',
                                              '256',
                                              'Bitcoin'],
                           'tags': "['Bitcoin Mining', 'Sha 256']",
                           'title': 'SHA-256 is A Personal Way of Mining'},
              'score': 0.331458032,
              'values': []},
             {'id': '993',
              'metadata': {'authors': "['Bitxmi Pte Ltd']",
                           'named_entities': ['Kataryna',
                                              'Bitxmi',
                                              'CME',
                                              'Exchange',
                                              'Chicago Board

In [15]:
# Print one line per match: similarity score followed by the article title.
for result in results["matches"]:
    # Single quotes inside the f-string: reusing the enclosing quote character
    # only parses on Python 3.12+ and is a SyntaxError on earlier versions.
    print(f"{result['score']} - {result['metadata']['title']}")

0.331458032 - SHA-256 is A Personal Way of Mining
0.232855231 - Bitcoin rises in price and broke through $ 23 thousand.
0.182112113 - Blockchain: Is it More than Hype?
0.176162601 - Why Blockchain is Not the Answer
0.175867498 - Bitcoin Price Slump Ahead, Says Analyst
