# NER-Powered Semantic Search

Combine named-entity recognition (NER) with semantic search to improve the relevance of the search results.

### Set Up Pinecone

In [1]:
from pinecone import Pinecone
import os

# Single source of truth for the index name used by the existence check,
# the creation call and the handle below.
INDEX_NAME = "medium-data"

# The API key is read from the environment so it never appears in the notebook.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# list_indexes() yields index description objects rather than plain strings,
# so a bare `name in pc.list_indexes()` test never matches; compare against
# the list of names instead. This keeps the cell safe to re-run once the
# index already exists.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        INDEX_NAME,
        # NOTE(review): presumably the embedding width of the sentence encoder
        # used for retrieval later in the notebook - confirm before changing it.
        dimension=768,
        spec={"serverless": {"cloud": "aws", "region": "us-east-1"}},
    )

index = pc.Index(INDEX_NAME)

### Set Up NER

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch

# Checkpoint used for both the tokenizer and the token-classification model.
model_id = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

# Resolve the compute device once, before any pipeline is built. Both
# pipelines below wrap the *same* model object, so they must agree on the
# device; otherwise the pipeline created last relocates the shared model and
# the earlier one is left pointing at a device the model is no longer on.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    # Apple-silicon GPU backend.
    device = "mps"
else:
    device = "cpu"

# Raw pipeline: one prediction per token, no aggregation.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)

# Aggregated pipeline: adjacent tokens are merged into whole entities
# (aggregation_strategy="max"), yielding dicts with an "entity_group" and a
# "word" key.
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device, aggregation_strategy="max")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0
Device set to use cpu


In [3]:
# Sanity check: run the aggregated NER pipeline on a sample sentence.
nlp("Bill Gates is a software engineer and founder of Microsoft")

[{'entity_group': 'PER',
  'score': 0.999742,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9983804,
  'word': 'Microsoft',
  'start': 49,
  'end': 58}]

In [4]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Sentence encoder used to embed the article text.
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

# Read the articles and discard the "Unnamed: 0" column in a single chained
# expression.
df = pd.read_csv("medium_articles_10k.csv").drop(columns=["Unnamed: 0"])

df.head()


Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


### Prepare the Dataframe

In [5]:
# Discard rows that contain any missing value, then append the text that is
# later embedded: the title followed by the first 1000 characters of the body.
df = df.dropna().assign(
    text_extended=lambda frame: frame["title"] + " " + frame["text"].str[:1000]
)

df.head()


Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",Mental Note Vol. 24 Photo by Josh Riemer on Un...
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",Your Brain On Coronavirus Your Brain On Corona...
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",Mind Your Nose Mind Your Nose\n\nHow smell tra...
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P...",The 4 Purposes of Dreams Passionate about the ...
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",Surviving a Rod Through the Head You’ve heard ...


In [11]:
# Small sample batch: the extended text of the first 10 rows as a plain list.
df_batch = df["text_extended"].head(10).to_list()

# Collect the entity strings for each document
def extract_entities(doc_list):
    """Return the entity strings found in every document of ``doc_list``.

    Each document is passed to ``nlp`` on its own, and the ``"word"`` value of
    every item it returns is kept.

    Args:
        doc_list: Iterable of documents to run through ``nlp``.

    Returns:
        A list with one inner list of words per document, in input order.
        Repeated words are kept as returned by ``nlp``.
    """
    return [[hit["word"] for hit in nlp(text)] for text in doc_list]

# Show the entities extracted from each document of the sample batch.
extract_entities(df_batch)

[['Josh Riemer', 'Unsplash Merry Christmas', 'Holidays'],
 ['Coronavirus', 'Coronavirus', 'ACE2', 'ACE2'],
 ['Ann', 'Sophie Barwich'],
 ['of Dreams'],
 ['Rod Through',
  'Head',
  'Phineas Gage',
  'Cherry',
  'Phineas Gage',
  'Phineas Gage',
  'Vermont',
  'Cherry',
  'Phineas Gage'],
 ['Canva Pro “ Young',
  'COVID',
  '19',
  'Holman',
  'Science Advances 2020',
  'COVID',
  'COVID',
  '19',
  'Holman'],
 ['Popular Blog Series',
  'Popular Blog',
  'John Schnobrich',
  'Unsplash',
  'Medium'],
 ['Faisal Dar',
  'Pioneer',
  'Liver',
  'Transplantation',
  'Pakistan',
  'Faisal Dar',
  'Pioneer',
  'Liver',
  'Transplantation',
  'Pakistan',
  'Fasial Dar',
  'Pakistan',
  'Shifa International Hospital',
  'Islamabad',
  'Pakistan',
  'Fatima Arif',
  'Faisal Dar',
  'Faisalabad',
  'Kotla Bhalot',
  'Kharian',
  'District Gujrat',
  'Kharian Cantt',
  'MBBS',
  'Allama Iqbal Medical',
  'College',
  'Lahore',
  'College of Physicians & Surgeons',
  'Pakistan',
  'Ireland',
  'Royal

### Batch Processing

In [None]:
# Embed every document of the sample batch with the sentence encoder.
df_batch_vectors = retriever.encode(df_batch)
# Display the resulting embeddings.
df_batch_vectors

array([[ 0.01077694,  0.08219115, -0.00246149, ...,  0.00261581,
        -0.03726843, -0.02089519],
       [-0.00094947, -0.03033547,  0.01269015, ..., -0.00528725,
        -0.01578944, -0.03104796],
       [ 0.01388078,  0.02739986, -0.01316181, ...,  0.00952041,
        -0.00202966, -0.01931475],
       ...,
       [ 0.00156106,  0.03738706,  0.00391071, ...,  0.04735938,
         0.02094811,  0.00863307],
       [ 0.05167719, -0.02599909, -0.01801309, ..., -0.03292792,
        -0.05905078, -0.04403665],
       [-0.02391353,  0.02338496, -0.03395603, ...,  0.0610911 ,
        -0.01906551, -0.07216242]], dtype=float32)