In [21]:
from sentence_transformers import SentenceTransformer , util, CrossEncoder


In [4]:
import json
import gzip
import os
import torch

In [5]:
# Local file name under which the gzipped Simple English Wikipedia dump is cached.
wikipeida_filepath = 'simplewiki-2020-11-01.jsonl.gz'

# Fetch the dump only when it is not on disk yet, so re-running the cell is cheap.
if not os.path.exists(wikipeida_filepath):
  util.http_get(
    'https://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz',
    wikipeida_filepath,
  )

print(wikipeida_filepath)

  0%|          | 0.00/50.2M [00:00<?, ?B/s]

simplewiki-2020-11-01.jsonl.gz


In [9]:
# Show the first line of the dump to see what one record looks like.
with gzip.open(wikipeida_filepath, 'rt', encoding='utf8') as file:
  line = next(file, None)
  if line is not None:
    print(line)

{"id": "9822", "title": "Ted Cassidy", "paragraphs": ["Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on \"The Addams Family\"."]}



In [17]:
# Every line of the dump is one JSON record; only the first entry of its
# 'paragraphs' list is kept as that record's passage.
with gzip.open(wikipeida_filepath, 'rt', encoding='utf8') as file:
  passages = [json.loads(line.strip())['paragraphs'][0] for line in file]

len(passages)

169597

In [18]:
# Keywords that select the topical subset of the corpus.
TOPIC_KEYWORDS = ('india', 'nlp', 'machine learning',
                  'artificial intelligence', 'north pole', 'south pole',
                  'animal', 'cheetah', 'jaguar', 'lion')

# Keep each passage at most once, in corpus order. The previous nested
# comprehension emitted a passage once per matching keyword, so a passage that
# mentioned several keywords was duplicated in the subset.
# NOTE: this is a case-insensitive *substring* test, so short keywords also match
# inside longer words (for example 'lion' matches 'million').
sub_dataset = [passage for passage in passages
               if any(keyword in passage.lower() for keyword in TOPIC_KEYWORDS)]




In [19]:
# Preview a handful of passages plus the subset size; displaying the whole list
# floods the notebook output with every selected passage.
sub_dataset[:5] , len(sub_dataset)

(["The integumentary system is everything covering the outside of an animal's body. This account is written mostly with people in mind, but it applies more widely.",
  'The electric eel, "Electrophorus electricus", is a species of fish. It can weigh up to six pounds. It can give electric shocks of up to six hundred fifty watts of electricity. The animal uses these shocks both for hunting and to defend itself. It lives in the stagnant muddy river bottoms of the Orinoco and the Amazon, and uses low-voltage electric fields to find its prey.',
  'The Blake River Megacaldera Complex is a giant caldera cluster that spans across the Ontario-Quebec border in Canada. It is considered to be a supervolcano because of its great size (3,000\xa0km) and its numerous dikes and vents. The cluster is about 2700 million year old.',
  'The term Far East means countries of East Asia. It became used in the English language during the period of the British Empire for lands to the east of British India.',
  "

Load the bi-encoder and cross-encoder models

In [22]:
# Warn up front when CUDA is unavailable.
if not torch.cuda.is_available():
  print('Warning: no GPU found. Neural search will be slow.')

# The bi-encoder turns the query and the passages into embedding vectors
# (used with `.encode` below); the cross-encoder scores [query, passage] pairs
# (used with `.predict` below) to re-rank the retrieved candidates.
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [23]:
# semantic_search reports every hit as a `corpus_id`, i.e. a row position in this
# embedding matrix, and the later cells (and `search`) resolve that id with
# `passages[corpus_id]`. The embeddings therefore have to be built from `passages`
# itself: encoding a different list makes every lookup return an unrelated passage.
corpus_embeddings = bi_encoder.encode(passages, show_progress_bar=True)

# Guard the row <-> passage alignment that all downstream lookups rely on.
assert len(corpus_embeddings) == len(passages)
corpus_embeddings

array([[-0.10502325, -0.06698408,  0.0065591 , ..., -0.00616391,
         0.01187887, -0.05167008],
       [ 0.0712673 ,  0.03758553, -0.03080118, ..., -0.00168362,
         0.06010826, -0.10081675],
       [ 0.00265437,  0.05274131,  0.0639097 , ...,  0.03401787,
         0.00516281, -0.0555965 ],
       ...,
       [ 0.03996407, -0.00559214, -0.06871913, ...,  0.01224211,
         0.06225295, -0.00726667],
       [ 0.04382559, -0.06522392,  0.10161613, ...,  0.02880343,
         0.0213261 , -0.03132001],
       [-0.08071923, -0.00659341, -0.07617452, ..., -0.01263102,
         0.07712673, -0.01625942]], dtype=float32)

In [24]:
# Expected to be (number of encoded passages, embedding size).
corpus_embeddings.shape

(169597, 384)

Search by embedding similarity

In [25]:
# Example question used to walk through retrieval and re-ranking step by step.
query = "What is the capital of India?"
query

'What is the capital of India?'

In [27]:
# Embed the query with the same bi-encoder that produced `corpus_embeddings`,
# so the two can be compared directly.
query_embedding = bi_encoder.encode(query)
query_embedding

array([ 1.33737773e-01,  7.78846582e-03, -7.45533481e-02,  2.49207648e-03,
       -6.24073409e-02, -4.04104292e-02,  4.16974686e-02, -2.87630446e-02,
        7.81868957e-03,  1.24428151e-02,  2.99324311e-04, -1.13784306e-01,
        5.83445579e-02, -1.51665341e-02,  1.26685910e-02, -7.13799149e-02,
        3.16478983e-02, -3.98315564e-02,  6.40153885e-02, -4.27123904e-02,
        1.77848041e-02,  4.58978191e-02,  3.55000421e-02, -4.13472503e-02,
        6.97285682e-02,  7.42683141e-03,  7.50504732e-02, -7.39466995e-02,
       -3.40940170e-02, -2.02158540e-02,  3.14713307e-02, -9.29417834e-02,
        1.01784114e-02,  7.96921626e-02, -4.39619124e-02,  3.35121304e-02,
       -2.19937717e-03,  8.76956061e-02,  1.71108305e-01, -1.03098318e-01,
        7.75600225e-03,  4.10990673e-04,  1.09546026e-02,  2.07161196e-02,
       -1.01255821e-02,  4.14889567e-02, -1.23651065e-02,  2.35349070e-02,
       -2.53974833e-02, -2.56090574e-02,  7.52400905e-02, -2.78915223e-02,
       -6.31790310e-02,  

In [31]:
# Cosine similarity of the query against every corpus row. The helper returns a
# matrix with one row per query, so the single query's scores are row 0.
similarity_matrix = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
cos_score = similarity_matrix[0]
cos_score

tensor([-0.1522,  0.0208,  0.1194,  ...,  0.1008, -0.0578,  0.0650])

In [32]:
# Positions of the five highest similarity scores.
top_result = cos_score.topk(k=5)
idx = top_result.indices
idx

tensor([  3698,  22288, 134500,  16458,  41143])

In [33]:
# Text of the best-scoring hit. `idx` holds row positions of `corpus_embeddings`,
# so this lookup is only meaningful when those embeddings were built from `passages`.
passages[idx[0]]

"Mumbai (previously known as Bombay until 1996) is a natural harbor on the west coast of India, and is the capital city of Maharashtra state. It is India's largest city, and one of the world's most populous cities. It is the financial capital of India. The city is the second most-populous in the world. It has approximately 13 million people. Along with the neighboring cities of Navi Mumbai and Thane, it forms the world's 4th largest urban agglomeration. They have around 19.1 million people."

In [35]:
# semantic_search returns one list of hits per query; with a single query the
# hits of interest are the first entry.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k =1)
hits[0]

[{'corpus_id': 3698, 'score': 0.5979241728782654}]

In [41]:
# Retrieve the 20 closest passages for the (single) query.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=20)[0]

# Pair the query with each candidate passage: the input format the
# cross-encoder consumes for re-ranking.
ranker_imp = []
for hit in hits:
  ranker_imp.append([query, passages[hit['corpus_id']]])

ranker_imp[:3]

[['What is the capital of India?',
  "Mumbai (previously known as Bombay until 1996) is a natural harbor on the west coast of India, and is the capital city of Maharashtra state. It is India's largest city, and one of the world's most populous cities. It is the financial capital of India. The city is the second most-populous in the world. It has approximately 13 million people. Along with the neighboring cities of Navi Mumbai and Thane, it forms the world's 4th largest urban agglomeration. They have around 19.1 million people."],
 ['What is the capital of India?',
  "Kolkata (spelled Calcutta before 1 January 2001) is the capital city of the Indian state of West Bengal. It is the second largest city in India after Mumbai. It is on the east bank of the River Hooghly. When it is called Calcutta, it includes the suburbs. This makes it the third largest city of India. This also makes it the world's 8th largest metropolitan area as defined by the United Nations. Kolkata served as the capita

In [48]:
# Score every [query, passage] pair with the cross-encoder; the result holds one
# score per pair, in the same order as `ranker_imp`.
reranker_score = cross_encoder.predict(ranker_imp)
reranker_score[:3]

array([3.861031 , 3.4595218, 2.7084024], dtype=float32)

In [49]:
# All cross-encoder scores, still in the order the candidates were retrieved.
reranker_score

array([ 3.861031 ,  3.4595218,  2.7084024,  5.335995 ,  3.517104 ,
        7.0779753, -0.3657567, -7.237134 ,  2.7260745,  1.5509168,
        1.7213131, -0.8427042,  3.784327 , -1.1820912,  2.3958304,
       -0.5995951,  1.5332087,  2.4464629, -5.2075667,  3.1862845],
      dtype=float32)

In [50]:
# Attach each cross-encoder score to its hit under the 'reranker_scores' key,
# which the sorting cell below reads. Pairing with zip avoids using `id` as a
# loop variable: at notebook scope that rebinds the builtin `id` for every
# later cell.
for hit, score in zip(hits, reranker_score):
  hit['reranker_scores'] = score

hits[:3]

[{'corpus_id': 3698,
  'score': 0.5979241728782654,
  'reranker_score': {...},
  'reranker_scores': 3.861031},
 {'corpus_id': 22288,
  'score': 0.5937108993530273,
  'reranker_score': {...},
  'reranker_scores': 3.4595218},
 {'corpus_id': 134500,
  'score': 0.5878057479858398,
  'reranker_score': {...},
  'reranker_scores': 2.7084024}]

In [51]:
# Best candidate according to the bi-encoder's similarity score.
print("Top Bi-Encoder Retrieval hit : ")
ranked_by_bi_encoder = sorted(hits, key=lambda h: h['score'], reverse=True)
hit = ranked_by_bi_encoder[0]
print(passages[hit['corpus_id']])


# Best candidate after re-ranking with the cross-encoder score.
print("Top Reranker Retrieval hits :")
ranked_by_reranker = sorted(hits, key=lambda h: h['reranker_scores'], reverse=True)
hit = ranked_by_reranker[0]
print(passages[hit['corpus_id']])

Top Bi-Encoder Retrieval hit : 
Mumbai (previously known as Bombay until 1996) is a natural harbor on the west coast of India, and is the capital city of Maharashtra state. It is India's largest city, and one of the world's most populous cities. It is the financial capital of India. The city is the second most-populous in the world. It has approximately 13 million people. Along with the neighboring cities of Navi Mumbai and Thane, it forms the world's 4th largest urban agglomeration. They have around 19.1 million people.
Top Reranker Retrieval hits :
New Delhi () is the capital of India and a union territory of the megacity of Delhi. It has a very old history and is home to several monuments where the city is expensive to live in. In traditional Indian geography it falls under the North Indian zone. The city has an area of about 42.7 km. New Delhi has a population of about 9.4 Million people.


In [65]:
def search(query, top_k=30):
  """Print the passage that best matches `query` (retrieve, then re-rank).

  The query is embedded with the notebook-global `bi_encoder`, the `top_k`
  nearest rows of the global `corpus_embeddings` are retrieved, every candidate
  is scored against the query by the global `cross_encoder`, and the passage
  with the highest cross-encoder score is printed.

  Relies on `corpus_embeddings` being row-aligned with the global `passages`
  list, because each hit's 'corpus_id' is used as an index into `passages`.

  Args:
    query: The question or search text.
    top_k: How many bi-encoder candidates to hand to the cross-encoder.

  Returns:
    None. The result is printed.
  """
  query_embedding = bi_encoder.encode(query)
  hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)[0]

  # Nothing retrieved: report it instead of failing with an IndexError when
  # picking the best hit from an empty list.
  if not hits:
    print("No matching passage found.")
    return

  candidate_pairs = [[query, passages[hit['corpus_id']]] for hit in hits]
  reranker_score = cross_encoder.predict(candidate_pairs)

  # Position of the highest cross-encoder score; on ties the earliest candidate
  # wins, matching a stable descending sort.
  best_position = max(range(len(hits)), key=lambda position: reranker_score[position])
  print(passages[hits[best_position]['corpus_id']])

In [66]:
# Definition-style question about a technical topic.
search(query="What is natural language processing")

Natural Language Processing (NLP) is a field in Artificial Intelligence, and is also related to linguistics. On a high level, the goal of NLP is to program computers to automatically understand human languages, and also to automatically write/speak in human languages. We say "Natural Language" to mean human language, and to indicate that we are not talking about computer (programming) languages.


In [67]:
# Shorter, broader variant of the previous question.
search(query="What is language")

Language is the normal way humans communicate. Only humans use language, though other animals communicate through other means.


In [68]:
# Descriptive question that does not name the thing being asked about.
search(query="What is the animal which can run very fast?")

A cheetah ("Acinonyx jubatus") is a medium large cat which lives in Africa. It is the fastest land animal and can run up to 112 kilometers per hour for a short time. Most cheetahs live in the savannas of Africa. There are a few in Asia. Cheetahs are active during the day, and hunt in the early morning or late evening.


In [69]:
# Opinion question: `search` only prints the top-scoring passage, it does not
# compare or judge the people named in the query.
search(query="which leader is good - Rahul Gandhi or Narendra Modi?")


Narendra Damodardas Modi (born 17 September 1950) is an Indian politician. He is the current Prime Minister of India serving since 2014. He was the 14th Chief Minister of the state of Gujarat. Modi was elected Prime Minister of India in May 2014. He is a member of Bharatiya Janata Party.


In [70]:
# Loosely phrased question; `search` prints the single best-scoring passage.
search(query="what skillset required to become data scientist?")


Data science is the study of the extraction of knowledge from data. It uses various techniques from many fields, including signal processing, mathematics, probability, machine learning, computer programming, statistics, data engineering, pattern matching, and data visualization, with the goal of extracting useful knowledge from the data. With computer systems able to handle more data, big data is an important aspect of data science.
