**Doc2Vec and distilBERT**

In [None]:
#Using the GPU
import torch
# Pick the compute device once: CUDA when PyTorch reports one, otherwise CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

# Report which device was selected.
if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
import io
from google.colab import files 
import pandas as pd
# Ask for the case-document CSV via the Colab upload helper and parse it
# directly from the uploaded bytes (looked up under the name 'case_docs.csv').
uploaded = files.upload()
case_docs_buffer = io.BytesIO(uploaded['case_docs.csv'])
df = pd.read_csv(case_docs_buffer)

# Plain list of the document texts, one entry per row of the `Content` column.
documents = [content for content in df['Content']]
print(df['Content'])

Saving case_docs.csv to case_docs (1).csv
0       Masud Khan v State Of Uttar Pradesh\r\nSupreme...
1       Indian Oil Corporation v NEPC India Limited an...
2       Gurpal Singh v State of Punjab and Others\r\nS...
3       Budh Singh and Others v State of Uttar Pradesh...
4       Anil Kumar v State of U.P.\r\nSupreme Court of...
                              ...                        
2909    Sukhram v State Of Maharashtra\r\nSupreme Cour...
2910    Satvir v State Of Uttar Pradesh\r\nSupreme Cou...
2911    Dharangadhara Chemical Works Limited v State o...
2912    Central Bank of India v M. Sethumadhavan and o...
2913    Som Raj @ Soma v State Of H.P.\r\nSupreme Cour...
Name: Content, Length: 2914, dtype: object


In [None]:

#Query doc
# Upload the query CSV (looked up under the name 'embedded_queries.csv') and
# keep both the full frame and a plain list of the query texts.
uploaded2 = files.upload()
query_buffer = io.BytesIO(uploaded2['embedded_queries.csv'])
qdf = pd.read_csv(query_buffer)
queries = [query_text for query_text in qdf['query']]
print(qdf)

Saving embedded_queries.csv to embedded_queries.csv
    Unnamed: 0  ...                                           relevant
0            0  ...                                            [9, 14]
1            1  ...                                           [27, 22]
2            2  ...                                                [1]
3            3  ...                                              [182]
4            4  ...                            [36, 144, 54, 121, 155]
5            5  ...                                  [152, 99, 26, 19]
6            6  ...                                              [130]
7            7  ...                                      [32, 60, 125]
8            8  ...                                           [42, 90]
9            9  ...                                     [185, 180, 86]
10          10  ...                                         [131, 132]
11          11  ...                                                [8]
12          12  ...      

 **distilBERT**

In [None]:
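#Clean the fetched documents and queries: lowercase, tokenize, then drop stopwords, punctuation and pure-digit tokens
#The token lists are used for Doc2Vec; the re-joined text is used for the distilBERT embeddings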

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import string
import pandas as pd
import json
_UNNECESSARY_TOKENS = None  # filled on first use by _get_unnecessary_tokens()

def _get_unnecessary_tokens():
  """Return the cached set of English stopwords plus ASCII punctuation characters."""
  global _UNNECESSARY_TOKENS
  if _UNNECESSARY_TOKENS is None:
    # Build the set once instead of re-reading the stopword list for every
    # document and query that gets cleaned.
    _UNNECESSARY_TOKENS = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
  return _UNNECESSARY_TOKENS

def get_cleaned_doc(doc):
  """Tokenize `doc` and drop tokens that carry no content.

  The text is lowercased and split with `word_tokenize`; English stopwords,
  single punctuation characters and tokens made up only of digits are
  removed. The order of the remaining tokens is preserved.

  Note that only pure-digit tokens are dropped: a token such as '1962.' or
  '9-4-1956' is kept because `str.isdigit()` is False for it.

  Args:
    doc: the raw document (or query) text as a string.

  Returns:
    A list of the remaining lowercase tokens, in their original order.
  """
  unnecessary = _get_unnecessary_tokens()
  return [
    token for token in word_tokenize(doc.lower())
    if token not in unnecessary and not token.isdigit()
  ]

# Always clean from the raw data-frame columns rather than from `documents` /
# `queries`: those names are rebound below to cleaned values, so reading them
# here would make a second run of this cell fail (token lists have no .lower())
# or clean already-cleaned text.
tokenized_documents=[get_cleaned_doc(d) for d in df['Content']]
# Re-joined cleaned text, one string per document.
documents=[" ".join(d) for d in tokenized_documents]
# Each query becomes a list of cleaned tokens.
queries=[get_cleaned_doc(q) for q in qdf['query']]

# Preview only the first tokens of the first document; printing the whole
# token list floods the cell output.
print(tokenized_documents[0][:50])


['masud', 'khan', 'v', 'state', 'uttar', 'pradesh', 'supreme', 'court', 'india', 'september', 'writ', 'petition', 'judgment', 'delivered', 'a.', 'alagiriswami', 'j', '1.', 'petitioner', 'masud', 'khan', 'prays', 'release', 'ground', 'indian', 'citizen', 'illegally', 'arrested', 'confined', 'jail', 'paragraph', 'foreigners', 'internment', 'order', '1962.', 'come', 'india', 'pakistan', 'basis', 'pakistani', 'passport', 'dated', '137-1954and', 'indian', 'visa', 'dated', '9-4-1956.', 'application', 'visa', 'stated', 'migrated', 'pakistan', 'government', 'service', 'pakistan', 'p.w.d', 'darogha', 'given', 'permanent', 'address', 'hyderabad', 'sind', '2.', 'statements', 'correct', 'petitioner', 'would', 'clearly', 'pakistani', 'national', 'fact', 'brought', 'counter', 'affidavit', 'filled', 'behalf', 'respondent', 'petitioner', 'filed', 'affidavit', 'stating', 'appointed', 'police', 'constable', 'hasanganj', 'police', 'station', 'district', 'fatehpur', 'u.p', 'february', 'continued', 'police

In [None]:
# Install sentence-transformers, which provides the SentenceTransformer encoder imported in the next cell.
# NOTE(review): -U installs whatever release is current; the recorded run resolved
# sentence-transformers 0.3.9 with transformers 3.5.1 — consider pinning for reproducibility.
!pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f5/5a/6e41e8383913dd2ba923cdcd02be2e03911595f4d2f9de559ecbed80d2d3/sentence-transformers-0.3.9.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.4MB/s 
[?25hCollecting transformers<3.6.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 7.6MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 43.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K  

In [None]:
#Get doc embeddings using sentence transformers (DistilBERT)
from sentence_transformers import SentenceTransformer
import numpy as np
# Load the pretrained sentence encoder and embed every document string.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embeddings = model.encode(documents)
print(doc_embeddings.shape)

# Store the embeddings as a NumPy array on disk, then trigger a Colab download
# of the saved file.
doc_embeddings = np.array(doc_embeddings)
print(doc_embeddings.shape)
doc_embeddings_file = 'case_docs_embeddings.npy'
np.save(doc_embeddings_file, doc_embeddings)
files.download(doc_embeddings_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Get query embeddings using sentence transformers

# Encode the query strings exactly as they appear in the query frame.
# NOTE(review): this rebinds `queries` to raw strings, replacing the token
# lists produced by the cleaning cell.
queries = [query_text for query_text in qdf["query"]]
query_embeddings = model.encode(queries)
print(query_embeddings.shape)

# Save the embeddings to disk and trigger a Colab download of the file.
query_embeddings = np.array(query_embeddings)
query_embeddings_file = 'query_embeddings.npy'
np.save(query_embeddings_file, query_embeddings)
files.download(query_embeddings_file)




(50, 768)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Doc2Vec**

In [None]:

k=25 #number of rel docs to pick!
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tag every tokenized document with its position in the corpus, so the ids
# returned by most_similar() are corpus row indices.
tagged_docs = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_documents)]

model = Doc2Vec(tagged_docs, vector_size = 100, window = 2, min_count = 1, epochs = 100,workers=4)

# gensim 4 exposes the document vectors as `dv`; older releases use `docvecs`.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

# infer_vector() expects a list of tokens. `queries` may hold raw strings at
# this point (the query-embedding cell rebinds it), and a raw string would be
# consumed character by character, so tokenize any string with the same
# cleaning used for the documents.
tokenized_queries = [get_cleaned_doc(q) if isinstance(q, str) else list(q) for q in queries]

query_results=[]
for i,query_tokens in enumerate(tokenized_queries):
  query_vector = model.infer_vector(query_tokens)
  top_k=doc_vectors.most_similar(positive = [query_vector],topn=k)
  top_k_docs=[d for d,cs in top_k]

  # The `relevant` column stores the gold document ids as a JSON list.
  given=json.loads(qdf.loc[i,"relevant"])
  matches=sum(1 for ele in top_k_docs if ele in given)
  precision_at_k=round(matches/k,2)
  # A query with no listed relevant documents gets recall 0 instead of a ZeroDivisionError.
  recall_at_k=round(matches/len(given),2) if given else 0.0
  query_results.append({"index":"AILA_Q"+str(i+1),"top_k":top_k_docs,"relevant":given,"precision@k":precision_at_k,"recall@k":recall_at_k})


ans_df=pd.DataFrame(query_results)
print(ans_df)

# Average over the queries actually evaluated rather than a hard-coded count.
Q=len(query_results)
if Q:
  ap=ans_df["precision@k"].sum()/Q
  ar=ans_df["recall@k"].sum()/Q
else:
  ap=ar=0.0
print("The Avg. precision@k for all queries— ",ap)
print("The Avg. recall@k for all queries— ",ar)
# Harmonic mean of the two averages; defined as 0 when both are 0.
f_score=(2*ap*ar)/ (ap+ar) if (ap+ar) else 0.0
print("The F-score is ",f_score)



       index  ... recall@k
0    AILA_Q1  ...     0.00
1    AILA_Q2  ...     0.00
2    AILA_Q3  ...     0.00
3    AILA_Q4  ...     0.00
4    AILA_Q5  ...     0.00
5    AILA_Q6  ...     0.00
6    AILA_Q7  ...     0.00
7    AILA_Q8  ...     0.00
8    AILA_Q9  ...     0.00
9   AILA_Q10  ...     0.00
10  AILA_Q11  ...     0.00
11  AILA_Q12  ...     0.00
12  AILA_Q13  ...     0.00
13  AILA_Q14  ...     0.00
14  AILA_Q15  ...     0.00
15  AILA_Q16  ...     0.00
16  AILA_Q17  ...     0.00
17  AILA_Q18  ...     0.00
18  AILA_Q19  ...     0.00
19  AILA_Q20  ...     0.00
20  AILA_Q21  ...     0.00
21  AILA_Q22  ...     0.00
22  AILA_Q23  ...     0.00
23  AILA_Q24  ...     0.00
24  AILA_Q25  ...     0.00
25  AILA_Q26  ...     0.12
26  AILA_Q27  ...     0.00
27  AILA_Q28  ...     0.00
28  AILA_Q29  ...     0.00
29  AILA_Q30  ...     0.00
30  AILA_Q31  ...     0.00
31  AILA_Q32  ...     0.00
32  AILA_Q33  ...     0.00
33  AILA_Q34  ...     0.00
34  AILA_Q35  ...     0.00
35  AILA_Q36  ...     0.00
3