In [None]:
from google.colab import files 
from collections import defaultdict,Counter
from nltk.stem.snowball import SnowballStemmer
import re
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans
import json
import io
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
# Fetch the NLTK resources that the tokenizer, stop-word list and WordNet
# lemmatizer/synonym lookup rely on.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

stemmer = SnowballStemmer("english")

# Upload the case-document corpus through the Colab file picker and load it
# into a DataFrame; the text of each case lives in the 'Content' column.
uploaded = files.upload()
case_docs_bytes = uploaded['case_docs.csv']
df = pd.read_csv(io.BytesIO(case_docs_bytes))
print(df['Content'])



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Saving case_docs.csv to case_docs (1).csv
0       Masud Khan v State Of Uttar Pradesh\r\nSupreme...
1       Indian Oil Corporation v NEPC India Limited an...
2       Gurpal Singh v State of Punjab and Others\r\nS...
3       Budh Singh and Others v State of Uttar Pradesh...
4       Anil Kumar v State of U.P.\r\nSupreme Court of...
                              ...                        
2909    Sukhram v State Of Maharashtra\r\nSupreme Cour...
2910    Satvir v State Of Uttar Pradesh\r\nSupreme Cou...
2911    Dharangadhara Chemical Works Limited v State o...
2912    Central Bank of India v M. Sethumadhavan and o...
2913    Som Raj @ Soma v State Of H.P.\r\nSupreme Cour...
Name: Content, Length: 2914, dtype: object


In [None]:
#Some preprocessing and improving the quality of words in query/ doc


#Text Preprocessing!
def give_me_synonyms(word):
  """Return the distinct WordNet lemma names found in every synset of `word`.

  The names are collected into a set to drop duplicates and then returned as
  a sorted list. Sorting matters: the iteration order of a set of strings can
  change between interpreter runs, which would make any caller that truncates
  the list (see the `limit` handling in expand_query) non-reproducible.

  Returns an empty list when WordNet has no synset for `word`.
  """
  # NOTE(review): lemma names are returned exactly as WordNet reports them;
  # multi-word entries may contain underscores -- confirm how the downstream
  # vectorizer tokenises those.
  synonyms = {
      lemma.name()
      for synset in wordnet.synsets(word)
      for lemma in synset.lemmas()
  }
  return sorted(synonyms)

def expand_query(query,limit=100000):
  """Expand a query with WordNet synonyms of each of its content words.

  The query is first cleaned with remove_stop_words_and_lemmatize. Every
  resulting word is kept in the output, immediately followed by its synonyms
  from give_me_synonyms, until `limit` synonyms have been added in total.

  Args:
    query: raw query text.
    limit: maximum number of synonyms (not counting the original words) to
      add across the whole query.

  Returns:
    A list of tokens: the original cleaned words, in order, interleaved with
    at most `limit` synonyms.
  """
  query_processed=remove_stop_words_and_lemmatize(query)
  query_new=[]
  count=0
  for word_current in query_processed:
    # Always keep the original word, even once the synonym budget is spent;
    # previously an early return silently dropped the rest of the query.
    query_new.append(word_current)
    if count>=limit:
      continue
    for word_new in give_me_synonyms(word_current):
      # Check before appending so that exactly `limit` synonyms are admitted
      # (the former `count>limit` test let limit+1 through).
      if count>=limit:
        break
      query_new.append(word_new)
      count+=1

  return query_new

def remove_whitespace(text):
    """Collapse every run of whitespace in `text` to a single space.

    Leading and trailing whitespace is dropped as a consequence of splitting
    on whitespace and re-joining the pieces.
    """
    words = text.split()
    return " ".join(words)
  

def remove_punctuation(text):
    """Return `text` with every character of string.punctuation deleted.

    Only the ASCII punctuation characters are removed; whitespace and all
    other characters are left exactly as they were.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def remove_stop_words_and_lemmatize(sentence):
  """Turn raw text into a list of cleaned, lemmatized tokens.

  Steps, in order:
    1. strip punctuation with remove_punctuation;
    2. tokenize with NLTK's word_tokenize and lower-case every token;
    3. drop English stop words;
    4. drop tokens made up solely of digits;
    5. lemmatize what remains with WordNetLemmatizer.

  Step 4 was previously computed into a list that was never used, so numeric
  tokens leaked into the output; the filter is now actually applied.
  Tokens that merely contain digits (letters mixed with numbers) are kept,
  because str.isdigit() is only true for all-digit strings.

  Returns:
    The list of lemmatized tokens.
  """
  sentence=remove_punctuation(sentence)
  stop_words = set(stopwords.words('english'))
  word_tokens=[word.lower() for word in word_tokenize(sentence)]
  filtered=[
      token for token in word_tokens
      if token not in stop_words and not token.isdigit()
  ]
  lemmatizer = WordNetLemmatizer()
  lemmatized_sentence=[lemmatizer.lemmatize(w) for w in filtered]
  return lemmatized_sentence
  


N = 200

# Containers created up front; nothing in this cell populates them.
DF = defaultdict(set)
doc_count = defaultdict()
doc_tokens = defaultdict()

# Clean each case document (stop-word removal + lemmatization) and keep the
# result as one space-joined string per document.
cleaned_doc_list = []
for i, doc in df['Content'].items():
    tokens = remove_stop_words_and_lemmatize(doc)
    doc_string = ' '.join(tokens)
    print(i+1, "doc processing")
    # Show the cleaned text of the document with index label 1 as a spot check.
    if i == 1:
        print(doc_string)
    cleaned_doc_list.append(doc_string)

# `documents` is the corpus handed to the vectorizers; the cleaned text is
# also stored on the frame alongside the raw content.
documents = list(cleaned_doc_list)
df['cleaned_doc'] = cleaned_doc_list
print(df)



1 doc processing
2 doc processing
indian oil corporation v nepc india limited others supreme court india 20 july 2006 cra 834 2002 judgment delivered r v raveendran j 1 appeal filed common order dated 2932001 passed madras high court allowing crlop nos2418 1999 1563 2000 said two petition filed respondent herein u 482 criminal procedure code code short quashing complaint filed appellant cc no299 1999 file judicial magistrate no6 coimbatore cc 286 1998 file judicial magistrate alandur chennai 2 appellant indian oil corporation short ioc entered two contract one first respondent nepc india ltd sister company skyline nepc limited skyline short agreeing supply aviation turbine fuel aviation lubricant together referred aircraft fuel according appellant respect aircraft fuel supplied said contract first respondent became due sum rs5282350190 skyline became due sum rs13127642125 2941997 3 first respondent hypothecated two fokker f27500 aircraft bearing registration vtnej 12684 vtnek 10687 app

In [None]:
# Represent every cleaned document as a TF-IDF vector.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

#K Means Clustering!

true_k = 55 #heuristic choice: roughly sqroot(3000), i.e. sqrt of the corpus size
# random_state pins the k-means++ seeding: with n_init=1 an unseeded fit
# produces different clusters on every execution of this cell.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

# For each centroid, feature indices ordered from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() is gone from current scikit-learn releases; prefer its
# replacement and fall back only on installations that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
  terms = vectorizer.get_feature_names_out()
else:
  terms = vectorizer.get_feature_names()

# Print the ten highest-weighted terms of every cluster centroid.
for i in range(true_k):
  print("Cluster %d:" % i)
  for ind in order_centroids[i, :10]:
    print("%s" % terms[ind])

  print()

#We can use these clusters and predict which cluster each query belongs to!


Cluster 0:
girl
parade
gangu
child
respondent
identification
house
shop
body
court

Cluster 1:
detention
detenu
detaining
advisory
representation
delay
order
authority
board
government

Cluster 2:
pw
witness
evidence
court
prosecution
appellant
singh
accused
pw2
bhagelu

Cluster 3:
court
appointment
respondent
regularization
employee
service
labour
writ
employment
appointed

Cluster 4:
award
arbitrator
arbitration
circular
court
jetting
respondent
proceeding
claim
contractor

Cluster 5:
dying
declaration
deceased
doctor
statement
recorded
court
magistrate
renuka
janak

Cluster 6:
court
high
petition
writ
art
case
public
power
order
remark

Cluster 7:
injury
death
deceased
blow
cause
bodily
intention
appellant
300
inflict

Cluster 8:
ornament
court
acquittal
retrial
admissible
hidden
case
distinctly
high
evidence

Cluster 9:
identification
parade
appellant
witness
court
identified
accused
evidence
test
singh

Cluster 10:
wage
service
reinstatement
respondent
petitioner
court
1409
1996
o

In [None]:
#Fit the TF-IDF model on the cleaned documents (queries are scored against it in a later cell) and show the TF-IDF weights of the first document
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Retrieval vectorizer: vocabulary capped at 10000 terms, ignoring terms that
# appear in fewer than 1% or more than 70% of the documents.
tfIdfVectorizer=TfidfVectorizer(max_features=10000,use_idf=True,stop_words='english',min_df = 0.01,max_df=0.7)
tfIdf = tfIdfVectorizer.fit_transform(documents)

# get_feature_names() is gone from current scikit-learn releases; prefer its
# replacement and fall back only on installations that predate it.
if hasattr(tfIdfVectorizer, "get_feature_names_out"):
    feature_names = tfIdfVectorizer.get_feature_names_out()
else:
    feature_names = tfIdfVectorizer.get_feature_names()

# Sanity check: TF-IDF weights of the first document, highest first.
tfidf_df = pd.DataFrame(tfIdf[0].T.todense(), index=feature_names, columns=["TF-IDF"])
tfidf_df = tfidf_df.sort_values('TF-IDF', ascending=False)
print (tfidf_df)

                TF-IDF
estoppel      0.390167
petitioner    0.336668
issue         0.229910
proceeding    0.218524
crown         0.213927
...                ...
disagreed     0.000000
disagree      0.000000
disadvantage  0.000000
disabled      0.000000
zone          0.000000

[6714 rows x 1 columns]


In [None]:
#Upload Query.csv 
#Clean Query
#Use TF-IDF and cosine similarity to get the top relevant_k relevant docs per query!

#Make sure you've run the previous code blocks before executing this!

def remove_punctuation(text):
    """Delete all string.punctuation characters from `text`.

    Whitespace and every non-punctuation character pass through unchanged.
    """
    # A translation table that maps a code point to None removes that character.
    drop_punctuation = {ord(symbol): None for symbol in string.punctuation}
    return text.translate(drop_punctuation)

def remove_stop_words_and_lemmatize(sentence):
  """Turn raw text into a list of cleaned, lemmatized tokens.

  The text is stripped of punctuation, tokenized with word_tokenize,
  lower-cased, filtered of English stop words and of all-digit tokens, and
  finally lemmatized with WordNetLemmatizer.

  The number filter used to be computed and then discarded (lemmatization ran
  on the unfiltered list); it is now applied. Tokens that only contain some
  digits are kept, since str.isdigit() is true only for all-digit strings.

  Returns:
    The list of lemmatized tokens.
  """
  sentence=remove_punctuation(sentence)
  stop_words = set(stopwords.words('english'))
  word_tokens=[word.lower() for word in word_tokenize(sentence)]
  cleaned=[
      token for token in word_tokens
      if token not in stop_words and not token.isdigit() #remove numbers
  ]
  lemmatizer = WordNetLemmatizer()
  lemmatized_sentence=[lemmatizer.lemmatize(w) for w in cleaned]
  return lemmatized_sentence

from sklearn.metrics.pairwise import cosine_similarity

# Number of indexed documents, taken from the fitted TF-IDF matrix instead of
# a hard-coded count so the cell stays correct if the corpus changes.
N=tfIdf.shape[0]
relevant_k=25 #how many top-ranked docs to keep per query
Q=50

uploaded2 =files.upload()
qdf= pd.read_csv(io.BytesIO(uploaded2['query.csv']))
queries=list(qdf['Query'])
cleaned_queries=[]
relevant_docs=[] #top relevant_k doc ids per query
idx=0
for line in queries:
    idx+=1
    #clean query
    tokens=remove_stop_words_and_lemmatize(line)
    query_string=' '.join(tokens)
    # expand_query cleans its input again before adding synonyms, so the
    # query text effectively goes through the cleaning step twice.
    query_string=' '.join(expand_query(query_string)) #expanding query by adding synonyms
    cleaned_queries.append(query_string)

    # Score the query against every document vector.
    query_vec = tfIdfVectorizer.transform([query_string])
    results = cosine_similarity(tfIdf,query_vec).ravel()

    # Document ids are 1-based row positions in the TF-IDF matrix.
    # sorted() is stable, so tied scores keep their original document order.
    top_k=sorted(enumerate(results,start=1),key=lambda pair:pair[1],reverse=True)[:relevant_k]
    top_k_docs=[doc_idx for doc_idx,cs in top_k] #store only the doc indices
    relevant_docs.append(top_k_docs)

# The reported count follows relevant_k rather than a stale literal.
print("Printing queries and top %d relevant docs by TF-IDF and cosine similarity\n" % relevant_k)
qdf["query_string"]=cleaned_queries
qdf["top_k_docs"]=relevant_docs
print(qdf)


Saving query.csv to query (10).csv
Printing queries and top 10 relevant docs by TF-IDF and cosine similarity

       Index  ...                                         top_k_docs
0    AILA_Q1  ...  [1240, 2907, 560, 723, 1051, 1012, 1776, 1542,...
1    AILA_Q2  ...  [2316, 1853, 2765, 2846, 2311, 2313, 1623, 22,...
2    AILA_Q3  ...  [246, 256, 1147, 294, 277, 1933, 103, 2322, 41...
3    AILA_Q4  ...  [1847, 859, 814, 2726, 151, 363, 2582, 2439, 1...
4    AILA_Q5  ...  [42, 1420, 2109, 707, 2827, 2830, 1874, 378, 2...
5    AILA_Q6  ...  [2771, 2201, 1529, 1805, 1874, 2109, 822, 880,...
6    AILA_Q7  ...  [705, 1968, 1630, 2392, 797, 2890, 1352, 676, ...
7    AILA_Q8  ...  [1766, 744, 169, 632, 2103, 751, 1431, 474, 97...
8    AILA_Q9  ...  [2040, 757, 532, 2799, 2109, 987, 979, 215, 13...
9   AILA_Q10  ...  [2260, 479, 1778, 2265, 1358, 2409, 2879, 522,...
10  AILA_Q11  ...  [1354, 2246, 730, 2730, 2158, 360, 175, 810, 3...
11  AILA_Q12  ...  [2246, 1108, 2158, 499, 1658, 299, 2472, 16

In [None]:
#Uploading rel docs for comparison and evaluating!
import json
from google.colab import files
# Load the ground-truth relevance judgements; each row's 'relevant' cell is
# parsed below as a JSON list.
uploaded3 = files.upload()
rel_df= pd.read_csv(io.BytesIO(uploaded3['rel_docs.csv']))
print("\nTFIDF result—\n")
ans=[]
# Walk the queries by position so that rel_df.iloc lines up with qdf even if
# qdf does not carry a default 0..n-1 index.
for i,(_,row) in enumerate(qdf.iterrows()):
  tfidf_list=row['top_k_docs']
  given=json.loads(rel_df.iloc[i]['relevant'])
  matches=sum(1 for ele in tfidf_list if ele in given)
  precision_at_k=round(matches/relevant_k,2)
  # A query with no judged-relevant docs has undefined recall; report 0.0
  # rather than raising ZeroDivisionError.
  recall_at_k=round(matches/len(given),2) if len(given)>0 else 0.0
  ans.append({"index":"AILA_Q"+str(i+1),"top_k":tfidf_list,"relevant":given,"precision@k":precision_at_k,"recall@k":recall_at_k})

ans_df=pd.DataFrame(ans)
print(ans_df)
ans_df.to_csv('tfidf_eval.csv')
files.download('tfidf_eval.csv')

# Average over the queries actually evaluated instead of a hard-coded count.
Q=len(ans)
ap=ans_df["precision@k"].sum()/Q if Q else 0.0
ar=ans_df["recall@k"].sum()/Q if Q else 0.0
print("The Avg. precision@k for all queries— ",ap)
print("The Avg. recall@k for all queries— ",ar)
# The harmonic mean is undefined when both averages are zero; report 0.0.
f_score=(2*ap*ar)/ (ap+ar) if (ap+ar)>0 else 0.0
print("The F-score is ",f_score)

Saving rel_docs.csv to rel_docs (8).csv

TFIDF result—

       index  ... recall@k
0    AILA_Q1  ...     0.00
1    AILA_Q2  ...     0.50
2    AILA_Q3  ...     0.00
3    AILA_Q4  ...     0.00
4    AILA_Q5  ...     0.00
5    AILA_Q6  ...     0.00
6    AILA_Q7  ...     0.00
7    AILA_Q8  ...     0.00
8    AILA_Q9  ...     0.00
9   AILA_Q10  ...     0.00
10  AILA_Q11  ...     0.50
11  AILA_Q12  ...     0.00
12  AILA_Q13  ...     0.67
13  AILA_Q14  ...     1.00
14  AILA_Q15  ...     0.00
15  AILA_Q16  ...     0.00
16  AILA_Q17  ...     0.00
17  AILA_Q18  ...     1.00
18  AILA_Q19  ...     0.00
19  AILA_Q20  ...     0.00
20  AILA_Q21  ...     0.50
21  AILA_Q22  ...     0.00
22  AILA_Q23  ...     0.00
23  AILA_Q24  ...     0.00
24  AILA_Q25  ...     0.33
25  AILA_Q26  ...     0.62
26  AILA_Q27  ...     0.20
27  AILA_Q28  ...     0.00
28  AILA_Q29  ...     0.14
29  AILA_Q30  ...     0.00
30  AILA_Q31  ...     0.00
31  AILA_Q32  ...     0.00
32  AILA_Q33  ...     1.00
33  AILA_Q34  ...     0.08

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The Avg. precision@k for all queries—  0.039
The Avg. recall@k for all queries—  0.2058
The F-score is  0.06557352941176471
