**Importing Modules**

In [None]:
import glob
import functools
import datetime as dt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import re

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Data Handling**

We first wrap up all the text files into a single CSV file

In [None]:
import glob
import csv
#Statutes
read_files = glob.glob('/content/drive/My Drive/DataSet/*')

with open("object_statutes.csv", "w", encoding='utf-8') as outfile:
    w=csv.writer(outfile)
    for f in read_files:
        with open(f, "r", encoding='utf-8') as infile:
            w.writerow([" ".join([line.strip() for line in infile])])

lst_arr = os.listdir('/content/drive/My Drive/DataSet/')
df_filename = pd.DataFrame(lst_arr, columns = ['Name'])
df_filename

Unnamed: 0,Name
0,C6S61.txt
1,C6S62.txt
2,C6S63.txt
3,C6S64.txt
4,C6S65.txt
...,...
518,C30S401.txt
519,C30S402.txt
520,C30S403.txt
521,C30S404.txt


A Glimpse about how the data inside the csv file looks!

In [None]:
#Prior cases
df = pd.read_csv('object_statutes.csv',header=None)
df.columns = ["Text"]
df

Unnamed: 0,Text
0,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
1,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
2,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
3,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
4,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
...,...
518,Chapter Name : REFERENCE AND REVISION Title : ...
519,Chapter Name : REFERENCE AND REVISION Title : ...
520,Chapter Name : REFERENCE AND REVISION Title : ...
521,Chapter Name : REFERENCE AND REVISION Title : ...


In [None]:
df = pd.concat([df_filename, df], axis = 1)
df

Unnamed: 0,Name,Name.1,Name.2,Text,clean_text
0,C6S61.txt,C6S61.txt,C6S61.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
1,C6S62.txt,C6S62.txt,C6S62.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
2,C6S63.txt,C6S63.txt,C6S63.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
3,C6S64.txt,C6S64.txt,C6S64.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
4,C6S65.txt,C6S65.txt,C6S65.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
...,...,...,...,...,...
518,C30S401.txt,C30S401.txt,C30S401.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title high cou...
519,C30S402.txt,C30S402.txt,C30S402.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title power hi...
520,C30S403.txt,C30S403.txt,C30S403.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title option c...
521,C30S404.txt,C30S404.txt,C30S404.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title statemen...


Let us get some basic information about the data

In [None]:
len(df)

523

In [None]:
df.shape

(523, 2)

In [None]:
df.info

<bound method DataFrame.info of             Name                                               Text
0      C6S61.txt  Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
1      C6S62.txt  Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
2      C6S63.txt  Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
3      C6S64.txt  Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
4      C6S65.txt  Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
..           ...                                                ...
518  C30S401.txt  Chapter Name : REFERENCE AND REVISION Title : ...
519  C30S402.txt  Chapter Name : REFERENCE AND REVISION Title : ...
520  C30S403.txt  Chapter Name : REFERENCE AND REVISION Title : ...
521  C30S404.txt  Chapter Name : REFERENCE AND REVISION Title : ...
522  C30S405.txt  Chapter Name : REFERENCE AND REVISION Title : ...

[523 rows x 2 columns]>

**Text preprocessing techniques**: Cleansing the data
- Convert to lowercase, remove punctuation and special characters, using RegeX and strip
- Remove stopwords
- Stemming
- Lemmatization

In [None]:
import re
text = df.iloc[0]
print(text)
text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
txt = text.split()
print(txt)

Name                                            C6S61.txt
Text    Chapter Name : PROCESSES TO COMPEL APPEARANCE ...
Name: 0, dtype: object
['name', 'c6s61txt', 'text', 'chapter', 'name', 'processes', 'to', 'compel', 'appearance', 'name', '0', 'dtype', 'object']


In [None]:
import nltk
nltk.download('stopwords')
lst_stopwords = nltk.corpus.stopwords.words("english")
txt = [word for word in txt if word not in lst_stopwords]
print(txt)

['name', 'c6s61txt', 'text', 'chapter', 'name', 'processes', 'compel', 'appearance', 'name', '0', 'dtype', 'object']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
ps = nltk.stem.porter.PorterStemmer()
print([ps.stem(word) for word in txt])

['name', 'c6s61txt', 'text', 'chapter', 'name', 'process', 'compel', 'appear', 'name', '0', 'dtype', 'object']


In [None]:
nltk.download('omw-1.4')
nltk.download('wordnet')
lem = nltk.stem.wordnet.WordNetLemmatizer()
print([lem.lemmatize(word) for word in txt])

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


['name', 'c6s61txt', 'text', 'chapter', 'name', 'process', 'compel', 'appearance', 'name', '0', 'dtype', 'object']


**Preprocessing the data**: Apply these techniques on all records of the dataset

In [None]:
def utils_preprocess_text(text, lst_stopwords, flg_stemm=True, flg_lemm =True):
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    lst_text = text.split()
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in
                   lst_stopwords]
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
    text = " ".join(lst_text)
    return text
df['clean_text'] = df['Text'].apply(lambda x: utils_preprocess_text(x, lst_stopwords, flg_stemm = False, flg_lemm=True))

A glimpse into the cleansed data!

In [None]:
df

Unnamed: 0,Name,Text,clean_text
0,C6S61.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
1,C6S62.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
2,C6S63.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
3,C6S64.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
4,C6S65.txt,Chapter Name : PROCESSES TO COMPEL APPEARANCE ...,chapter name process compel appearance section...
...,...,...,...
518,C30S401.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title high cou...
519,C30S402.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title power hi...
520,C30S403.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title option c...
521,C30S404.txt,Chapter Name : REFERENCE AND REVISION Title : ...,chapter name reference revision title statemen...


In [None]:
train = df["clean_text"]
train

0      chapter name process compel appearance section...
1      chapter name process compel appearance section...
2      chapter name process compel appearance section...
3      chapter name process compel appearance section...
4      chapter name process compel appearance section...
                             ...                        
518    chapter name reference revision title high cou...
519    chapter name reference revision title power hi...
520    chapter name reference revision title option c...
521    chapter name reference revision title statemen...
522    chapter name reference revision title high cou...
Name: clean_text, Length: 523, dtype: object

In [None]:
train[0]

'chapter name process compel appearance section name summons title form summons description every summons issued court code shall writing duplicate signed presiding officer court officer high court may time time rule direct shall bear seal court'

Keyword Conversion

In [None]:
trainset = []
def uniqueWord(Word):
    if Word in dict:
        dict[Word] += 1
    else:
        dict.update({Word: 1})
def UnQ(text):
    arr = []
    ListOfWords = re.split("[\W]+", text)
    for words in ListOfWords:
        uniqueWord(words)
    for elements in dict:
        if len(elements) > 1:
            arr.append(elements)
    return arr
for i in train:
    dict = {}
    trainset.append(UnQ(i))
print(trainset)



Creating a test dataframe using the Query File

 Jaccard Similarity Function

In [None]:
Q = {'Legal', 'Rights', 'Hit', 'Run', 'Police'}
d = {}
def SimQ(text, Q):
    count = 0
    for i in text:
        for j in Q:
            if i == j:
                count += 1
    return count
for k in range(len(trainset)):
    d[k] = [trainset[k], SimQ(trainset[k], Q)]
print(d)



In [None]:
for i in d:
    if d[i][1] > 0:
        print(i, d[i][1])

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 2
11 1
12 2
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 2
21 2
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 2
38 1
39 1
40 2
41 1
42 2
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 1
59 1
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 2
70 1
71 1
72 1
73 1
74 1
75 2
76 1
77 1
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 2
87 1
88 3
89 1
90 1
91 1
92 1
93 1
94 3
95 1
96 3
97 3
98 2
99 2
100 3
101 1
102 1
103 2
104 1
105 2
106 1
107 1
108 1
109 1
110 1
111 1
112 2
113 1
114 1
115 1
116 2
117 1
118 1
119 2
120 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 2
137 1
138 1
140 1
142 2
146 1
147 2
148 2
149 3
150 2
151 1
152 1
153 1
154 2
155 1
156 1
157 2
158 1
159 2
160 1
161 2
162 1
163 3
164 1
165 1
166 2
167 1
168 1
169 2
170 1
171 1
172 2
173 1
174 1
176 1
177 1
178 1
179 2
180 1
181 1
182 1
183 1
184 1
185 2
186 1
187 1
188 2
189 2
190 2
191 1


Similarity function using bag of words model

In [None]:
def uniqueWord(Word):
    if Word in dict:
        dict[Word] += 1
    else:
        dict.update({Word: 1})
def UnQ(text):
    ListOfWords = re.split("[\W]+", text)
    for words in ListOfWords:
        uniqueWord(words)
dict = {}
arr = []
for i in train:
    UnQ(i)
for elements in dict:
    if len(elements) > 1:
        arr.append(elements)
print(arr)



In [None]:
train1 = []
for i in train:
    train1.append(re.sub(r"[^a-zA-Z0-9]", " ", i.lower()).split())
print(train1)



In [None]:
def calculateBOW(trainset,train):
    Bow = []
    for i in range(len(train)):
        #tf_diz = dict.fromkeys(trainset[i],0)
        doc = train[i]
        t=[]
        for w in trainset:
            #tf_diz[word] = doc.count(word)
            t.append(doc.count(w))
        Bow.append(t)
    return Bow
bow = calculateBOW(arr, train1)
print(bow[0])

[1, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 4, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
df_bow = pd.DataFrame(bow, columns = arr)
df_bow.head()

Unnamed: 0,chapter,name,process,compel,appearance,section,summons,title,form,description,...,386,389,390,391,composing,erroneous,importance,overruling,revised,388
0,1,2,1,1,1,1,3,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,2,1,1,1,1,6,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,2,1,1,1,2,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,2,1,1,1,2,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,2,1,1,1,4,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def calculateBOWQ(trainset,train):
    t=[]
    ar = []
    for w in trainset:
      t.append(train.count(w))
    ar.append(t)
    return ar

In [None]:
Q = "What are the legal procedures for a person under police custody?, What are Ravi's rights as an accused?, What is the procedure for bail in theft cases?"
Q = re.sub(r"[^a-zA-Z0-9]", " ", Q.lower()).split()
Qs = calculateBOWQ(arr, Q)
df_Q = pd.DataFrame(Qs, columns = arr)
df_Q

Unnamed: 0,chapter,name,process,compel,appearance,section,summons,title,form,description,...,386,389,390,391,composing,erroneous,importance,overruling,revised,388
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import numpy as np
from numpy.linalg import norm
def cosim(A, B):
  temp = np.dot(A,B)/(norm(A)*norm(B))
  return temp
co = []
for i in bow:
  c = cosim(i, Qs[0])
  co.append(c)
print("Cosine Similarity:", co)

Cosine Similarity: [0.0, 0.09245003270420485, 0.0, 0.11180339887498948, 0.06913011298202835, 0.06913011298202835, 0.040723148118768406, 0.027874733666903025, 0.0, 0.0, 0.12253577034896795, 0.21758445525607323, 0.19987511706556305, 0.11470786693528086, 0.1386750490563073, 0.1692243608556032, 0.0, 0.14547859349066158, 0.15075567228888181, 0.15027827244457206, 0.27450980392156865, 0.11063197515266965, 0.09776474334756886, 0.05869672708837492, 0.052558833122763673, 0.04578685464956301, 0.02993421700446249, 0.07453559924999299, 0.09622504486493763, 0.0, 0.06178020632152154, 0.06917144638660747, 0.05735393346764043, 0.1111111111111111, 0.03241444255508768, 0.0401286176952564, 0.1643989873053573, 0.20704985219709324, 0.042333375666730164, 0.044946657497549475, 0.10947155477532401, 0.08247860988423225, 0.22086305214969307, 0.035533452725935076, 0.04622501635210243, 0.06786487980790502, 0.09560158532445781, 0.11456956714162013, 0.08574929257125441, 0.0807739686121094, 0.08716272672808179, 0.034

Ranking Function

In [None]:
ma = {ind : co[ind] for ind in range(len(co))}
fin = sorted(ma.items(), key = lambda kv: kv[1], reverse = True)
print(fin)

[(157, 0.37151880838356854), (94, 0.3583466776169477), (154, 0.35494260376644554), (163, 0.3481553119113957), (337, 0.3447251266633461), (347, 0.3333333333333333), (323, 0.33168575341109735), (215, 0.32867652711038886), (168, 0.3207501495497921), (96, 0.29814239699997197), (326, 0.2951406680504776), (147, 0.29455738811230814), (156, 0.2936101097573517), (167, 0.28867513459481287), (165, 0.27888667551135854), (340, 0.2779446327672477), (162, 0.27689287021395464), (20, 0.27450980392156865), (139, 0.27359422722270776), (449, 0.26721255601313854), (149, 0.2631578947368421), (84, 0.2618914004394621), (216, 0.25925925925925924), (152, 0.2576996786108671), (102, 0.25660011963983365), (442, 0.2562034240393074), (169, 0.2561166066426108), (161, 0.25542034272564035), (329, 0.24300126677321918), (433, 0.24111311643235145), (444, 0.2408073206538235), (142, 0.2405551987164599), (153, 0.2394737360356999), (506, 0.23923566684866998), (240, 0.23791547571544322), (237, 0.23390435267563514), (143, 0.232

In [None]:
req = df.iloc[:, 0]
for j in fin:
  for i in range(len(req)):
    if j[0] == i:
      print(req[i], j[1])

C5S51.txt 0.37151880838356854
C12S167.txt 0.3583466776169477
C5S50.txt 0.35494260376644554
C5S55A.txt 0.3481553119113957
C33S436.txt 0.3447251266633461
C33S441A.txt 0.3333333333333333
C33S437.txt 0.33168575341109735
C17AS223.txt 0.32867652711038886
C5S59.txt 0.3207501495497921
C12S169.txt 0.29814239699997197
C33S439.txt 0.2951406680504776
C5S43.txt 0.29455738811230814
C5S50A.txt 0.2936101097573517
C5S58.txt 0.28867513459481287
C5S56.txt 0.27888667551135854
C33S438.txt 0.2779446327672477
C5S54A.txt 0.27689287021395464
C6S81.txt 0.27450980392156865
C4BS38.txt 0.27359422722270776
C25S335.txt 0.26721255601313854
C5S46.txt 0.2631578947368421
C12S160.txt 0.2618914004394621
C19AS239.txt 0.25925925925925924
C5S47.txt 0.2576996786108671
C12S175.txt 0.25660011963983365
C25S328.txt 0.2562034240393074
C5S60.txt 0.2561166066426108
C5S53A.txt 0.25542034272564035
C33S441.txt 0.24300126677321918
C24S319.txt 0.24111311643235145
C25S330.txt 0.2408073206538235
C5S41.txt 0.2405551987164599
C5S48.txt 0.239