In [1]:
import pandas as pd
import numpy as np
import tqdm
import os

import logging
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',
                    level=logging.INFO)

import nltk
from nltk import FreqDist

from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup
import lxml
import re

In [2]:
def cleaner(txt_file):
    
    text = txt_file
    text = re.sub(r"(&[a-zA-Z]*;)", " ", text)  # the txt files had some unwanted text like &rsquo; this line removes such text
    text = text.lower()
    
    # remove punctuation and numbers from the string
    punctuations = '''!()[]{};:'"\,<->/?@#$%^&*_~=+`|0123456789'''  # removing punctuations except hyphens
    
    
    for x in text.lower(): 
        if x in punctuations: 
            text = text.replace(x, "")

            
    
    text = text.replace(" st ", " ")
    text = text.replace(" nd ", " ")
    text = text.replace(" rd ", " ")
    text = text.replace(" th ", " ")
    text = text.replace("hellip", " ")
    text = text.replace("rsquo", " ")
    text = text.replace("ldquo", " ")
    text = text.replace("rdquo", " ")
    text = text.replace("ndash", " ")
    text = text.replace("--", " ")
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    
    text = text.replace("kshs.", "kshs ")

    
    return text

In [3]:
df_cases = pd.read_excel("/Users/shashanksingh/Desktop/air_pollution/data/raw_data/Air Pollution Cases_updated 17.10.21.xlsx")

In [4]:
df_cases = df_cases.drop_duplicates(["Kanoon_ID"], keep = "first")
df_cases = df_cases.reset_index(drop=True)

In [5]:
root_dir = "/Users/shashanksingh/Desktop/IND_PROJ/water_pollution/Kanoon_html_new_with_air"
text_list_all = []
kanoon_id_list = []

for index in tqdm.trange(len(df_cases)):
    kanoon_id = int(df_cases["Kanoon_ID"][index])

    path = root_dir + "/" + str(kanoon_id) + ".html"      

    f = open(path, "r").read()
    soup = BeautifulSoup(f, 'lxml')
    
    # removing first 20
    text_list = cleaner(soup.findAll("div", {"class": "judgments"})[0].text).split()[20:]
    
    text_ = " ".join(ele for ele in text_list)
    
    text_list_all.append(text_)
    kanoon_id_list.append(kanoon_id)


100%|██████████| 1910/1910 [05:29<00:00,  5.80it/s] 


In [6]:
from nltk.corpus import stopwords

final_stopwords_list = stopwords.words('english')

In [7]:
final_stopwords_list = [cleaner(ele) for ele in final_stopwords_list]

final_stopwords_list = final_stopwords_list + ["of","in","the","shall","will","on","to"]

In [8]:
#import the TfidfVectorizer from Scikit-Learn.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words= final_stopwords_list,ngram_range = (3,3))
transformed_documents = vectorizer.fit_transform(text_list_all)

In [9]:
transformed_documents_as_array = transformed_documents.toarray()

In [10]:
len(transformed_documents_as_array)

1910

In [11]:
feature_names = vectorizer.get_feature_names()

In [12]:
df = pd.DataFrame(transformed_documents_as_array,
                 index=list(df_cases["Kanoon_ID"]),
                 columns=feature_names)

In [13]:
list_cols = feature_names

list_mean = []

for ele in tqdm.tqdm(list_cols):
    
    mean_score = df[ele].mean()
    
    list_mean.append(mean_score)
    
    

100%|██████████| 2446760/2446760 [07:36<00:00, 5354.56it/s] 


In [14]:
df_scores = pd.DataFrame()

In [15]:
df_scores["phrase"] = list_cols
df_scores["mean_score"] = list_mean

In [16]:
df_scores["mean_score"].max()

0.019531341537394338

In [17]:
df_scores["mean_score"].min()

3.4517589690494873e-07

In [18]:
df_scores = df_scores.sort_values(["mean_score"], ascending=False)

In [19]:
df_scores.head(50)

Unnamed: 0,phrase,mean_score
1664359,pollution control board,0.019531
2122725,state pollution control,0.011684
1705359,prevention control pollution,0.010385
529159,control pollution act,0.009576
1213057,learned counsel appearing,0.008518
90733,air prevention control,0.006804
2377093,water prevention control,0.006534
1408082,national green tribunal,0.006312
1213342,learned counsel petitioner,0.006267
392127,civil writ petition,0.006111


In [20]:
df_scores[50:100]

Unnamed: 0,phrase,mean_score
2430092,writ petition ms,0.002723
2347648,vide order dated,0.002719
1709885,principles natural justice,0.002716
2366432,vs rajasthan state,0.002611
2020473,section water act,0.002584
2430208,writ petition pil,0.002579
25601,act air prevention,0.002535
2021374,sections indian penal,0.002534
1772726,public interest litigation,0.00253
2013565,section code criminal,0.002522


In [21]:
df_scores[100:150]

Unnamed: 0,phrase,mean_score
1769706,ps case year,0.002021
345294,case year thana,0.002021
984600,gujarat pollution control,0.002015
974783,green tribunal act,0.001996
1664497,pollution control measures,0.001992
584550,criminal miscellaneous arising,0.001978
1363758,miscellaneous arising ps,0.001978
560297,court cr misc,0.001958
1921864,respondents coram honble,0.001952
2017226,section ngt act,0.001946


In [22]:
df_scores.to_csv("tfidf_scores.csv")