In [18]:
import pandas as pd
import string
import nltk

# Fetch the NLTK corpora used by the later cells (stopword list and WordNet).
nltk.download('stopwords')
nltk.download('wordnet')

# replace with files to be classified
PENDING_CASES_CSV = 'pending-cases-mumbai.csv'
DISPOSED_CASES_CSV = 'disposed-cases-mumbai.csv'


def load_cases(csv_path):
    """Read one case listing and keep only the rows that name both parties.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file containing at least the 'Petitioner' and
        'Respondent' columns.

    Returns
    -------
    pandas.DataFrame
        The rows of the file in which neither 'Petitioner' nor 'Respondent'
        is missing, with both of those columns converted to ``str``.
    """
    cases = pd.read_csv(csv_path)
    cases = cases.dropna(subset=['Petitioner', 'Respondent'], how='any')
    cases[['Petitioner', 'Respondent']] = cases[['Petitioner', 'Respondent']].astype(str)
    return cases


df_pending = load_cases(PENDING_CASES_CSV)
df_disposed = load_cases(DISPOSED_CASES_CSV)

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two files overlap and label-based access is ambiguous.
df = pd.concat([df_pending, df_disposed], ignore_index=True)

# Placeholder columns; they are filled in once the parties have been tagged.
df['PetitionerType'] = None
df['RespondentType'] = None

# Collect every petitioner and respondent name, lowercase it and split it
# into whitespace-separated words (one list of words per name).
all_names = [
    name.lower().split()
    for name in df['Petitioner'].tolist() + df['Respondent'].tolist()
]

# Flatten into a single list of words. Each word has leading/trailing
# punctuation trimmed and is kept only if it is longer than 2 characters
# (drops Mr, Mrs, initials and empty strings) and purely alphabetic.
all_words = [
    word
    for name in all_names
    for word in (raw_word.strip(string.punctuation) for raw_word in name)
    if len(word) > 2 and word.isalpha()
]

# Remove stopwords. Membership is tested against a frozenset so that each
# lookup is a hash probe instead of a scan over the whole stopword list.
all_stopwords = nltk.corpus.stopwords.words('english')
stopwords = [word.lower() for word in all_stopwords]
stopword_lookup = frozenset(stopwords)
all_words_filtered = [word for word in all_words if word not in stopword_lookup]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siddarth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/siddarth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'pending-cases-mumbai.csv'

In [15]:
from nltk.corpus import wordnet as wn

# Count how often each word occurs across all names.
word_counts = {}
for word in all_words_filtered:
    word_counts[word] = word_counts.get(word, 0) + 1

# Keep only the words WordNet has at least one synset for. Filtering the
# distinct words (instead of every occurrence) means each word is looked up
# in WordNet once; insertion order is still "first occurrence in the data".
english_terms = {
    word: count for word, count in word_counts.items() if wn.synsets(word)
}

# Dump the candidates for manual review. Only 'Word' and 'Frequency' are
# written; the 'Ignore' column in the header is left empty for the reviewer.
# UTF-8 is set explicitly because the words may contain non-ASCII letters and
# pandas.read_csv decodes as UTF-8 by default.
with open('englishterms.csv', 'w', encoding='utf-8') as f:
    f.write('Word,Frequency,Ignore' + '\n')
    for word, count in english_terms.items():
        f.write(word + "," + str(count) + '\n')

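After this step, `englishterms.csv` must be reviewed and cleaned by hand, and the result saved as `englishterms-clean.csv` (the file the next cell reads).
Remove names and words like "lawyer", "agent", "represented", "representative" and "alias", or mark them `Yes` in the `Ignore` column.
Since this is a manual task, reviewing enough of the list to cover about 95% of all unique English words should suffice.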

In [16]:
# Words occurring this many times or fewer in the data are not used.
MIN_WORD_FREQUENCY = 5

df_english = pd.read_csv('englishterms-clean.csv')

# The 'Ignore' column is filled in by hand, so compare it case-insensitively
# and ignore surrounding whitespace ('Yes', 'yes', ' YES ' all exclude a word).
# Blank cells are read as NaN, become the string 'nan' here, and are kept.
ignore_flag = df_english['Ignore'].astype(str).str.strip().str.lower()
df_filtered = df_english[(df_english['Frequency'] > MIN_WORD_FREQUENCY) & (ignore_flag != 'yes')]
column_values = df_filtered['Word'].tolist()

# Set of common English words; a name containing one is treated as a company.
english_words = set(column_values)

# Keyword patterns used by tag_type. They are matched as substrings of the
# lowercased name, so the trailing space in entries such as 'fin ' and 'pro '
# is significant.
financeKeys = {
    'bank', 'branch', 'financ', 'finco', 'fin ', 'broker', 'share', 'stock',
    'invest', 'leasing', 'loan', 'asset', 'payment', 'credit', 'capital',
    'chit', 'fund', 'insurance', 'mutual', 'securit', 'wealth', 'finserv',
    'finlease', 'axis', 'icici', 'sbi', 'kotak', 'hdfc', 'idbi',
}
companyKeys = {
    'ltd', 'limited', 'pvt', 'corporation', 'industr', 'company',
    'commercial', 'co-op', 'coop', 'cooperative',
}
individualKeys = {'proprietor', 'prop', 'pro '}

# Maps each original name to its [type, matched pattern, masked name] result,
# so that a name seen again receives the same masked label.
maskedEntity = dict()

# Running totals per entity type; used to number the masked names.
financeCounter = companyCounter = individualCounter = 0

def _first_matching_term(name, terms):
    """Return the alphabetically first term that is a substring of `name`, or None."""
    return min((term for term in terms if term in name), default=None)


def tag_type(row, columnName):
    """Classify the party named in ``row[columnName]`` and assign it a masked label.

    The name is lowercased and stripped of '.' and ',' and then checked, in
    this order of precedence:

    1. any ``financeKeys`` term occurs in it    -> 'Finance'
    2. any ``companyKeys`` term occurs in it    -> 'Non Financial Firm'
    3. any ``individualKeys`` term occurs in it -> 'Individual'
    4. any of its words is in ``english_words`` -> 'Non Financial Firm'
    5. otherwise                                -> 'Individual'

    Steps 1-3 are substring tests, so a term also matches inside a longer
    word (e.g. 'prop' matches 'property').

    When several terms of one category match, the alphabetically first one is
    reported; in step 4 the first matching word of the name is reported. This
    keeps the pattern column identical from run to run instead of depending
    on set iteration order.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``columnName`` (e.g. a DataFrame row).
    columnName : str
        Key of the name to classify.

    Returns
    -------
    list
        ``[entity type, matched pattern or None, masked name]``. The result is
        cached in ``maskedEntity`` under the original name, and that same list
        object is returned on every later call for the name.

    Side effects
    ------------
    Increments the module-level counter of the assigned type the first time a
    name is seen; the counter value numbers the masked name.
    """
    global financeCounter, companyCounter, individualCounter
    originalName = row[columnName]

    # A name tagged earlier keeps its label, so repeat parties stay linked.
    if originalName in maskedEntity:
        return maskedEntity[originalName]

    name = originalName.lower().replace('.', '').replace(',', '')

    term = _first_matching_term(name, financeKeys)
    if term is not None:
        financeCounter += 1
        maskedEntity[originalName] = ['Finance', term, 'Financial Firm ' + str(financeCounter)]
        return maskedEntity[originalName]

    term = _first_matching_term(name, companyKeys)
    if term is not None:
        companyCounter += 1
        maskedEntity[originalName] = ['Non Financial Firm', term, 'Company ' + str(companyCounter)]
        return maskedEntity[originalName]

    term = _first_matching_term(name, individualKeys)
    if term is not None:
        individualCounter += 1
        maskedEntity[originalName] = ['Individual', term, 'Person ' + str(individualCounter)]
        return maskedEntity[originalName]

    # Whole-word test against the common-English vocabulary. Walking the few
    # words of the name (rather than the whole vocabulary) is both cheaper
    # and gives a deterministic pattern.
    commonword = next((word for word in name.split() if word in english_words), None)
    if commonword is not None:
        companyCounter += 1
        maskedEntity[originalName] = ['Non Financial Firm', commonword, 'Company ' + str(companyCounter)]
        return maskedEntity[originalName]

    # Nothing matched: assume a private individual.
    individualCounter += 1
    maskedEntity[originalName] = ['Individual', None, 'Person ' + str(individualCounter)]
    return maskedEntity[originalName]

# Tag every petitioner first and then every respondent. Each call yields
# [type, pattern, masked name], which is unzipped into three new columns.
tag_targets = (
    ('Petitioner', ('PetitionerType', 'PetitionerPattern', 'MaskedPetitionerName')),
    ('Respondent', ('RespondentType', 'RespondentPattern', 'MaskedRespondentName')),
)
for party, (type_col, pattern_col, masked_col) in tag_targets:
    entity_types, patterns, masked_names = zip(*df.apply(tag_type, axis=1, args=(party,)))
    df[type_col] = entity_types
    df[pattern_col] = patterns
    df[masked_col] = masked_names

# The real names (and the source serial number) must not reach the output.
df = df.drop(columns=['Petitioner', 'Respondent', 'Sr No'])

# Fresh running serial number, plus the last four characters of the case ID
# as the year.
df['Case Sr'] = range(1, len(df) + 1)
df['Year'] = df['Case ID'].map(lambda case_id: case_id[-4:])

# Keep only the output columns, in their final order.
output_columns = [
    'Case Sr', 'Case ID', 'Court Name', 'Case Number',
    'MaskedPetitionerName', 'PetitionerPattern', 'PetitionerType',
    'MaskedRespondentName', 'RespondentPattern', 'RespondentType',
    'Case Status', 'Year',
]
df = df[output_columns]

# replace with name of output file which is classified
df.to_csv('mumbai-all-cases-classified.csv', index=False)