In [1]:
# Preprocessing steps
# Load Nepali stopwords
# You may need to download a Nepali stopword list
import re
import json

# Load the Nepali stop-word list: one entry per line of the file, with
# surrounding whitespace stripped. The result is a plain list in file order.
nepali_stopwords = []
with open('dataset/non-potential-topic-word-list.txt', 'r', encoding='utf-8') as f:
    nepali_stopwords = list(map(str.strip, f))
    

import os
# Sets the PYTORCH_CUDA_ALLOC_CONF environment variable for this process.
# NOTE(review): presumably meant to tune PyTorch's CUDA allocator and therefore
# only relevant on a GPU; the recorded run reports "Device set to use cpu".
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [2]:

# Word-level tokenizer for Nepali text, backed by nepalikit's Tokenizer
from nepalikit.tokenization import Tokenizer

# Single shared tokenizer instance, created once and reused on every call.
nepali_tokenizer = Tokenizer()


def custom_nepali_tokenizer(text):
    """Tokenize ``text`` with the shared nepalikit tokenizer at ``level='word'``.

    Returns whatever ``Tokenizer.tokenize`` returns for word-level
    tokenization (presumably a list of word strings -- confirm against
    nepalikit).
    """
    tokens = nepali_tokenizer.tokenize(text, level='word')
    return tokens


In [3]:
from transformers import pipeline
from tqdm import tqdm  # Optional: For progress bar on large docs

# Initialize NER pipeline once
# Builds a transformers "ner" pipeline for the Davlan/xlm-roberta-base-ner-hrl
# checkpoint. No device is passed here, so the library chooses one itself.
# remove_named_entities() reads the "entity_group", "start" and "end" keys of
# each result; presumably those come from aggregation_strategy="simple" --
# confirm against the transformers documentation.
ner_pipeline = pipeline(
    "ner", 
    model="Davlan/xlm-roberta-base-ner-hrl", 
    aggregation_strategy="simple"
)

# Entity groups that are stripped from the text by default.
ENTITY_TYPES_TO_REMOVE = {"PER"}  # Modify if needed: {"PER", "LOC", "ORG"}


def remove_named_entities(text, ner_model, entity_types=ENTITY_TYPES_TO_REMOVE):
    """Return ``text`` with every recognised entity of the given types cut out.

    Args:
        text: The string to clean.
        ner_model: Callable taking the text and returning an iterable of dicts
            with at least the keys ``"entity_group"``, ``"start"`` and
            ``"end"`` (character offsets into ``text``).
        entity_types: Container of entity-group labels to delete.

    Returns:
        The text with the matching character spans removed and leading and
        trailing whitespace stripped. Whitespace around a removed span inside
        the text is left as it is.
    """
    spans = [
        (found["start"], found["end"])
        for found in ner_model(text)
        if found["entity_group"] in entity_types
    ]
    # Delete from the rightmost span backwards so the offsets of spans that
    # are still pending stay valid.
    spans.sort(key=lambda span: span[0], reverse=True)
    for begin, finish in spans:
        text = text[:begin] + text[finish:]
    return text.strip()



Device set to use cpu


In [4]:
def split_long_text(text, max_words=1000):
    """Split Nepali text into chunks of whole sentences of at most ``max_words`` words.

    Sentences are delimited by the danda ("।"). Each sentence is stripped,
    empty sentences are dropped, and every kept sentence is re-terminated with
    a danda. Sentences are packed greedily into a chunk until adding the next
    one would exceed ``max_words`` (words are counted by whitespace splitting).

    A single sentence longer than ``max_words`` is never split; it becomes a
    chunk of its own that exceeds the limit.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_words: Soft upper bound on the number of words per chunk.

    Returns:
        A list of non-empty chunk strings, in the original order.
    """
    if not text:
        return []

    chunks = []
    current_sentences = []
    current_word_count = 0

    # Split by danda ("।") — common Nepali sentence end
    for sentence in text.split("।"):
        sentence = sentence.strip()
        if not sentence:
            continue

        word_count = len(sentence.split())

        # Flush only when there is something to flush. Without this check an
        # oversized first sentence would emit an empty chunk.
        if current_sentences and current_word_count + word_count > max_words:
            chunks.append(" ".join(current_sentences))
            current_sentences = []
            current_word_count = 0

        current_sentences.append(sentence + "।")
        current_word_count += word_count

    # Add last chunk
    if current_sentences:
        chunks.append(" ".join(current_sentences))

    return chunks

In [5]:
# Stemming and text-cleaning helpers

    
# Stem the extra words
# Suffix list from dataset/stemming_words.txt, loaded lazily on first use and
# then reused, so the file is not re-read for every single word.
_STEMMING_SUFFIXES = None


def _load_stemming_suffixes():
    """Return the stemming suffixes as a tuple, reading the file only once."""
    global _STEMMING_SUFFIXES
    if _STEMMING_SUFFIXES is None:
        with open('dataset/stemming_words.txt', 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            # Drop blank lines: an empty suffix matches every word and
            # word[:-0] would erase the whole word.
            _STEMMING_SUFFIXES = tuple(suffix for suffix in stripped if suffix)
    return _STEMMING_SUFFIXES


def nepali_stemming(word):
    """Apply simple rule-based stemming to a single Nepali word.

    Steps, in order:
      1. Remove every occurrence of the plural markers 'हरू' and 'हरु'.
      2. For each suffix listed in ``dataset/stemming_words.txt`` (file
         order), strip it if the word currently ends with it. Several
         suffixes may be stripped one after another.

    Args:
        word: The word to stem.

    Returns:
        The stemmed word, or ``''`` when at most one character is left.

    Raises:
        FileNotFoundError: If the suffix file does not exist (raised by
            ``open`` on first use).
    """
    word = word.replace('हरू', '')
    word = word.replace('हरु', '')
    for suffix in _load_stemming_suffixes():
        if word.endswith(suffix):
            word = word[: -len(suffix)]

    if len(word) > 1 and word != "र":
        return word
    return ''



# Custom preprocessor: strips digits and punctuation from a token, then stems it
def remove_numbers_and_punctuation(text):
    """Clean ``text`` and return its stemmed form.

    Processing order:
      1. Delete all digit runs (``\\d+``).
      2. Delete every character that is neither a word character, whitespace,
         nor in the range U+0900-U+097F.
      3. Collapse whitespace runs to a single space and trim both ends.
      4. Delete the danda ('।'); it lies inside the range kept in step 2, so
         it has to be removed explicitly.
      5. Pass the result through ``nepali_stemming``.

    Returns:
        Whatever ``nepali_stemming`` returns for the cleaned text.
    """
    without_digits = re.sub(r'\d+', '', text)
    letters_only = re.sub(r'[^\w\s\u0900-\u097F]', '', without_digits)
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()
    return nepali_stemming(collapsed.replace('।', ''))



In [6]:


# Combine every JSON file in the decisions folder into one flat list of entries.
folder_path = 'dataset/decisions'
data = []

# os.listdir() returns names in arbitrary order. Sorting keeps the order of
# `data`, and of everything derived from it, the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            file_data = json.load(f)
        except json.JSONDecodeError as e:
            # A malformed file is reported and skipped, not fatal.
            print(f"Error reading {filename}: {e}")
            continue

    # Only files whose top-level JSON value is a list are merged.
    if isinstance(file_data, list):
        data.extend(file_data)
    else:
        print(f"Skipping {filename}: not a list")

print(f"Total entries combined: {len(data)}")
# Preview the first entry; guarded so an empty folder does not raise IndexError.
if data:
    print(data[0])


Total entries combined: 1342
{'Case Number': '०७६-WO-०८५४', 'Court': 'सर्वोच्च अदालत, एक न्यायाधीशको इजलास माननीय न्यायाधीश श्री तेजबहादुर के.सी.', 'Date': '२०७६/११/१९', 'विषय': 'परमादेश', 'निवेदक': 'अधिवक्ता पूर्ण राजवंशी', 'विपीक्षी': 'प्रधानमन्त्री तथा मन्त्रिपरिषद्\u200cको कार्यालय, सिंहदरबार, काठमाडौँसमेत', 'Judgement': 'यसमा रिट निवेदनसहितको मिसिल संलग्न कागजातहरू अध्ययन गरी निवेदकतर्फबाट उपस्थित हुनुभएका विद्वान् अधिवक्ताहरू श्री अनन्तराज लुइँटेल, श्री बल्लभराज बस्नेत, श्री विकास भट्टराई, श्री धनेन्द्र श्रेष्ठ, श्री श्रवनकुमार चौधरी, श्री टेकबहादुर पौडेल, श्री सन्तोष भण्डारी, श्री गोमा सुवेदी, श्री सपना सुवेदी, श्री उज्ज्वल घिमिरे, श्री लक्ष्मी थापा र निवेदक स्वयं पूर्ण राजवंशीले गर्नुभएको बहससमेत सुनियो। यसमा के कसो भएको हो ? निवेदकको मागबमोजिमको आदेश किन जारी हुनु नपर्ने हो? मागबमोजिमको आदेश जारी हुनु नपर्ने कुनै आधार, कारण भए सबुद प्रमाणसहित म्याद सूचना पाएका मितिले बाटाका म्यादबाहेक १५ दिनभित्र विपक्षीहरूले महान्यायाधिवक्ताको कार्यालयमार्फत लिखित जवाफ पेस गर्नु भनी आदेश र नि

In [7]:


# 1. Load your data (replace with your actual data loading)

import pandas as pd
import json
import nltk
from nltk.tokenize import word_tokenize

#df = pd.read_json("csvjson.json")
#df = pd.read_json("dataset/decisions/2077_Supreme_Court_Gender_Justice_Cleaned.json")

# Open and read the JSON file
#with open('dataset/decisions/2077_Supreme_Court_Gender_Justice_Cleaned.json', 'r' , encoding = 'UTF-8') as file:
    #data = json.load(file)

# Build the corpus: one cleaned, stemmed document per chunk of each judgement.
docs = []

for entry in data:
    judgement_text = entry['Judgement']

    # Each chunk returned by split_long_text becomes its own document.
    for chunk in split_long_text(judgement_text, max_words=1000):
        # Flatten newlines, then cut on single spaces to get the raw tokens.
        words = chunk.replace('\n', ' ').split(' ')

        # Clean and stem every token; tokens that reduce to '' are dropped.
        stemmed_words = [
            stem
            for stem in map(remove_numbers_and_punctuation, words)
            if stem != ''
        ]

        docs.append(" ".join(stemmed_words))

print('docs', len(docs))

docs 1416


In [8]:
# Persist the stemmed documents as a JSON array (UTF-8, non-ASCII characters
# written as-is). Despite its name, `output_dir` holds a file path.
output_dir = 'dataset/docs_saved.txt'

with open(output_dir, "w", encoding="utf-8") as f:
    f.write(json.dumps(docs, ensure_ascii=False, indent=2))

In [9]:
import gc
import time

# Main loop: batch-safe NER cleaning
def clean_docs_in_batches(docs, batch_size=10):
    """Run ``remove_named_entities`` over ``docs`` in batches.

    Documents are processed in slices of ``batch_size`` with a tqdm progress
    bar over the batches, and ``gc.collect()`` is called after every batch.

    A ``RuntimeError`` raised for one document affects only that document: it
    is reported, kept unprocessed in the output, and the rest of the batch is
    still cleaned. Other exception types propagate.

    Args:
        docs: Sequence of document strings.
        batch_size: Number of documents per batch; must be at least 1.

    Returns:
        A new list with one entry per input document, in the same order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop every
        # document; a zero step makes range() raise anyway.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    cleaned = []
    for i in tqdm(range(0, len(docs), batch_size)):
        for offset, doc in enumerate(docs[i:i + batch_size]):
            try:
                cleaned.append(remove_named_entities(doc, ner_pipeline))
            except RuntimeError as e:
                print(f"[Error] at doc {i + offset} (batch {i}): {e}")
                time.sleep(2)
                gc.collect()
                cleaned.append(doc)  # fallback to unprocessed for this doc only
        gc.collect()  # Clear memory after each batch
    return cleaned

# Apply NER cleaning to all documents (with progress bar)
# NOTE: this rebinds `docs` to the cleaned list, so re-running the cell sends
# already-cleaned text through the NER model again.
docs = clean_docs_in_batches(docs, batch_size=10)  # Adjust batch size depending on memory

100%|██████████| 142/142 [1:26:23<00:00, 36.50s/it]   


In [10]:
# Persist the NER-cleaned documents as a JSON array in a separate file from
# the pre-NER output. Despite its name, `output_dir` holds a file path.
output_dir = 'dataset/docs_saved_ner.txt'

with open(output_dir, "w", encoding="utf-8") as f:
    f.write(json.dumps(docs, ensure_ascii=False, indent=2))

In [11]:
# Read the file at `output_dir` back in as a single string
with open(output_dir, 'r', encoding='utf-8') as file:
    content = file.read()

# If the content is already a valid JSON list, just load it
import json

try:
    docs = json.loads(content)
except json.JSONDecodeError:
    # Fallback for content that is not valid JSON: treat every non-blank line
    # as one document (stripped of surrounding whitespace).
    docs = [para.strip() for para in content.split('\n') if para.strip()]

# Preview the first ten documents
print(docs[0:10])

['यस रिट निवेदन मिसिल संलग्न कागजात अध्ययन गरी निवेदक उपस्थित हुनुभए विद्वान् अधिवक्ता श्री श्री श्री श्री श्री श्री श्री श्री श्री श्री श्री निवेदक स्वयं पूर्ण राजवंशी गर्नु बहस सुनियो यस के कसो हो निवेदक मागबमोजिम आदेश किन जारी हुनु नपर्ने हो मागबमोजिम आदेश जारी हुनु नपर्ने कुनै आधार कारण भए सबुद प्रमाण म्याद सूचना पाए मिति बाटा म्यादबाहेक दिन विपक्षी महान्यायाधिवक्ता कार्यालय लिखित जवाफ पेस गर्नु भनी आदेश निवेदन प्रतिलिपि साथै राखी विपक्षी नाम म्याद सूचना जारी गरी लिखित जवाफ परे वा अवधि नाघेपछि नियमानुसार गरी पेस गर्नु प्रस्तुत विषय व्यवस्थापकीय पक्ष अन्तरिम आदेश माग सम्बन्ध विचार गर्दा निवेदक जिकिर नाजायज पनि देखिदैन भने सरकार पनि संवेदनशील नै रहे देखिन्छ निवेदक उठाए कोभिड कोरोना भाइरस सङ्क्रमण भयावह कुनै एक भूगोल मात्र सीमित रहे विषय नरहे स्पष्ट यस जोगिन सङ्क्रमण देश भित्रिनै नदिन विपपूर्व जोखिम व्यवस्थापन समयमै हुन अति आवश्यक लोक कल्याणकारी प्रजातान्त्रिक व्यवस्था सरकार देश जनताप्रति उत्तरदायी जिम्मेवार त्यसप्रति गम्भीर रहे नै हुने गर्छ यस केही शं गरिरहनु पर्दैन यस रोकथाम विपपूर्