<a href="https://colab.research.google.com/github/vruddhis/nlp/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install indic-nlp-library
!pip install spacy
!python -m spacy download xx_sent_ud_sm



Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

Collecting xx-sent-ud-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m120.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xx-sent-ud-sm
Successfully installed xx-sent-ud-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git


Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126 (from 1)[K
Receiving objects: 100% (139/139), 149.77 MiB | 14.18 MiB/s, done.
Resolving deltas: 100% (53/53), done.


In [3]:
from google.colab import files
uploaded = files.upload()


Saving marathi.txt to marathi.txt
Saving kannada.txt to kannada.txt
Saving hindi.txt to hindi.txt
Saving gujarati.txt to gujarati.txt


In [4]:
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp import common
from indicnlp.tokenize import sentence_tokenize
import os

In [5]:
import spacy

In [6]:
INDIC_NLP_RESOURCES = "./indic_nlp_resources"
common.set_resources_path(INDIC_NLP_RESOURCES)

#this has hindi
nlp = spacy.load("xx_sent_ud_sm")

language_files = {
    "hi": "hindi.txt",
    "mr": "marathi.txt",
    "gu": "gujarati.txt",
    "kn": "kannada.txt"
}

custom_stopwords = {
    "mr": ["आहे", "आणि", "मध्ये", "च्या", "हा", "ही"],
    "gu": ["છે", "અને", "માં", "કે", "ના"],
    "kn": ["ಇದು", "ಮತ್ತು", "ನಲ್ಲಿ", "ಒಂದು"]
} #spacy does not have stopwords for these languages

In [7]:
def preprocess_text(text, lang):
    factory = IndicNormalizerFactory()
    normalizer = factory.get_normalizer(lang)
    normalized_text = normalizer.normalize(text)

    #sentence and word tokenization
    sentences = sentence_tokenize.sentence_split(normalized_text, lang)
    tokens = []
    for s in sentences:
        tokens.extend(indic_tokenize.trivial_tokenize(s))

    #stopword removal
    if lang == "hi":
        stopwords = nlp.Defaults.stop_words
    else:
        stopwords = set(custom_stopwords.get(lang, []))

    filtered_tokens = [t for t in tokens if t not in stopwords and t.strip() != ""]
    return filtered_tokens

In [8]:
for lang, filename in language_files.items():
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    tokens = preprocess_text(text, lang)
    print(f"\n==== {lang.upper()} ({filename}) ====")
    print("Total tokens:", len(tokens))
    print("Sample tokens:", tokens[:20])


==== HI (hindi.txt) ====
Total tokens: 1251
Sample tokens: ['इसराइली', 'मीडिया', 'ने', 'ग़ज़ा', 'सीज़फ़ायर', 'पर', 'चल', 'रही', 'बातचीत', 'के', 'प्रति', 'अमेरिकी', 'राष्ट्रपति', 'डोनाल्ड', 'ट्रंप', 'के', "'", 'सकारात्मक', 'रुख़', "'"]

==== MR (marathi.txt) ====
Total tokens: 917
Sample tokens: ['गावातल्या', 'आपल्या', 'अंगणवाडीचं', 'दार', 'सकाळी', 'नऊ', 'वाजता', 'उघडून', 'अंगणवाडी', 'ताई', 'दिवस', 'सुरु', 'करतात', '.', 'प्रत्यक्ष', 'दार', 'उघडल्यानंतर', 'पुढचा', 'टप्पा', 'असतो']

==== GU (gujarati.txt) ====
Total tokens: 1112
Sample tokens: ['વેપારક્ષેત્રે', 'આગળ', 'પડતું', 'નામ', 'ગણાતા', 'ગુજરાતમાં', 'શાળાકીય', 'શિક્ષણક્ષેત્રે', "'", 'ચિંતા', 'જન્માવે', "'", 'એવા', 'સમાચાર', 'થોડા', 'દિવસ', 'પહેલાં', 'સામે', 'આવ્યા', 'હતા']

==== KN (kannada.txt) ====
Total tokens: 164
Sample tokens: ['ಧರ್ಮಸ್ಥಳ', 'ಪ್ರಕರಣ', 'ಸಂಬಂಧ', 'ಬುರುಡೆ', 'ಚಿನ್ನಯ್ಯನ', 'ವಿರುದ್ಧ', 'ತನಿಖೆ', 'ಹಾಗೂ', 'ವಿಚಾರಣೆಯನ್ನು', 'ಬಹುತೇಕ', 'ಮುಕ್ತಾಯಗೊಳಿಸಿರುವ', 'ಎಸ್', 'ಐಟಿ', 'ಚಾರ್ಜ್', 'ಶೀಟ್', 'ಸಿದ್ಧಪಡಿಸುತ್ತಿದೆ', '.', 'ಈ', 'ತಿಂಗಳ', 'ಕ

In [9]:
for lang, filename in language_files.items():
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = preprocess_text(text, lang)
    with open(f"processed_{lang}.txt", "w", encoding="utf-8") as f:
        f.write(" ".join(tokens))
