<a href="https://colab.research.google.com/github/vruddhis/nlp/blob/main/ngrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

In [2]:
import re
import spacy
from collections import Counter
from itertools import islice


!python -m spacy download xx_ent_wiki_sm
nlp_hi = spacy.load("xx_ent_wiki_sm")

language_files = {
    "hi": "/content/hindi.txt",
    "mr": "/content/marathi.txt",
    "gu": "/content/gujarati.txt",
    "kn": "/content/kannada.txt"
}


Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m90.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
def preprocess_text(text, lang):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()

    if lang == "hi":
        doc = nlp_hi(text)
        tokens = [t.text for t in doc if not t.is_stop and not t.is_punct and t.is_alpha]
    else:
        tokens = [t for t in text.split() if len(t) > 1]
    return tokens


In [4]:
def generate_ngrams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]


In [5]:
ngrams_stats = {}

for lang, filename in language_files.items():
    with open(filename, "r", encoding="utf-8") as f:
        text = f.read()

    tokens = preprocess_text(text, lang)
    bigrams = generate_ngrams(tokens, 2)
    trigrams = generate_ngrams(tokens, 3)

    ngrams_stats[lang] = {
        "bigrams": Counter(bigrams),
        "trigrams": Counter(trigrams)
    }

    print(f"\n=== {lang.upper()} ===")
    print("Top 10 Bigrams:")
    print(ngrams_stats[lang]["bigrams"].most_common(10))
    print("\nTop 10 Trigrams:")
    print(ngrams_stats[lang]["trigrams"].most_common(10))



=== HI ===
Top 10 Bigrams:
[('बधक क', 12), ('रह ह', 11), ('ह क', 10), ('क मतबक', 8), ('टरप क', 7), ('बतचत क', 6), ('पस पलन', 6), ('गय ह', 6), ('नतनयह क', 6), ('टरप न', 5)]

Top 10 Trigrams:
[('इसरइल बधक क', 4), ('पस पलन क', 4), ('कर रह ह', 4), ('खतम हन क', 4), ('करन क लए', 3), ('बधक क शव', 3), ('क शव क', 3), ('क मतबक इसरइल', 3), ('क मतबक टरप', 3), ('क बतचत क', 3)]

=== MR ===
Top 10 Bigrams:
[('अगणवड तई', 10), ('तई महणलय', 5), ('पषण आहर', 4), ('लडक बहण', 3), ('एक अगणवड', 3), ('तयचयवर टकणयत', 3), ('अगणवड तईन', 2), ('लकष दण', 2), ('कल जत', 2), ('गलय कह', 2)]

Top 10 Trigrams:
[('अगणवड तई महणलय', 4), ('एक अगणवड तई', 3), ('गलय कह महनयमधय', 2), ('लडक बहण यजनतलय', 2), ('असलयच अगणवड तई', 2), ('एकतमक बल वकस', 2), ('बल वकस सव', 2), ('वकस सव यजन', 2), ('अनक यजन रबवणयच', 2), ('आह दररज करयकरम', 2)]

=== GU ===
Top 10 Bigrams:
[('રષટરય સરરશ', 12), ('સરરશ ટક', 12), ('ટક હત', 11), ('સરરશ કરત', 9), ('ગજરતન સરરશ', 8), ('વદયરથઓન દખવ', 6), ('હત જમ', 5), ('ટકન રષટરય', 5), ('સમ ગજરતન', 5), ('આવય હત', 4)]
