<a href="https://colab.research.google.com/github/veer66/lingua-lab/blob/main/NLTK_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

NLTK - Examples

In [1]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.9.1
    Uninstalling nltk-3.9.1:
      Successfully uninstalled nltk-3.9.1
Successfully installed nltk-3.9.2


In [2]:
# Download the complete 'all' collection of NLTK data packages (corpora, taggers, ...).
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

See the NLTK HOWTO guides: https://www.nltk.org/howto.html

In [3]:
from nltk import Nonterminal, nonterminals, Production, CFG
from nltk.parse import RecursiveDescentParser

In [4]:
# Small context-free grammar: a sentence (S) is a noun phrase (NP) followed by
# a verb phrase (VP); the quoted symbols are the terminal words.
grammar = CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> 'the' N | N PP | 'the' N PP
VP -> V NP | V PP | V NP PP
N -> 'cat'
N -> 'dog'
N -> 'rug'
V -> 'chased'
V -> 'sat'
P -> 'in'
P -> 'on'
""")

# Build a recursive-descent parser over the grammar.
rd = RecursiveDescentParser(grammar)
# Whitespace-split the sentence into the token list handed to the parser.
sentence1 = 'the cat chased the dog'.split()

# Print every parse tree the parser yields for the sentence.
for t in rd.parse(sentence1):
  print(t)

(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))


In [5]:
from nltk.tag import PerceptronTagger
# Created with default arguments; later cells reuse this object via the `tagger` name.
tagger = PerceptronTagger()

In [7]:
# Tag whitespace-split tokens. str.split() leaves punctuation attached to the
# words, so the tagger sees 'Oh,' and 'great.' as single tokens (see output).
tagger.tag("Oh, it is great.".split())

[('Oh,', 'IN'), ('it', 'PRP'), ('is', 'VBZ'), ('great.', 'JJ')]

In [12]:
from nltk.stem.snowball import SnowballStemmer

In [9]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer("english")

In [11]:
# Stem the word "studied"; the printed stem ('studi') is not a dictionary word.

print(stemmer.stem("studied"))

studi


Lemmatizer

In [17]:
from nltk.stem import WordNetLemmatizer

In [20]:
# Shared lemmatizer instance; later cells call it through the `lemmatizer` name.
lemmatizer = WordNetLemmatizer()
# First argument is the surface form, second is the part of speech ("n").
lemmatizer.lemmatize("mice", "n")

'mouse'

In [56]:
def penn_tag_to_wordnet_tag(penn_tag):
  first_letter = penn_tag[0]

  match first_letter:
    case "V":
      return "v"
    case "J":
      return "a"
    case "R":
      return "r"
    case _:
      return "n"

print(penn_tag_to_wordnet_tag("VBZ"))

v


In [32]:
from nltk.tokenize import word_tokenize

# word_tokenize separates the final period into its own token (see output).
word_tokenize("I saw a cat.")

['I', 'saw', 'a', 'cat', '.']

In [37]:
def convert_pos_to_wordnet_pos(annotated_sentence: list[tuple[str, str]]) -> list[tuple[str,str]]:
  """Replace each token's Penn Treebank tag with its WordNet POS letter.

  Args:
    annotated_sentence: (surface form, Penn tag) pairs.

  Returns:
    A new list of (surface form, WordNet POS) pairs in the same order.
  """
  return [
      (token, penn_tag_to_wordnet_tag(penn_tag))
      for token, penn_tag in annotated_sentence
  ]

# Show the Penn Treebank tags first, then the same tokens with WordNet tags.
tagged_example = tagger.tag(word_tokenize("I saw a cat."))
print(tagged_example)
print(convert_pos_to_wordnet_pos(tagged_example))


[('I', 'PRP'), ('saw', 'VBD'), ('a', 'DT'), ('cat', 'NN'), ('.', '.')]
[('I', 'n'), ('saw', 'v'), ('a', 'n'), ('cat', 'n'), ('.', 'n')]


In [39]:
def lemmatize_sentence(annotated_sentence: list[tuple[str,str]]) -> list[tuple[str,str,str]]:
  """Attach a lemma to every (surface form, WordNet POS) pair.

  Uses the module-level `lemmatizer`, passing the POS of each token.

  Args:
    annotated_sentence: (surface form, WordNet POS) pairs.

  Returns:
    A new list of (surface form, WordNet POS, lemma) triples in the same order.
  """
  return [
      (token, wordnet_pos, lemmatizer.lemmatize(token, wordnet_pos))
      for token, wordnet_pos in annotated_sentence
  ]


# Full pipeline, innermost call first:
# tokenize -> POS-tag -> convert tags to WordNet -> lemmatize.
lemmatize_sentence(
    convert_pos_to_wordnet_pos(
        tagger.tag(
            word_tokenize("I studied here."))))

[('I', 'n', 'I'),
 ('studied', 'v', 'study'),
 ('here', 'r', 'here'),
 ('.', 'n', '.')]

In [47]:
# Not bundled with the notebook: upload corpus.txt to the working directory first.
SENTENCE_FILENAME = "corpus.txt" # Need to upload separately

# Decode explicitly so the result does not depend on the platform's default
# locale encoding (assumes the uploaded corpus is UTF-8 -- confirm).
# Each entry is one line of the file and keeps its trailing newline.
with open(SENTENCE_FILENAME, encoding="utf-8") as f:
  sentences = f.readlines()

In [57]:
# Run every corpus line through the pipeline:
# tokenize -> POS-tag -> convert tags to WordNet -> lemmatize.
lemmatized_sentences = [
    lemmatize_sentence(
        convert_pos_to_wordnet_pos(
            tagger.tag(word_tokenize(sentence))))
    for sentence in sentences
]

print(lemmatized_sentences)

[[('Word', 'n', 'Word')], [('dis', 'n', 'dis')], [('dis-', 'n', 'dis-')], [('Dis', 'n', 'Dis')], [('disa', 'n', 'disa')], [('disability', 'n', 'disability')], [('disability', 'n', 'disability'), ('clause', 'n', 'clause')], [('disability', 'n', 'disability'), ('insurance', 'n', 'insurance')], [('disable', 'a', 'disable')], [('disabled', 'a', 'disabled')], [('disabled', 'a', 'disabled'), ('list', 'n', 'list')], [('disabled', 'a', 'disabled'), ('lists', 'n', 'list')], [('disablement', 'n', 'disablement')], [('disabler', 'n', 'disabler')], [('disabling', 'v', 'disable')], [('disabuse', 'n', 'disabuse')], [('disabused', 'v', 'disabuse')], [('disabuses', 'n', 'disabuses')], [('disabusing', 'v', 'disabuse')], [('disaccharidase', 'n', 'disaccharidase')], [('disaccharide', 'n', 'disaccharide')], [('disaccommodate', 'n', 'disaccommodate')], [('disaccord', 'n', 'disaccord')], [('disaccorded', 'v', 'disaccord')], [('disaccording', 'v', 'disaccord')], [('disaccords', 'n', 'disaccords')], [('disaccr

In [58]:
# Print one lemmatized sentence (a list of (surface, pos, lemma) tuples) per line.
for sentence in lemmatized_sentences:
  print(sentence)

[('Word', 'n', 'Word')]
[('dis', 'n', 'dis')]
[('dis-', 'n', 'dis-')]
[('Dis', 'n', 'Dis')]
[('disa', 'n', 'disa')]
[('disability', 'n', 'disability')]
[('disability', 'n', 'disability'), ('clause', 'n', 'clause')]
[('disability', 'n', 'disability'), ('insurance', 'n', 'insurance')]
[('disable', 'a', 'disable')]
[('disabled', 'a', 'disabled')]
[('disabled', 'a', 'disabled'), ('list', 'n', 'list')]
[('disabled', 'a', 'disabled'), ('lists', 'n', 'list')]
[('disablement', 'n', 'disablement')]
[('disabler', 'n', 'disabler')]
[('disabling', 'v', 'disable')]
[('disabuse', 'n', 'disabuse')]
[('disabused', 'v', 'disabuse')]
[('disabuses', 'n', 'disabuses')]
[('disabusing', 'v', 'disabuse')]
[('disaccharidase', 'n', 'disaccharidase')]
[('disaccharide', 'n', 'disaccharide')]
[('disaccommodate', 'n', 'disaccommodate')]
[('disaccord', 'n', 'disaccord')]
[('disaccorded', 'v', 'disaccord')]
[('disaccording', 'v', 'disaccord')]
[('disaccords', 'n', 'disaccords')]
[('disaccredit', 'n', 'disaccredit')]