### Tokenization

In [3]:
from nltk.tokenize import sent_tokenize
import nltk
# nltk.download('punkt')

# Sample paragraph used by the tokenization examples below. Adjacent string
# literals are concatenated at compile time, so (unlike a backslash
# continuation placed inside the quotes) the source indentation is not
# embedded in the text and sentences are separated by exactly one space.
myself = (
    "Hi, my name is Deepak Mittal and I am 24 years old. "
    "I currently work as a Data Scientist and I am originally from India. "
    "In my free time, I enjoy reading books. It's nice to meet you!"
)

# Split the sample paragraph into a list of sentence strings; as the last
# expression of the cell, the list is displayed as the cell output.
sent_tokenize(myself)

['Hi, my name is Deepak Mittal and I am 24 years old.',
 'I currently work as a Data Scientist and I am originally from India.',
 'In my free time, I enjoy reading books.',
 "It's nice to meet you!"]

In [4]:
from nltk.tokenize import word_tokenize

# Tokenize only the first sentence of the paragraph into words; punctuation
# marks such as "," and "." come back as separate tokens.
word_tokenize(sent_tokenize(myself)[0])

['Hi',
 ',',
 'my',
 'name',
 'is',
 'Deepak',
 'Mittal',
 'and',
 'I',
 'am',
 '24',
 'years',
 'old',
 '.']

In [5]:
from nltk.tokenize import regexp_tokenize

text = "I can't give up!"
# Raw string: "\w" is not a valid str escape, so a plain literal triggers an
# invalid-escape warning on current Python versions. The pattern matches runs
# of word characters and apostrophes, keeping contractions in one piece.
pattern = r"[\w']+"
# Every match of the pattern becomes a token, so "can't" stays whole and the
# trailing "!" (which the pattern does not match) is dropped.
regexp_tokenize(text, pattern)

['I', "can't", 'give', 'up']

In [6]:
# Raw string: "\s" is not a valid str escape, so a plain literal triggers an
# invalid-escape warning on current Python versions. Matches one or more
# consecutive whitespace characters.
whitespace_pattern = r"\s+"
text = "I can't give up!"
# Tokenize on whitespace. gaps=True makes the pattern describe the separators
# between tokens rather than the tokens themselves, so punctuation stays
# attached to its word ("up!"). The call previously passed the earlier
# word-character `pattern`, leaving `whitespace_pattern` unused.
regexp_tokenize(text=text, pattern=whitespace_pattern, gaps=True)

['I', "can't", 'give', 'up!']

In [14]:
# Training a custom tokenizer

from nltk.tokenize import PunktSentenceTokenizer #uses unsupervised approach
from nltk.corpus import webtext

# nltk.download('webtext')
# text = webtext.raw('./chatgpt-generated-text.txt')
# Read the whole training corpus into memory. The encoding is passed
# explicitly so the result does not depend on the platform's locale default
# (assumes the file is UTF-8 -- confirm if it was saved differently).
with open('./chatgpt-generated-text.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# Passing the raw text to the constructor trains the Punkt model on it (no
# labelled data needed); the same text is then split with the trained model.
sent_tokenizer = PunktSentenceTokenizer(text)
# Show only the first five sentences to keep the cell output short.
sent_tokenizer.tokenize(text)[:5]

["Girl: Hey, how's it going?",
 'Boy: Not too bad, how about you?',
 "Girl: I'm good, thanks for asking.",
 'What have you been up to lately?',
 'Boy: Not much, just working and hanging out with friends.']

### Stopwords

In [16]:
from nltk.corpus import stopwords

# nltk.download('stopwords')
# List the file ids of the stopwords corpus -- one per available language.
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [18]:
# Peek at the first 20 entries of the Hinglish stopword list.
stopwords.words('hinglish')[:20]

['a',
 'aadi',
 'aaj',
 'aap',
 'aapne',
 'aata',
 'aati',
 'aaya',
 'aaye',
 'ab',
 'abbe',
 'abbey',
 'abe',
 'abhi',
 'able',
 'about',
 'above',
 'accha',
 'according',
 'accordingly']

### Synsets

In [40]:
from nltk.corpus import wordnet
# nltk.download('wordnet')

# Look up every WordNet synset (sense) for the word, across all parts of
# speech. `syn` is reused by the cells that follow.
word = 'machine'
syn = wordnet.synsets(word)
syn

[Synset('machine.n.01'),
 Synset('machine.n.02'),
 Synset('machine.n.03'),
 Synset('machine.n.04'),
 Synset('machine.n.05'),
 Synset('car.n.01'),
 Synset('machine.v.01'),
 Synset('machine.v.02')]

In [46]:
# Print the usage examples stored for each sense of the word, prefixed with
# the sense's position in `syn`; an empty list means no example is recorded.
for i, sense in enumerate(syn):
    print(i, sense.examples())

0 []
1 ['the boxer was a magnificent fighting machine']
2 ['the war machine']
3 []
4 ['he was endorsed by the Democratic machine']
5 ['he needs a car to get to work']
6 []
7 ['The Americans were machining while others still hand-made cars']


In [43]:
# Identifier of the first synset, in the form "<lemma>.<pos>.<sense number>".
syn[0].name()

'machine.n.01'

In [49]:
# Print each sense alongside its dictionary-style definition, prefixed with
# the sense's position in `syn`.
for i, sense in enumerate(syn):
    print(i, '---', sense, '---', sense.definition())

0 --- Synset('machine.n.01') --- any mechanical or electrical device that transmits or modifies energy to perform or assist in the performance of human tasks
1 --- Synset('machine.n.02') --- an efficient person
2 --- Synset('machine.n.03') --- an intricate organization that accomplishes its goals efficiently
3 --- Synset('machine.n.04') --- a device for overcoming resistance at one point by applying force at some other point
4 --- Synset('machine.n.05') --- a group that controls the activities of a political party
5 --- Synset('car.n.01') --- a motor vehicle with four wheels; usually propelled by an internal combustion engine
6 --- Synset('machine.v.01') --- turn, shape, mold, or otherwise finish by machinery
7 --- Synset('machine.v.02') --- make by machinery


In [50]:
# `pos` is a method, so it must be called; without the parentheses the cell
# only displays the bound-method object instead of the part-of-speech tag.
syn[0].pos()

'n'

In [51]:
# Restrict the lookup to noun senses, using WordNet's named part-of-speech
# constant instead of the bare tag letter.
wordnet.synsets('computer', pos=wordnet.NOUN)

[Synset('computer.n.01'), Synset('calculator.n.01')]

In [53]:
# Lemmas (word forms) that belong to the first synset.
syn[0].lemmas()

[Lemma('machine.n.01.machine')]

In [62]:
# All synonyms of a word: walk every synset of 'computer' and print the name
# of each lemma it contains. A name can appear more than once when the same
# word form belongs to several synsets.
computer_lemmas = (
    lemma
    for synset in wordnet.synsets('computer')
    for lemma in synset.lemmas()
)
for lemma in computer_lemmas:
    print(lemma.name())

computer
computing_machine
computing_device
data_processor
electronic_computer
information_processing_system
calculator
reckoner
figurer
estimator
computer


In [67]:
# All antonyms of a word: antonyms are attached to lemmas rather than to
# synsets, so visit every lemma of every synset of 'smooth' and print the
# name of each antonym found. Repeats are printed as they occur.
smooth_antonyms = (
    antonym
    for synset in wordnet.synsets('smooth')
    for lemma in synset.lemmas()
    for antonym in lemma.antonyms()
)
for antonym in smooth_antonyms:
    print(antonym.name())

roughen
rough
rough
staccato
rough


In [69]:
## Word similarity

# Fetch two specific senses by their full synset names (not by bare word).
w1 = wordnet.synset('computer.n.01')
w2 = wordnet.synset('machine.n.01')

# Wu-Palmer similarity between the two senses; presumably in (0, 1], with
# higher meaning more closely related -- confirm against the NLTK docs.
w1.wup_similarity(w2)

0.9411764705882353