In [1]:
import nltk
from nltk.corpus import stopwords, names, wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import random

In [16]:
# Download the NLTK data packages used in this notebook. Packages that are
# already present are reported as up-to-date instead of being fetched again.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): both 'averaged_perceptron_tagger' and its '_eng' variant (and
# likewise 'punkt' and 'punkt_tab') are requested — presumably to cover
# different NLTK versions; confirm which ones are actually needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('names')
# Last expression of the cell, so its return value is what the notebook displays.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
# Exploratory: print the attribute names reported by nltk.corpus.
# NOTE(review): the recorded output lists only _LazyModule internals, so this
# does not enumerate the available corpora — consider dropping this cell.
print(nltk.corpus.__dir__())

['_LazyModule__lazymodule_locals', '_LazyModule__lazymodule_globals', '__name__', '_LazyModule__lazymodule_name', '_LazyModule__lazymodule_init', '__module__', '__doc__', '_LazyModule__lazymodule_loaded', '__init__', '_LazyModule__lazymodule_import', '__getattr__', '__setattr__', '__repr__', '__dict__', '__weakref__', '__new__', '__hash__', '__str__', '__getattribute__', '__delattr__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__reduce_ex__', '__reduce__', '__getstate__', '__subclasshook__', '__init_subclass__', '__format__', '__sizeof__', '__dir__', '__class__']


In [9]:
# List the file ids of the stopwords corpus; the recorded output shows one id
# per language (e.g. 'english', 'french').
print(stopwords.fileids())

['albanian', 'arabic', 'azerbaijani', 'basque', 'belarusian', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'tamil', 'turkish']


In [10]:
# Peek at the first ten entries of the stop-word list for the chosen language.
language = 'english'
print(stopwords.words(language)[:10])

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']


In [11]:
def remove_stopwords(text, lang='english'):
    """Return ``text`` with the stop words of ``lang`` removed.

    The text is split with ``word_tokenize``; every token whose lower-cased
    form appears in ``stopwords.words(lang)`` is dropped, and the remaining
    tokens are joined with single spaces. Tokens that are not stop words
    (including punctuation tokens) are kept with their original casing.

    Args:
        text: The string to filter.
        lang: Name of the stop-word list to use (default ``'english'``).

    Returns:
        A string made of the surviving tokens separated by single spaces.
    """
    tokens = word_tokenize(text)
    # Fetch the stop-word list once and keep it in a set. Calling
    # stopwords.words(lang) inside the comprehension repeats the lookup for
    # every token and tests membership with a linear scan of the list.
    stop_set = set(stopwords.words(lang))
    return ' '.join(token for token in tokens if token.lower() not in stop_set)

# Quick check on a sample sentence. In the recorded output the final '.' comes
# back as its own token, separated from the last word by a space.
sample_text = "This is a simple text with some stopwords."
print(remove_stopwords(sample_text))

simple text stopwords .


In [12]:
def filter_stopwords(word_list, lang='english'):
    """Return the items of ``word_list`` that are not stop words of ``lang``.

    Comparison is done on the lower-cased word; surviving words keep their
    original casing and order.

    Args:
        word_list: Iterable of already-tokenized words (strings).
        lang: Name of the stop-word list to use (default ``'english'``).

    Returns:
        A new list containing only the words that are not stop words.
    """
    # Fetch the stop-word list once and keep it in a set. Calling
    # stopwords.words(lang) inside the comprehension repeats the lookup for
    # every word and tests membership with a linear scan of the list.
    stop_set = set(stopwords.words(lang))
    return [word for word in word_list if word.lower() not in stop_set]

# Same filtering applied to a list of words that is already tokenized.
words = ["this", "is", "an", "example", "of", "stopwords", "removal"]
print(filter_stopwords(words))

['example', 'stopwords', 'removal']


In [13]:
def get_wordnet_info(word):
    """Print the WordNet senses found for ``word``.

    For every synset returned by ``wordnet.synsets(word)`` one line with the
    synset name and its definition is printed; when the synset has usage
    examples, a second, indented line lists them. Nothing is returned.
    """
    for sense in wordnet.synsets(word):
        headline = f"{sense.name()} - {sense.definition()}"
        print(headline)
        # Senses without usage examples only get the headline.
        if not sense.examples():
            continue
        print(f"  Examples: {sense.examples()}")

# Example lookup: print every WordNet sense found for "computer".
get_wordnet_info("computer")

computer.n.01 - a machine for performing calculations automatically
calculator.n.01 - an expert at calculation (or at operating calculating machines)


In [14]:
def get_synonyms_antonyms(word):
    """Collect synonym and antonym lemma names for ``word`` from WordNet.

    Every lemma of every synset returned by ``wordnet.synsets(word)``
    contributes its name to the synonym set (this includes ``word`` itself
    when it is one of the lemmas). All antonyms of each lemma contribute
    their names to the antonym set.

    Args:
        word: The word to look up.

    Returns:
        A ``(synonyms, antonyms)`` tuple of two sets of lemma names. Both sets
        are empty when no synset is found.
    """
    synonyms, antonyms = set(), set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
            # Keep every antonym of the lemma, not only the first one, so that
            # lemmas with several antonyms are fully represented.
            antonyms.update(antonym.name() for antonym in lemma.antonyms())
    return synonyms, antonyms

# Example: synonyms and antonyms gathered across all senses of "good".
print(get_synonyms_antonyms("good"))

({'practiced', 'commodity', 'beneficial', 'expert', 'safe', 'serious', 'sound', 'salutary', 'adept', 'dear', 'trade_good', 'in_effect', 'proficient', 'secure', 'dependable', 'honorable', 'skilful', 'right', 'ripe', 'effective', 'in_force', 'unspoilt', 'good', 'well', 'respectable', 'thoroughly', 'just', 'skillful', 'estimable', 'honest', 'soundly', 'undecomposed', 'full', 'goodness', 'unspoiled', 'near', 'upright'}, {'bad', 'evil', 'evilness', 'badness', 'ill'})


In [17]:
def pos_tagging(text):
    """Tokenize ``text`` with ``word_tokenize`` and return ``pos_tag``'s result for the tokens."""
    return pos_tag(word_tokenize(text))

# Example: tag each token of a short sentence with its part of speech.
print(pos_tagging("NLTK is a powerful library for NLP"))


[('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('powerful', 'JJ'), ('library', 'NN'), ('for', 'IN'), ('NLP', 'NNP')]


In [18]:
def get_similarity(noun1, noun2):
    """Return the Wu-Palmer similarity between two words looked up as nouns.

    Only the first noun synset of each word is compared. ``None`` is returned
    when either word has no noun synset.
    """
    senses_a = wordnet.synsets(noun1, pos=wordnet.NOUN)
    senses_b = wordnet.synsets(noun2, pos=wordnet.NOUN)
    # Nothing to compare unless both lookups produced at least one sense.
    if not senses_a or not senses_b:
        return None
    first_a, first_b = senses_a[0], senses_b[0]
    return first_a.wup_similarity(first_b)

# Example: similarity between the first noun senses of "cat" and "dog".
print(get_similarity("cat", "dog"))

0.8571428571428571


In [19]:
# NOTE(review): get_similarity restricts the lookup to noun synsets, so "run"
# and "jog" are compared through their first noun senses here, not as verbs.
print(get_similarity("run", "jog"))

0.23529411764705882


In [20]:
# Load the male and female first-name lists from the names corpus and report
# how many entries each list has.
male_names = names.words('male.txt')
female_names = names.words('female.txt')
print(f"Male names: {len(male_names)}, Female names: {len(female_names)}")

Male names: 2943, Female names: 5001


In [21]:
# Draw 8 male and 7 female names and remember which list each one was drawn
# from. Deriving the label afterwards with `name in male_names` would print
# 'Male' for any name that occurs in both lists, even when it was sampled from
# the female list, and would scan the whole male list for every name.
# NOTE(review): no random seed is set, so the sample differs on every run.
labelled_names = (
    [(name, 'Male') for name in random.sample(male_names, 8)]
    + [(name, 'Female') for name in random.sample(female_names, 7)]
)
random.shuffle(labelled_names)
# Keep `sampled_names` available as a plain list of names in shuffled order.
sampled_names = [name for name, _ in labelled_names]
for name, gender in labelled_names:
    print(f"{name}: {gender}")

Gianna: Female
Dorelia: Female
Cassaundra: Female
Avi: Male
Eliott: Male
Karsten: Male
Arabella: Female
Nettie: Female
Filmore: Male
Ambros: Male
Liam: Male
Magnum: Male
Tamera: Female
Melly: Female
Thatch: Male


In [22]:
def extract_last_letter(names_list):
    """Map each name to its final character.

    Args:
        names_list: Iterable of names (expected to be ``str``). Duplicate
            names collapse into a single dictionary key.

    Returns:
        A dict ``{name: last_character}``. An empty name maps to ``''``.
    """
    # Slice with [-1:] rather than index with [-1]: for an empty string the
    # slice yields '' whereas indexing would raise IndexError.
    return {name: name[-1:] for name in names_list}

# Example: last letter of a few hand-picked names.
print(extract_last_letter(["John", "Emily", "Alex", "Sophie"]))

{'John': 'n', 'Emily': 'y', 'Alex': 'x', 'Sophie': 'e'}
