In [1]:
import nltk
from nltk.corpus import wordnet
print(wordnet.synsets("computer"))
# definition and example of the word ‘computer’
print(wordnet.synset("computer.n.01").definition())

#examples
print("Examples:", wordnet.synset("computer.n.01").examples())

#get Antonyms
print(wordnet.lemma('buy.v.01.buy').antonyms())

[Synset('computer.n.01'), Synset('calculator.n.01')]
a machine for performing calculations automatically
Examples: []
[Lemma('sell.v.01.sell')]


In [2]:
import nltk
from nltk.corpus import wordnet
print(wordnet.synsets("computer"))
print(wordnet.synset("computer.n.01").lemma_names())

#all lemmas for each synset.
for e in wordnet.synsets("computer"):
  print(f'{e} --> {e.lemma_names()}')

#print all lemmas for a given synset
print(wordnet.synset('computer.n.01').lemmas())

#get the synset corresponding to lemma
print(wordnet.lemma('computer.n.01.computing_device').synset())

#Get the name of the lemma
print(wordnet.lemma('computer.n.01.computing_device').name())

#Hyponyms give abstract concepts of the word that are much more specific
#the list of hyponyms words of the computer
syn = wordnet.synset('computer.n.01')
print(syn.hyponyms)
print([lemma.name() for synset in syn.hyponyms() for lemma in synset.lemmas()])

#the semantic similarity in WordNet
vehicle = wordnet.synset('vehicle.n.01')
car = wordnet.synset('car.n.01')
print(car.lowest_common_hypernyms(vehicle))


[Synset('computer.n.01'), Synset('calculator.n.01')]
['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system']
Synset('computer.n.01') --> ['computer', 'computing_machine', 'computing_device', 'data_processor', 'electronic_computer', 'information_processing_system']
Synset('calculator.n.01') --> ['calculator', 'reckoner', 'figurer', 'estimator', 'computer']
[Lemma('computer.n.01.computer'), Lemma('computer.n.01.computing_machine'), Lemma('computer.n.01.computing_device'), Lemma('computer.n.01.data_processor'), Lemma('computer.n.01.electronic_computer'), Lemma('computer.n.01.information_processing_system')]
Synset('computer.n.01')
computing_device
<bound method _WordNetObject.hyponyms of Synset('computer.n.01')>
['analog_computer', 'analogue_computer', 'digital_computer', 'home_computer', 'node', 'client', 'guest', 'number_cruncher', 'pari-mutuel_machine', 'totalizer', 'totaliser', 'totalizator', 'totalisator', 'predi

In [3]:
from nltk.corpus import wordnet

# Retrieve synsets for the word "active"
# retrieves all the synsets (sets of cognitive synonyms) for the word "active". A synset contains a group of synonyms that share a common meaning. The result is a list of synsets.
print( wordnet.synsets("active"))

# Find antonyms for a specific lemma
print(wordnet.lemma('active.a.01.active').antonyms()) #This retrieves a specific lemma (a word form with a specific sense) from the synset active.a.01. Here, 'active' is the lemma name, 'a' stands for adjective, and '01' is the sense number.

[Synset('active_agent.n.01'), Synset('active_voice.n.01'), Synset('active.n.03'), Synset('active.a.01'), Synset('active.s.02'), Synset('active.a.03'), Synset('active.s.04'), Synset('active.a.05'), Synset('active.a.06'), Synset('active.a.07'), Synset('active.s.08'), Synset('active.a.09'), Synset('active.a.10'), Synset('active.a.11'), Synset('active.a.12'), Synset('active.a.13'), Synset('active.a.14')]
[Lemma('inactive.a.02.inactive')]


In [4]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

# Get synsets for 'football' and 'soccer'
syn1 = wordnet.synsets('football')
syn2 = wordnet.synsets('soccer')

# A word may have multiple synsets, so need to compare each synset of word1 with synset of word2
for s1 in syn1:
  for s2 in syn2:
    print("Path similarity of: ")
    print(s1, '(', s1.pos(), ')', '[', s1.definition(), ']') #Each synset has a name (e.g., Synset('football.n.01')), part of speech (e.g., noun 'n'), and a definition.
    print(s2, '(', s2.pos(), ')', '[', s2.definition(), ']')
    print(" is", s1.path_similarity(s2))
    print()

Path similarity of: 
Synset('football.n.01') ( n ) [ any of various games played with a ball (round or oval) in which two teams try to kick or carry or propel the ball into each other's goal ]
Synset('soccer.n.01') ( n ) [ a football game in which two teams of 11 players try to kick or head a ball into the opponents' goal ]
 is 0.5

Path similarity of: 
Synset('football.n.02') ( n ) [ the inflated oblong ball used in playing American football ]
Synset('soccer.n.01') ( n ) [ a football game in which two teams of 11 players try to kick or head a ball into the opponents' goal ]
 is 0.05



[nltk_data] Downloading package wordnet to C:\Users\Default.DESKTOP-
[nltk_data]     UBLD6TJ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

text = "Hello this is captain speaking."
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in
stopwords.words()]
print(tokens_without_sw)


#add the word 'play' to the NLTK stop word collection
all_stopwords = stopwords.words('english')
all_stopwords.append('play')
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
print(tokens_without_sw)


#remove ‘not’ from stop word collection
all_stopwords.remove('not')
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
print(tokens_without_sw)

['Hello', 'captain', 'speaking', '.']
['Hello', 'captain', 'speaking', '.']
['Hello', 'captain', 'speaking', '.']


[nltk_data] Downloading package punkt to C:\Users\Default.DESKTOP-
[nltk_data]     UBLD6TJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Default.DESKTOP-
[nltk_data]     UBLD6TJ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords
text = "Hello this is not captain speaking."
print('Original text: ', text)
filtered_sentence = remove_stopwords(text)
print('\n After removing Stop words: ',filtered_sentence)

#The below line retrieves the default set of stop words from Gensim and prints them.
all_stopwords = gensim.parsing.preprocessing.STOPWORDS
print('\n Stop words in Gensim: ', all_stopwords)

#'''The following script adds likes and play to the list of stop words in Gensim:'''
from gensim.parsing.preprocessing import STOPWORDS
all_stopwords_gensim = STOPWORDS.union(set(['this', 'captain'])) # adding 'likes' and 'play' to stop words

text = "Hello this is captain speaking."
text_tokens = word_tokenize(text)

# Filter out the tokens that are in the new stop words set (including 'likes' and 'play')
tokens_without_sw = [word for word in text_tokens if not word in
all_stopwords_gensim]
print("\n After adding this & captain in stop word collection: ",tokens_without_sw)


Original text:  Hello this is not captain speaking.

 After removing Stop words:  Hello captain speaking.

 Stop words in Gensim:  frozenset({'least', 'even', 'sometimes', 'well', 'with', 'bottom', 'too', 'this', 'now', 'if', 'him', 'once', 'everywhere', 'take', 'has', 'a', 'see', 'it', 'seeming', 'put', 'you', 'becoming', 'down', 'against', 'part', 'six', 'just', 'its', 'become', 'nothing', 'other', 'ourselves', 'however', 'of', 'been', 'mill', 'every', 'neither', 'call', 'towards', 'didn', 'somehow', 'less', 'from', 'several', 'still', 'alone', 'have', 'he', 'besides', 'get', 'via', 'their', 'much', 'hundred', 'either', 'another', 'whereas', 'beforehand', 'ours', 'whom', 'really', 'our', 'always', 'km', 'first', 'for', 'off', 'sincere', 'hereby', 'as', 'everything', 'con', 'under', 'wherein', 'whose', 'whereafter', 'also', 'very', 'former', 'formerly', 'which', 'thereupon', 'more', 'latterly', 'cry', 'sixty', 'those', 'meanwhile', 'beyond', 'on', 'throughout', 'others', 'in', 'being'

In [4]:
# Remove the word 'not' from the existing set of Gensim stop words
from gensim.parsing.preprocessing import STOPWORDS
all_stopwords_gensim = STOPWORDS
sw_list = {"not"}

all_stopwords_gensim = STOPWORDS.difference(sw_list)
text = "Hello this is not captain speaking."
text_tokens = word_tokenize(text)

# Filter out the tokens again with 'not' removed from stop words set
tokens_without_sw = [word for word in text_tokens if not word in
all_stopwords_gensim]
print(tokens_without_sw)

NameError: name 'word_tokenize' is not defined

In [5]:
! pip install SpaCy
! python -m spacy download en

Collecting SpaCy
  Obtaining dependency information for SpaCy from https://files.pythonhosted.org/packages/92/fb/d1f0605e1e8627226c6c96053fe1632e9a04a3fbcd8b5d715528cb95eb97/spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from SpaCy)
  Obtaining dependency information for spacy-legacy<3.1.0,>=3.0.11 from https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from SpaCy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting 

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 1.7 MB/s eta 0:00:08
     --------------------------------------- 0.1/12.8 MB 991.0 kB/s eta 0:00:13
     ---------------------------------------- 0.1/12.8 MB 1.1 MB/s eta 0:00:12
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:11
      --------------------------------------- 0.3/12.8 MB 1.4 MB/s eta 0:00:10
      --------------------------------------- 0.3/12.8 MB 1.4 MB/s eta 0:00:10
      -------------------------------------- 0.3/12.8 MB 951.8 kB/s eta 0:00:14
      -------------------------------------- 0.3/12.8 MB 884.2 kB/s eta 0:00:15
     - ------------------------------------- 0.3/12.8 MB 840.2 kB/s eta 0:00:15
     - ------------------------------

In [6]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
sp = spacy.load('en_core_web_sm')

#add the word play to the NLTK stop word collection
all_stopwords = sp.Defaults.stop_words
all_stopwords.add("is")
text = "Hello this is not captain speaking."
text_tokens = word_tokenize(text)
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
print(tokens_without_sw)

#remove 'not' from stop word collection
all_stopwords.remove('not')
tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
print(tokens_without_sw)

['Hello', 'captain', 'speaking', '.']
['Hello', 'not', 'captain', 'speaking', '.']
