# Word-Sense Disambiguation


In [1]:
# Setup (run once if needed): %pip install nltk spacy
import warnings

from nltk import download, wsd
from nltk.corpus import wordnet as wn
from spacy import load
from spacy.cli import download as spacy_download

# spaCy's model downloader is imported under an alias so that it is not
# shadowed by nltk's `download`, which this cell calls below.
# spacy_download('en_core_web_sm')  # run once if the model is not installed
nlp = load('en_core_web_sm')

# Make sure the WordNet corpus behind `wn` is available locally.
download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pi0322945\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Three example sentences that all contain the ambiguous word "die".
X = 'The die is cast.'
Y = 'Roll the die to get a 6.'
Z = 'What is dead may never die.'

# Every WordNet synset for "die", followed by the noun-only subset.
all_senses = wn.synsets('die')
noun_senses = wn.synsets('die', pos=wn.NOUN)
print(all_senses)
print(noun_senses)

# One gloss per noun sense (use pos=wn.VERB above to inspect the verb senses).
for sense in noun_senses:
    print(sense.definition())

[Synset('die.n.01'), Synset('die.n.02'), Synset('die.n.03'), Synset('die.v.01'), Synset('die.v.02'), Synset('die.v.03'), Synset('fail.v.04'), Synset('die.v.05'), Synset('die.v.06'), Synset('die.v.07'), Synset('die.v.08'), Synset('die.v.09'), Synset('die.v.10'), Synset('die.v.11')]
[Synset('die.n.01'), Synset('die.n.02'), Synset('die.n.03')]
a small cube with 1 to 6 spots on the six faces; used in gambling to generate random numbers
a device used for shaping metal
a cutting tool that is fitted into a diestock and used for cutting male (external) screw threads on screws or bolts or pipes or rods


# Word-Sense Disambiguation with the Lesk Algorithm
## Pass the POS tag to get a more accurate sense

In [15]:
print(X)
# Only the final expression of a cell is echoed: the sense chosen for X is
# computed but not shown, and the displayed gloss belongs to Y.
sense_x = wsd.lesk(X.split(), 'die')
sense_y = wsd.lesk(Y.split(), 'die')
sense_y.definition()

The die is cast.


'to be on base at the end of an inning, of a player'

In [5]:
# Noun-restricted Lesk for X and Y. These glosses are computed but not echoed,
# because only the last expression of a cell is displayed.
gloss_x_noun = wsd.lesk(X.split(), 'die', pos=wn.NOUN).definition()
print(Y)
gloss_y_noun = wsd.lesk(Y.split(), 'die', pos=wn.NOUN).definition()
print(Z)
# For Z, run unrestricted Lesk and then the verb-restricted variant; the
# verb-restricted gloss is the value this cell displays.
gloss_z_any = wsd.lesk(Z.split(), 'die').definition()
wsd.lesk(Z.split(), 'die', pos=wn.VERB).definition()


The die is cast.
Roll the die to get a 6.
What is dead may never die.


'stop operating or functioning'

# Automatic POS Tagging + Lesk with spaCy

In [6]:
# Lookup from a token's `pos_` string to the matching WordNet POS constant.
# 'PROPN' shares the noun constant with 'NOUN'; any other tag has no entry.
POS_MAP = dict(
    VERB=wn.VERB,
    NOUN=wn.NOUN,
    PROPN=wn.NOUN,
)


def lesk(doc, word):
    """Run Lesk on ``word`` inside ``doc``, restricted by the token's POS tag.

    The first token of ``doc`` whose ``text`` equals ``word`` exactly
    (case-sensitive) is located, its ``pos_`` is translated through
    ``POS_MAP``, and ``wsd.lesk`` is called with every token text of ``doc``
    as the context.

    Args:
        doc: An iterable of tokens exposing ``text`` and ``pos_``, and itself
            exposing ``text`` (e.g. the result of ``nlp(...)``).
        word: Surface form to disambiguate, as a string.

    Returns:
        Whatever ``wsd.lesk`` returns for the context, word and POS.

    Raises:
        ValueError: If no token in ``doc`` has ``text == word``.

    Warns:
        UserWarning: If the token's POS tag has no entry in ``POS_MAP``; Lesk
            is then run without a POS restriction.
    """
    # Keep the `word` parameter a string; the matched token gets its own name.
    token = next((tok for tok in doc if tok.text == word), None)
    if token is None:
        raise ValueError(f'Word \"{word}\" does not appear in the document: {doc.text}.')

    # Use None rather than False as the "no POS restriction" value handed to wsd.lesk.
    pos = POS_MAP.get(token.pos_)
    if pos is None:
        # stacklevel=2 attributes the warning to the caller's line, not this helper.
        warnings.warn(
            f'POS tag for {token.text} not found in wordnet. Falling back to default Lesk behaviour.',
            stacklevel=2,
        )

    context = [tok.text for tok in doc]
    return wsd.lesk(context, token.text, pos=pos)

In [9]:
# Parse the sentence with the loaded spaCy pipeline, then disambiguate "die"
# with the POS-aware `lesk` helper defined in this notebook.
doc = nlp('Roll the die to get a 6.')
lesk(doc, 'die')

Synset('die.n.01')

In [10]:
# Gloss of the synset chosen for "die" in `doc` (the parsed sentence from the previous cell).
lesk(doc, 'die').definition()

'a small cube with 1 to 6 spots on the six faces; used in gambling to generate random numbers'

In [11]:
# "google" in a noun position; the sense is picked using the POS tag spaCy assigns to it.
lesk(nlp('I work at google.'), 'google').definition()

"a widely used search engine that uses text-matching techniques to find web pages that are important and relevant to a user's search"

In [12]:
# Same surface word, now in a verb position, to show that the chosen sense follows the POS tag.
lesk(nlp('I will google it.'), 'google').definition()

'search the internet (for information) using the Google search engine'

# Synsets


In [17]:
# All senses of "tractor". Only the final expression of a cell is echoed, so
# the bare expressions in this cell are evaluated without being displayed.
tractor_senses = wn.synsets('tractor')
tractor_senses

# Definitions of senses
[sense.definition() for sense in tractor_senses]

# Hypernyms: relation between a concept and its superordinate
tractor = wn.synset('tractor.n.01')
tractor.hypernyms()
self_propelled_vehicle = wn.synset('self-propelled_vehicle.n.01')
self_propelled_vehicle.hypernyms()

# Meronyms: relation between a part and its whole
wheeled_vehicle = wn.synset('wheeled_vehicle.n.01')
wheeled_vehicle.part_meronyms()

# Hyponyms: relation between a concept and its subordinate
wheeled_vehicle.hyponyms()

# Holonyms: relation between a whole and its parts
axle = wn.synset('axle.n.01')
axle.part_holonyms()

# Walk down from self-propelled vehicle to motor vehicle to car; the parts
# of "car" are the value this cell displays.
self_propelled_vehicle.hyponyms()
motor_vehicle = wn.synset('motor_vehicle.n.01')
motor_vehicle.hyponyms()
car = wn.synset('car.n.01')
car.part_meronyms()

[Synset('accelerator.n.01'),
 Synset('air_bag.n.01'),
 Synset('auto_accessory.n.01'),
 Synset('automobile_engine.n.01'),
 Synset('automobile_horn.n.01'),
 Synset('buffer.n.06'),
 Synset('bumper.n.02'),
 Synset('car_door.n.01'),
 Synset('car_mirror.n.01'),
 Synset('car_seat.n.01'),
 Synset('car_window.n.01'),
 Synset('fender.n.01'),
 Synset('first_gear.n.01'),
 Synset('floorboard.n.02'),
 Synset('gasoline_engine.n.01'),
 Synset('glove_compartment.n.01'),
 Synset('grille.n.02'),
 Synset('high_gear.n.01'),
 Synset('hood.n.09'),
 Synset('luggage_compartment.n.01'),
 Synset('rear_window.n.01'),
 Synset('reverse.n.02'),
 Synset('roof.n.02'),
 Synset('running_board.n.01'),
 Synset('stabilizer_bar.n.01'),
 Synset('sunroof.n.01'),
 Synset('tail_fin.n.02'),
 Synset('third_gear.n.01'),
 Synset('window.n.02')]

In [19]:
# Part meronyms of `wheeled_vehicle` (the synset bound in the Synsets cell above).
wheeled_vehicle.part_meronyms()


[Synset('axle.n.01'),
 Synset('brake.n.01'),
 Synset('splasher.n.01'),
 Synset('wheel.n.01')]

In [20]:
# Hyponyms of `self_propelled_vehicle` (the synset bound in the Synsets cell above).
self_propelled_vehicle.hyponyms()


[Synset('armored_vehicle.n.01'),
 Synset('carrier.n.02'),
 Synset('forklift.n.01'),
 Synset('locomotive.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('personnel_carrier.n.01'),
 Synset('reconnaissance_vehicle.n.01'),
 Synset('recreational_vehicle.n.01'),
 Synset('streetcar.n.01'),
 Synset('tracked_vehicle.n.01'),
 Synset('tractor.n.01'),
 Synset('weapons_carrier.n.01')]

## Analyse how word embeddings vary across corpora

In [3]:
from nltk.corpus import brown,movie_reviews

In [4]:
# NOTE(review): `Word2Vec` is not imported in any visible cell, and the recorded
# run of this cell raised NameError. Presumably this is gensim.models.Word2Vec —
# TODO confirm and add the import to the notebook's import cell.
# One model per corpus, both built with sg=1; the movie-review model also passes window=5.
model_brown = Word2Vec(brown.sents(),sg=1)
model_movie = Word2Vec(movie_reviews.sents(),sg =1, window = 5)

NameError: name 'Word2Vec' is not defined

In [5]:
# Top-5 results of `most_similar` for "money" in the Brown-corpus model.
# Depends on `model_brown` from the previous cell, which failed in the recorded run.
model_brown.wv.most_similar('money',topn= 5)

NameError: name 'model_brown' is not defined