# Lemmatization/WordNet

# Task 0. 
Execute the notebook and complete the listed exercises (the code between the CODE_START and CODE_END markers).

## Preparation

In [1]:
import nltk

#nltk.download('twitter_samples')

In [2]:
from nltk.corpus import twitter_samples

In [3]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

positive_tweets[27]

'Spiritual Ritual Festival (Népal)\nBeginning of Line-up :)\nIt is left for the line-up (y)\nSee more at:... http://t.co/QMNz62OEuc'

In [4]:
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[27])

['Spiritual', 'Ritual', 'Festival', '(', 'Népal', ')', 'Beginning', 'of', 'Line-up', ':)', 'It', 'is', 'left', 'for', 'the', 'line-up', '(', 'y', ')', 'See', 'more', 'at', ':', '...', 'http://t.co/QMNz62OEuc']


In [5]:
#nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag

In [6]:
pos_tag(tweet_tokens[27])

[('Spiritual', 'JJ'),
 ('Ritual', 'NNP'),
 ('Festival', 'NNP'),
 ('(', '('),
 ('Népal', 'NNP'),
 (')', ')'),
 ('Beginning', 'NNP'),
 ('of', 'IN'),
 ('Line-up', 'NNP'),
 (':)', 'NNP'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('left', 'VBN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('line-up', 'NN'),
 ('(', '('),
 ('y', 'NN'),
 (')', ')'),
 ('See', 'VB'),
 ('more', 'JJR'),
 ('at', 'IN'),
 (':', ':'),
 ('...', ':'),
 ('http://t.co/QMNz62OEuc', 'NN')]

## WordNet

In [7]:
#nltk.download('wordnet')

## Synonyms

In [8]:
from nltk.corpus import wordnet as wn

word_synset = wn.synsets("car")
word_synset_b = wn.synsets("borsch")
print(f"synsets:, {word_synset}------------{word_synset_b}")
print(f"lemma names:, {word_synset[0].lemma_names()}------------{word_synset_b[0].lemma_names()}")

synsets:, [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'), Synset('car.n.04'), Synset('cable_car.n.01')]------------[Synset('borsch.n.01')]
lemma names:, ['car', 'auto', 'automobile', 'machine', 'motorcar']------------['borsch', 'borsh', 'borscht', 'borsht', 'borshch', 'bortsch']


In [9]:
word_synset[0].definition(), word_synset_b[0].definition()

('a motor vehicle with four wheels; usually propelled by an internal combustion engine',
 'a Russian or Polish soup usually containing beet juice as a foundation')

In [10]:
word_synset[0].examples(), word_synset_b[0].examples()

(['he needs a car to get to work'], [])

In [11]:
word_synset[1].definition()

'a wheeled vehicle adapted to the rails of railroad'

In [12]:
word_synset[1].examples()

['three cars had jumped the rails']

![Alt text](hypernyms-hyponyms-explained-image-a.png)

## Hyponyms

In [13]:
word_synset[0].hyponyms()

[Synset('beach_wagon.n.01'),
 Synset('coupe.n.01'),
 Synset('pace_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('jeep.n.01'),
 Synset('electric.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('compact.n.03'),
 Synset('hot_rod.n.01'),
 Synset('cruiser.n.01'),
 Synset('hatchback.n.01'),
 Synset('sedan.n.01'),
 Synset('stock_car.n.01'),
 Synset('sports_car.n.01'),
 Synset('cab.n.03'),
 Synset('racer.n.02'),
 Synset('hardtop.n.01'),
 Synset('model_t.n.01'),
 Synset('minivan.n.01'),
 Synset('limousine.n.01'),
 Synset('used-car.n.01'),
 Synset('bus.n.04'),
 Synset('sport_utility.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('ambulance.n.01'),
 Synset('roadster.n.01'),
 Synset('convertible.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01')]

## Hypernyms

In [14]:
word_synset[0].hypernyms()

[Synset('motor_vehicle.n.01')]

In [15]:
tree = wn.synsets("tree")[0]
paths = tree.hypernym_paths()
for p in paths:
  print([synset.name() for synset in p])

['entity.n.01', 'physical_entity.n.01', 'object.n.01', 'whole.n.02', 'living_thing.n.01', 'organism.n.01', 'plant.n.02', 'vascular_plant.n.01', 'woody_plant.n.01', 'tree.n.01']


## Meronyms

In [16]:
tree.part_meronyms()

[Synset('burl.n.02'),
 Synset('trunk.n.01'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('crown.n.07')]

In [17]:
tree.substance_meronyms()

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

## Holonyms

In [18]:
tree.member_holonyms()

[Synset('forest.n.01')]

![Alt text](2.avif)

## Lemmatization function

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer
tokens = tweet_tokens[50]

In [20]:
# Create a lemmatizer
lemmatizer = WordNetLemmatizer()

In [21]:
#nltk.download('wordnet')
#nltk.download('omw-1.4')

In [22]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(tokens):
  """Return the lemma of every token, preserving the input order.

  Each token is handed to the notebook-level ``lemmatizer`` without a
  part-of-speech argument.

  Args:
      tokens: iterable of token strings.

  Returns:
      A new list with one lemmatized string per input token.
  """
  # CODE_START
  lemmatized_sentence = [lemmatizer.lemmatize(word) for word in tokens]
  # CODE_END

  return lemmatized_sentence
tokens = ["The", "children", "were", "driving", "cars"]
lemmatize_sentence(tokens)

['The', 'child', 'were', 'driving', 'car']

## Processing

In [23]:
#nltk.download('stopwords')

In [24]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(len(stop_words))
for i in range(10):
    print(stop_words[i])

198
a
about
above
after
again
against
ain
all
am
an


In [25]:
import re, string
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
def process_tokens(tweet_tokens):
    """Clean and lemmatize the tokens of a single tweet.

    Every token is lower-cased. Tokens that start with ``http://`` or
    ``https://``, tokens that start with ``@``, English stop words and tokens
    that are a substring of ``string.punctuation`` are dropped. Each remaining
    token is lemmatized using the WordNet part of speech derived from the tag
    that ``pos_tag`` assigned to it.

    Args:
        tweet_tokens: list of token strings belonging to one tweet.

    Returns:
        list[str]: the cleaned, lower-cased, lemmatized tokens in their
        original order.
    """
    cleaned_tokens = []
    # Build the stop-word lookup once as a set: membership tests become O(1)
    # instead of a linear scan of the list for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(tag):
        """Map a ``pos_tag`` tag to a WordNet part-of-speech constant."""
        if tag.startswith('NN'):
            return wordnet.NOUN
        if tag.startswith('VB'):
            return wordnet.VERB
        if tag.startswith('JJ'):
            return wordnet.ADJ
        if tag.startswith('RB'):
            return wordnet.ADV
        return wordnet.NOUN  # Default to noun

    # Tagging uses the original-case tokens; filtering uses the lower-cased form.
    for token, tag in pos_tag(tweet_tokens):
        token_lower = token.lower()
        # Delete URLs and mentions
        if re.match(r'^https?://', token_lower) or token_lower.startswith('@'):
            continue
        # Delete stop words and punctuation
        if token_lower in stop_words or token_lower in string.punctuation:
            continue
        # Lemmatize the token
        pos = get_wordnet_pos(tag)
        cleaned_tokens.append(lemmatizer.lemmatize(token_lower, pos))

    return cleaned_tokens

In [27]:
print("Before:", tweet_tokens[50])
print("After:", process_tokens(tweet_tokens[50]))

Before: ['@groovinshawn', 'they', 'are', 'rechargeable', 'and', 'it', 'normally', 'comes', 'with', 'a', 'charger', 'when', 'u', 'buy', 'it', ':)']
After: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


In [28]:
# CODE_START

# Positive tweets: tokenize, then clean each tweet's token list with process_tokens.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
positive_cleaned_tokens_list = list(map(process_tokens, positive_tweet_tokens))

# Negative tweets: same two steps.
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
negative_cleaned_tokens_list = list(map(process_tokens, negative_tweet_tokens))

# CODE_END

In [29]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


In [30]:
def get_all_words(cleaned_tokens_list):
  """Lazily yield every token from a sequence of token lists, in order.

  Args:
      cleaned_tokens_list: iterable whose items are iterables of tokens.

  Yields:
      Each token of each inner iterable, one at a time.
  """
  # CODE_START
  for tweet_words in cleaned_tokens_list:
    yield from tweet_words
  # CODE_END
all_pos_words = get_all_words(positive_cleaned_tokens_list)

In [31]:
from nltk import FreqDist

# CODE_START
# get_all_words returns a generator, which can be consumed only once,
# so a fresh one is created here before counting.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
# Count how often each cleaned token occurs across the positive tweets.
freq_dist_pos = FreqDist(all_pos_words)

# Print the 10 most common words
print(freq_dist_pos.most_common(10))
# CODE_END

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 383), ('follow', 362), ('u', 360), ('love', 336), ('...', 290), ('get', 269), ('good', 261)]


# Task 1. 
Change the code so it removes hashtags during pre-processing. (E.g. #Ukraine).

In [44]:
def process_tokens_no_tags(tweet_tokens):
    """Clean and lemmatize one tweet's tokens, also discarding hashtags.

    Tokens are lower-cased. URLs (``http://`` / ``https://`` prefix),
    ``@``-mentions, ``#``-hashtags, English stop words and tokens that are a
    substring of ``string.punctuation`` are removed. What remains is
    lemmatized with the WordNet part of speech derived from its ``pos_tag`` tag.

    Args:
        tweet_tokens: list of token strings belonging to one tweet.

    Returns:
        list[str]: cleaned, lemmatized tokens in their original order.
    """
    english_stop_words = stopwords.words('english')
    wordnet_lemmatizer = WordNetLemmatizer()

    # Two-letter tag prefix -> name of the WordNet POS constant.
    # The constant itself is looked up lazily, only when a token is kept.
    pos_attr_by_prefix = {'NN': 'NOUN', 'VB': 'VERB', 'JJ': 'ADJ', 'RB': 'ADV'}

    def is_noise(word):
        """True for URLs, mentions, hashtags, stop words and punctuation."""
        return (
            re.match(r'^https?://', word) is not None
            or word.startswith(('@', '#'))
            or word in english_stop_words
            or word in string.punctuation
        )

    kept = []
    for raw_token, tag in pos_tag(tweet_tokens):
        word = raw_token.lower()
        if is_noise(word):
            continue
        # Unknown tag prefixes fall back to noun.
        pos = getattr(wordnet, pos_attr_by_prefix.get(tag[:2], 'NOUN'))
        kept.append(wordnet_lemmatizer.lemmatize(word, pos))

    return kept

print("Process Tokens No Tags:", process_tokens_no_tags(tweet_tokens[50]))

Process Tokens No Tags: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


# Task 2. 
Modify process_tokens() so that instead of using lemmatizer.lemmatize(), it will use WordNet synsets.

In [45]:
def process_tokens_WordNet(tweet_tokens):
    """Clean one tweet's tokens and normalize them through WordNet synsets.

    Tokens are lower-cased. URLs, ``@``-mentions, ``#``-hashtags, English stop
    words and tokens that are a substring of ``string.punctuation`` are
    removed. Each remaining token is replaced by the name of the first lemma
    of its first synset for the part of speech derived from its ``pos_tag``
    tag; a token with no synset is kept unchanged.

    NOTE(review): the first lemma of the first synset may be a synonym rather
    than the token's own base form - confirm this is the intended behaviour.

    Args:
        tweet_tokens: list of token strings belonging to one tweet.

    Returns:
        list[str]: normalized tokens in their original order.
    """
    english_stop_words = stopwords.words('english')

    # Two-letter tag prefix -> name of the WordNet POS constant (noun otherwise).
    pos_attr_by_prefix = {'NN': 'NOUN', 'VB': 'VERB', 'JJ': 'ADJ', 'RB': 'ADV'}

    normalized = []
    for raw_token, tag in pos_tag(tweet_tokens):
        word = raw_token.lower()

        # Remove URLs, mentions, hashtags
        looks_like_url = re.match(r'^https?://', word) is not None
        if looks_like_url or word.startswith(('@', '#')):
            continue

        # Remove stopwords and punctuation
        if word in english_stop_words or word in string.punctuation:
            continue

        # Look the word up in WordNet for the part of speech implied by its tag
        wordnet_pos = getattr(wordnet, pos_attr_by_prefix.get(tag[:2], 'NOUN'))
        matches = wordnet.synsets(word, pos=wordnet_pos)

        # First lemma name of the first synset, or the token itself as fallback
        normalized.append(matches[0].lemmas()[0].name() if matches else word)

    return normalized

print("Process Tokens Wordnet:", process_tokens_WordNet(tweet_tokens[50]))

Process Tokens Wordnet: ['rechargeable', 'normally', 'come', 'charger', 'u', 'buy', ':)']


# Task 3. 
Let’s suppose that the semantic distance between two words is the distance to their common semantic parent (hypernym). Write a function that computes this distance between two words.

In [None]:
from nltk.corpus import wordnet as wn

def semantic_distance(word1, word2):
    """Distance between two words measured through their common hypernym.

    Only the first WordNet synset of each word is considered. The result is
    the ``shortest_path_distance`` from the first word's synset to the first
    lowest common hypernym, plus the same distance for the second word.

    Args:
        word1: first word to look up in WordNet.
        word2: second word to look up in WordNet.

    Returns:
        The sum of the two path distances, or ``float('inf')`` when either
        word has no synsets, the synsets share no hypernym, or a path
        distance is unavailable.
    """
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)

    if not synsets1 or not synsets2:
        # Word not found in WordNet. float() only parses numeric strings, so a
        # descriptive message here would raise ValueError; use the same
        # "infinitely far" sentinel as the other failure paths.
        return float('inf')

    syn1 = synsets1[0]
    syn2 = synsets2[0]

    # Find the lowest common hypernym
    common_hypernyms = syn1.lowest_common_hypernyms(syn2)

    if not common_hypernyms:
        # No shared ancestor (e.g. synsets of different parts of speech).
        return float('inf')

    common_hyper = common_hypernyms[0]

    # Calculate the distance from each synset to the common hypernym
    dist1 = syn1.shortest_path_distance(common_hyper)
    dist2 = syn2.shortest_path_distance(common_hyper)

    if dist1 is None or dist2 is None:
        return float('inf')

    return dist1 + dist2
print(f"Semantic Distance: {semantic_distance('cat','car')}")

Semantic Distance: 17
