In [5]:
import nltk
from nltk.corpus import gutenberg, brown, inaugural, reuters, webtext, wordnet

# Fetch every NLTK resource used below: the corpora, WordNet, and the
# tokenizer / tagger data. The order matches the original call sequence.
for _resource in (
    'gutenberg',
    'brown',
    'inaugural',
    'reuters',
    'webtext',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'omw-1.4',  # Download additional WordNet data
):
    nltk.download(_resource)

# Function to get the first 20 words of a corpus
def get_first_20_words(corpus):
    """Return the first 20 items of ``corpus.words()``.

    ``corpus`` is any object with a zero-argument ``words()`` method whose
    result supports slicing. If fewer than 20 words are available, the
    slice simply contains all of them.
    """
    return corpus.words()[:20]

# Function to get the first 20 words of a specific category in a corpus
def get_first_20_words_of_category(corpus, category):
    """Return the first 20 words of ``category`` within ``corpus``.

    Any corpus that exposes a callable ``categories`` attribute is accepted;
    the words are obtained with ``corpus.words(categories=category)``.
    Previously only the Brown and Reuters corpora were allowed, through two
    branches that made the same call.

    Args:
        corpus: A corpus object with ``categories()`` and
            ``words(categories=...)``.
        category: Category selector passed through unchanged to the corpus.

    Returns:
        The first 20 items of the category's word sequence (fewer if the
        category is shorter).

    Raises:
        ValueError: If ``corpus`` has no callable ``categories`` attribute.
            Errors raised by the corpus itself propagate unchanged.
    """
    # A corpus without category support cannot honour the ``categories=``
    # keyword, so reject it up front with the original error message.
    if not callable(getattr(corpus, "categories", None)):
        raise ValueError("Unsupported corpus or category.")
    return corpus.words(categories=category)[:20]

# Function to perform lexical analysis
def lexical_analysis(words):
    """Tokenize and part-of-speech tag a sequence of word strings.

    The words are joined into one space-separated string, which is split
    again by ``nltk.word_tokenize`` and then passed to ``nltk.pos_tag``.
    Because of that round trip, the tagged tokens come from the tokenizer
    and are not guaranteed to match the items of ``words`` one-to-one.

    Returns:
        The result of ``nltk.pos_tag`` for the re-tokenized text.
    """
    text = " ".join(words)
    return nltk.pos_tag(nltk.word_tokenize(text))

def _print_corpus_sample(label, words, leading_blank=True):
    """Print a word sample and its POS tags under headings built from ``label``.

    Args:
        label: Text placed before " first 20 words:" in both headings,
            e.g. "Brown corpus" or "Brown corpus 'news' category".
        words: The sampled words; printed as-is, then passed to
            ``lexical_analysis``.
        leading_blank: When true, the first heading starts with a newline
            so that sections are separated by a blank line.
    """
    prefix = "\n" if leading_blank else ""
    print(f"{prefix}{label} first 20 words:")
    print(words)
    print(f"Lexical Analysis of {label} first 20 words:")
    print(lexical_analysis(words))


# Gutenberg corpus (first section, so no blank separator line before it)
gutenberg_words = get_first_20_words(gutenberg)
_print_corpus_sample("Gutenberg corpus", gutenberg_words, leading_blank=False)

# Brown corpus
brown_words = get_first_20_words(brown)
_print_corpus_sample("Brown corpus", brown_words)

# Inaugural corpus
inaugural_words = get_first_20_words(inaugural)
_print_corpus_sample("Inaugural corpus", inaugural_words)

# Reuters corpus
reuters_words = get_first_20_words(reuters)
_print_corpus_sample("Reuters corpus", reuters_words)

# WebText corpus
webtext_words = get_first_20_words(webtext)
_print_corpus_sample("WebText corpus", webtext_words)

# Accessing specific categories
# Example categories: 'news' for Brown corpus, 'grain' for Reuters corpus
brown_news_words = get_first_20_words_of_category(brown, 'news')
_print_corpus_sample("Brown corpus 'news' category", brown_news_words)

reuters_grain_words = get_first_20_words_of_category(reuters, 'grain')
_print_corpus_sample("Reuters corpus 'grain' category", reuters_grain_words)

# Accessing WordNet
def get_first_20_synsets():
    """Return the first 20 synsets produced by ``wordnet.all_synsets()``.

    Only the first 20 items are pulled from the iterable, instead of
    building a list of every synset and then slicing it.

    Returns:
        A list of at most 20 synset objects, in iteration order.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    return list(islice(wordnet.all_synsets(), 20))

wordnet_synsets = get_first_20_synsets()
print("\nWordNet first 20 synsets:")
for synset in wordnet_synsets:
    print(synset.name(), synset.definition())

# Example of lexical analysis with WordNet definitions
# Take the first whitespace-separated word of each definition. Each
# definition is fetched and split once; synsets whose definition is missing,
# empty, or whitespace-only are skipped rather than raising IndexError.
wordnet_words = [
    tokens[0]
    for tokens in (
        (synset.definition() or "").split() for synset in wordnet_synsets
    )
    if tokens
]
print("\nLexical Analysis of WordNet definitions first words:")
print(lexical_analysis(wordnet_words))

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4

Gutenberg corpus first 20 words:
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']
Lexical Analysis of Gutenberg corpus first 20 words:
[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), (']', 'NNP'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), (',', ','), ('handsome', 'NN'), (',', ','), ('clever', 'NN'), (',', ','), ('and', 'CC'), ('rich', 'JJ')]

Brown corpus first 20 words:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']
Lexical Analysis of Brown corpus first 20 words:
[('The', 'DT'), ('Fulton', 'NNP'), ('County', 'NNP'), ('Grand', 'NNP'), ('Jury', 'NNP'), ('said', 'VBD'), ('Friday', 'NNP'), ('an', 'DT'), ('investigation', 'NN'), ('of'