In [4]:


import nltk
from nltk.corpus import gutenberg, brown, inaugural, reuters, webtext, wordnet

# Fetch every NLTK data package this notebook reads from, in a fixed order.
for package in (
    'gutenberg',
    'brown',
    'inaugural',
    'reuters',
    'webtext',
    'wordnet',
    'omw-1.4',  # additional WordNet data
):
    nltk.download(package)

# Function to get the first 20 words of a corpus
def get_first_20_words(corpus):
    """Return the leading 20 items of ``corpus.words()``.

    Args:
        corpus: Any object with a zero-argument ``words()`` method whose
            result supports slicing.

    Returns:
        The ``[:20]`` slice of the word sequence.
    """
    return corpus.words()[:20]

# Function to get the first 20 words of a specific category in a corpus
def get_first_20_words_of_category(corpus, category):
    """Return the first 20 words of one category of a categorized corpus.

    Any corpus that exposes a callable ``categories`` attribute is accepted,
    not only a fixed pair of corpora; the lookup itself is always
    ``corpus.words(categories=category)``.

    Args:
        corpus: Object with a ``words(categories=...)`` method and a callable
            ``categories`` attribute.
        category: Value forwarded unchanged as the ``categories`` keyword.

    Returns:
        The ``[:20]`` slice of the category's word sequence.

    Raises:
        ValueError: If ``corpus`` has no callable ``categories`` attribute,
            i.e. it does not look like a categorized corpus.
    """
    # Duck-type on the categorized-corpus interface instead of comparing
    # against specific corpus objects.
    if not callable(getattr(corpus, "categories", None)):
        raise ValueError("Unsupported corpus or category.")
    return corpus.words(categories=category)[:20]

# Preview the opening tokens of each whole corpus. Every print emits the
# header and the word list on separate lines.

# Gutenberg corpus
gutenberg_words = get_first_20_words(gutenberg)
print("Gutenberg corpus first 20 words:", gutenberg_words, sep="\n")

# Brown corpus
brown_words = get_first_20_words(brown)
print("\nBrown corpus first 20 words:", brown_words, sep="\n")

# Inaugural corpus
inaugural_words = get_first_20_words(inaugural)
print("\nInaugural corpus first 20 words:", inaugural_words, sep="\n")

# Reuters corpus
reuters_words = get_first_20_words(reuters)
print("\nReuters corpus first 20 words:", reuters_words, sep="\n")

# WebText corpus
webtext_words = get_first_20_words(webtext)
print("\nWebText corpus first 20 words:", webtext_words, sep="\n")

# Category-restricted previews.
# Example categories: 'news' for the Brown corpus, 'grain' for the Reuters corpus.
brown_news_words = get_first_20_words_of_category(brown, 'news')
print("\nBrown corpus 'news' category first 20 words:", brown_news_words, sep="\n")

reuters_grain_words = get_first_20_words_of_category(reuters, 'grain')
print("\nReuters corpus 'grain' category first 20 words:", reuters_grain_words, sep="\n")

# Accessing WordNet
def get_first_20_synsets():
    """Return a list of the first 20 synsets yielded by ``wordnet.all_synsets()``.

    Only the first 20 items are pulled from the iterator, so the full set of
    synsets is never built in memory just to be sliced.

    Returns:
        A list of at most 20 synsets, in the order ``all_synsets()`` yields them.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import islice

    return list(islice(wordnet.all_synsets(), 20))

# Show each of the first 20 synsets as "<name> <definition>", one per line.
wordnet_synsets = get_first_20_synsets()
print("\nWordNet first 20 synsets:")
for synset in wordnet_synsets:
    print(f"{synset.name()} {synset.definition()}")




[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Gutenberg corpus first 20 words:
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich']

Brown corpus first 20 words:
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that']

Inaugural corpus first 20 words:
['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':', 'Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no']

Reuters corpus first 20 words:
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']

WebText corpus first 20 words:
['Cookie', 'Manager', ':', '"', 'Don', "'", 't', 'allow', 'sites', 'that', 'set', 'removed', 'cookies', 'to', 'set', 'future', 'cookies', '"', 'should', 'stay']

Br