# **Question 1: Use the Inaugural Address Corpus to find the total number of words and the total number of unique words in the inaugural addresses delivered in the 21st century.**

In [None]:
import nltk
from nltk.corpus import inaugural

# Fetch the 'inaugural' corpus package into the local NLTK data directory so
# that the `inaugural` corpus reader imported above has files to read.
nltk.download('inaugural')

def get_21st_century_addresses():
    """Return the inaugural corpus file ids dated 2001 or later.

    The text before the first '-' in each file id is parsed as an integer
    year; ids whose year is at least 2001 are kept, in corpus order.
    """
    selected = []
    for fileid in inaugural.fileids():
        # Everything before the first hyphen is treated as the year.
        year_prefix, _, _ = fileid.partition('-')
        if int(year_prefix) >= 2001:
            selected.append(fileid)
    return selected

# File ids of the addresses delivered from 2001 onwards
century_21_files = get_21st_century_addresses()

# Flatten the tokens of every selected address into one list, in file order
all_words = [
    token
    for fileid in century_21_files
    for token in inaugural.words(fileid)
]

# NOTE(review): tokens are counted exactly as the corpus reader yields them,
# with no filtering or case folding, so punctuation tokens and case variants
# are presumably included -- confirm this matches the intended notion of "word".
total_words = len(all_words)

# Distinct tokens (case-sensitive) and how many of them there are
unique_words = set(all_words)
total_unique_words = len(unique_words)

# Report both counts
print(f"Total number of words in 21st-century inaugural addresses: {total_words}")
print(f"Total number of unique words in 21st-century inaugural addresses: {total_unique_words}")


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.


Total number of words in 21st-century inaugural addresses: 14093
Total number of unique words in 21st-century inaugural addresses: 2494


# **Question 2: Write a Python program to find the frequency distribution of the words "democracy", "freedom", "liberty", and "equality" in all inaugural addresses using NLTK.**

In [None]:
import nltk
from nltk.corpus import inaugural
from nltk.probability import FreqDist

# Make sure the inaugural corpus is available locally
nltk.download('inaugural')

# Words whose occurrence counts we want to report
target_words = ["democracy", "freedom", "liberty", "equality"]

# Gather the tokens of every address, lowercased so that matching is
# case-insensitive
all_words = [
    token.lower()
    for fileid in inaugural.fileids()
    for token in inaugural.words(fileid)
]

# Count every lowercased token once, then look up only the targets
fdist = FreqDist(all_words)

target_word_frequencies = {}
for term in target_words:
    target_word_frequencies[term] = fdist[term.lower()]

# Report the count for each target word
for word, frequency in target_word_frequencies.items():
    print(f"Frequency of '{word}': {frequency}")


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!


Frequency of 'democracy': 71
Frequency of 'freedom': 189
Frequency of 'liberty': 123
Frequency of 'equality': 26


# **Question 3: Write a Python program to display the 5 most common words in the text of "Sense and Sensibility" by Jane Austen using the Gutenberg Corpus.**

In [None]:
import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist

# Ensure the Gutenberg corpus is available locally before reading from it
nltk.download('gutenberg')

# Load the tokens of "Sense and Sensibility" by Jane Austen from the Gutenberg corpus
text = gutenberg.words('austen-sense.txt')

# The corpus reader yields punctuation marks such as ',' and '.' as separate
# tokens. Keep only purely alphabetic tokens so punctuation cannot rank among
# the most common *words*, and lowercase them for case-insensitive counting.
text = [word.lower() for word in text if word.isalpha()]

# Create a frequency distribution of the remaining words
fdist = FreqDist(text)

# Take the 5 most frequent words as (word, count) pairs, most frequent first
most_common_words = fdist.most_common(5)

# Print the result
print("The 5 most common words in 'Sense and Sensibility' are:")
for word, frequency in most_common_words:
    print(f"'{word}': {frequency}")


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


The 5 most common words in 'Sense and Sensibility' are:
',': 9397
'to': 4116
'the': 4105
'.': 3975
'of': 3572
