<a href="https://colab.research.google.com/github/tanyaclement/DH/blob/master/introDH/PresidentialSpeeches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notebook: created by Kameron Dunn, August 2024. Dataset: Miller Center of Public Affairs, University of Virginia. "Presidential Speeches: Downloadable Data." Accessed March 17, 2022. data.millercenter.org.

In [None]:
# Install necessary packages
!pip install nltk matplotlib wordcloud

# Import required libraries
import json
import re
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below.
# - 'punkt' / 'punkt_tab': tokenizer models behind word_tokenize. Newer NLTK
#   releases load 'punkt_tab' instead of 'punkt', and the unpinned pip install
#   above may pull either generation, so both are requested.
# - 'stopwords': the stop-word lists read by stopwords.words('english').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Load the dataset.
# The path points at Colab's sample_data folder, so the JSON file must have
# been uploaded there first.
# The encoding is given explicitly so decoding does not depend on the
# platform's default text encoding (assumes the file is UTF-8, the standard
# encoding for JSON -- TODO confirm against the downloaded data).
with open("/content/sample_data/speeches.json", "r", encoding="utf-8") as file:
    speeches = json.load(file)


In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize raw text so it can be split into countable words.

    The text is lowercased, every punctuation character is replaced by a
    space, and every run of digits is removed.

    Punctuation is replaced by a space rather than deleted so that words
    joined only by punctuation stay separate: "peace--and" becomes
    "peace  and" instead of the fused non-word "peaceand".

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so the result may
        contain runs of spaces; underscores are kept because the regex
        class ``\\w`` includes them.
    """
    text = text.lower()  # Lowercase the text
    text = re.sub(r'[^\w\s]', ' ', text)  # Punctuation -> space (keeps word boundaries)
    text = re.sub(r'\d+', '', text)  # Remove digits
    return text

In [None]:
# Tokenization and stopwords removal
def tokenize_and_filter(text):
    """Split ``text`` into tokens and drop English stop words.

    Args:
        text (str): Text to tokenize. Matching against the stop-word list is
            exact, so the text is expected to be lowercased already.

    Returns:
        list: The tokens produced by ``word_tokenize``, in their original
        order, minus any token found in NLTK's English stop-word list.
    """
    candidate_tokens = word_tokenize(text)
    # A set gives constant-time membership checks for the filter below.
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for token in candidate_tokens:
        if token not in english_stop_words:
            kept_tokens.append(token)
    return kept_tokens

In [None]:
# Collect every available transcript, then merge them into one corpus string
# separated by single spaces. Records without a 'transcript' key are skipped.
transcripts = [speech['transcript'] for speech in speeches if 'transcript' in speech]
all_text = " ".join(transcripts)


In [None]:
# Preprocess the combined text: preprocess_text() lowercases the whole corpus
# and removes punctuation and digits.
cleaned_text = preprocess_text(all_text)

# Tokenize the cleaned corpus and drop English stop words. `tokens` is the
# corpus-wide token list that the frequency count below is built from.
tokens = tokenize_and_filter(cleaned_text)

In [None]:
# Count overall word frequencies
word_freq = Counter(tokens)

# Show only the most frequent words: printing the whole Counter would write
# one entry per distinct word in the corpus into the cell output.
print(word_freq.most_common(50))




In [None]:
# Build the word cloud for the whole corpus directly from the word counts
cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

In [None]:
# Render the corpus-wide word cloud on its own figure, without axis ticks
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title("Overall Word Cloud for All Speeches")
ax.axis('off')
fig.tight_layout()
plt.show()

In [None]:
# Group transcripts into 25-year buckets keyed by a "start-end" label,
# e.g. a speech from 1863 lands in "1850-1874".
speeches_by_period = defaultdict(list)
for record in speeches:
    # Both fields are needed: the date to pick a bucket, the transcript to store.
    if 'transcript' not in record or 'date' not in record:
        continue
    # Assumes the date string starts with a four-digit year -- TODO confirm.
    year = int(record['date'][:4])
    bucket_start = year - year % 25  # Round down to a multiple of 25
    bucket_label = f"{bucket_start}-{bucket_start + 24}"
    speeches_by_period[bucket_label].append(record['transcript'])


In [None]:
# Process each 25-year period's speeches and visualize the top 20 words chronologically.
# Period-specific names are used inside the loop so the corpus-wide all_text,
# cleaned_text, tokens and word_freq built in earlier cells are not overwritten;
# otherwise re-running the word-cloud cell after this one would plot only the
# last period.
for period in sorted(speeches_by_period.keys()):
    texts = speeches_by_period[period]
    period_text = " ".join(texts)
    period_tokens = tokenize_and_filter(preprocess_text(period_text))
    most_common_words = Counter(period_tokens).most_common(20)

    # zip(*[]) yields nothing to unpack into (labels, values), so a period
    # whose speeches leave no tokens after filtering is reported and skipped.
    if not most_common_words:
        print(f"No words left to plot for {period}")
        continue

    # Visualize the top 20 words for each 25-year period
    labels, values = zip(*most_common_words)
    plt.figure(figsize=(10, 6))
    plt.bar(labels, values)
    plt.title(f"Top 20 Words in {period}")
    plt.xlabel('Words')
    plt.ylabel('Frequencies')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Define gendered words
# Masculine terms come first, then feminine ones; every word maps to 0.
gendered_words = dict.fromkeys(
    (
        'he', 'him', 'his', 'man', 'men',
        'she', 'her', 'hers', 'woman', 'women',
    ),
    0,
)

# Count gendered words in each 25-year period.
# The words come from the preprocessed text split on whitespace, NOT from
# tokenize_and_filter(): the pronouns in gendered_words ('he', 'him', 'his',
# 'she', 'her', 'hers') are NLTK English stop words, so the stop-word filter
# would discard them before they could be counted and their totals would
# always be zero.
# Period-specific names also keep the corpus-wide all_text, cleaned_text and
# tokens from earlier cells intact.
gendered_words_by_period = defaultdict(lambda: defaultdict(int))
for period, texts in speeches_by_period.items():
    # preprocess_text() has already lowercased the text and removed
    # punctuation, so whitespace splitting yields bare words.
    period_words = preprocess_text(" ".join(texts)).split()

    # Count the occurrences of gendered words
    for word in period_words:
        if word in gendered_words:
            gendered_words_by_period[period][word] += 1

# Visualize gendered words by period.
# Every chart shows the full gendered_words vocabulary in the same order
# (masculine terms, then feminine terms), with a zero-height bar for a word
# that does not occur in the period. This keeps the x-axis identical from one
# chart to the next so periods can be compared directly.
masculine_words = {'he', 'him', 'his', 'man', 'men'}
for period in sorted(gendered_words_by_period.keys()):
    period_counts = gendered_words_by_period[period]
    labels = list(gendered_words)
    # .get() reads a count without inserting missing words into the defaultdict
    values = [period_counts.get(word, 0) for word in labels]
    bar_colors = ['blue' if word in masculine_words else 'pink' for word in labels]

    plt.figure(figsize=(10, 6))
    plt.bar(labels, values, color=bar_colors)
    plt.title(f"Gendered Words in {period}")
    plt.xlabel('Words')
    plt.ylabel('Counts')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
