# Week 2 Exercise: Stemming, lemmatization, and word cloud -- Answers

This notebook contains questions and code templates to guide you through preprocessing text data.


In [None]:
import os
from glob import glob
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt


In [None]:
# Download the NLTK resources used in this notebook.
# 'punkt' / 'punkt_tab' : tokenizer models for word_tokenize. Recent NLTK releases
#                         look up 'punkt_tab'; 'punkt' is kept for older releases.
# 'stopwords'           : the English stopword list.
# 'wordnet'             : the dictionary behind WordNetLemmatizer.
# nltk.download() skips resources that are already up to date, so re-running is cheap.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load spaCy's small English pipeline (used for lemmatization further down).
# If it is missing, install it once with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")  # Load spaCy model

### **1. Load the dataset**

In [None]:
# Define dataset path and the source you want to read from
dataset_path = 'articles/'
source_name = 'Vox'  # Change to desired source if needed, like BBC or The Guardian

# Match every entry inside a <source_name> folder, one level below dataset_path
# (i.e. articles/<date folder>/<source_name>/<file>).
# glob() returns matches in arbitrary, OS-dependent order, so sort them: any later
# subsampling of `documents` then selects the same articles on every run.
newspaperfiles = sorted(glob(os.path.join(dataset_path, f'*/{source_name}/*')))

# Initialize a list to hold documents (one string per article)
documents = []

# Read each file as UTF-8; report files that cannot be read and carry on with the rest
for filename in newspaperfiles:
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            documents.append(f.read())
    except (OSError, UnicodeDecodeError) as e:
        # OSError: unreadable path (e.g. a directory or missing permissions);
        # UnicodeDecodeError: the file content is not valid UTF-8.
        print(f"Error reading {filename}: {e}")

print(f"Loaded {len(documents)} articles from {source_name}.")

In [None]:
# Keep only the first 100 articles: the steps below slow down on larger samples,
# so practise on a small subset first.
sample_size = 100
documents = documents[:sample_size]

Before we can continue to stemming and lemmatization, we need to tokenize the text and remove stopwords.

In [None]:
# Step 2: Tokenization
print("\n### Tokenization ###")
# Lowercase each article before tokenizing so that later matching is case-insensitive.
# Result: one list of token strings per document.
tokens = [word_tokenize(doc.lower()) for doc in documents]
# Guard the preview: with no loaded articles, tokens[0] would raise IndexError.
print("First 20 tokens from the first document:", tokens[0][:20] if tokens else [])


### Step 3: Stopword Removal ###
print("\n### Stopword Removal ###")
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests
# Keep a token only if it is purely alphanumeric (this drops punctuation and
# pieces such as "n't") and is not an English stopword.
filtered_tokens = [
    [word for word in doc if word.isalnum() and word not in stop_words]
    for doc in tokens
]
print(
    "Tokens after removing stopwords (first document):",
    filtered_tokens[0][:20] if filtered_tokens else [],
)


### **2. Apply stemming and lemmatization**

In [None]:
### Step 4: Stemming and Lemmatization ###
print("\n### Stemming and Lemmatization ###")
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming: apply the Porter stemmer to every filtered token, document by document.
stemmed_words = [[stemmer.stem(word) for word in doc] for doc in filtered_tokens]
# NLTK lemmatization. NOTE: pos='v' makes WordNet treat every token as a verb,
# so nouns and adjectives are only reduced when they happen to have a verb reading.
lemmatized_words_nltk = [[lemmatizer.lemmatize(word, pos='v') for word in doc] for doc in filtered_tokens]

# Using spaCy for lemmatization
# spaCy works on text, so each document's tokens are joined back into one string.
# nlp.pipe() processes the texts in batches, which is faster than calling nlp()
# once per document; the parser and NER are disabled because only lemmas are read.
spacy_texts = (" ".join(doc) for doc in filtered_tokens)
lemmatized_words_spacy = [
    [token.lemma_ for token in spacy_doc]
    for spacy_doc in nlp.pipe(spacy_texts, disable=["parser", "ner"])
]

# Guard the previews: with no loaded articles, indexing [0] would raise IndexError.
print("First 20 stemmed words (PorterStemmer, first document):", stemmed_words[0][:20] if stemmed_words else [])
print("First 20 lemmatized words (NLTK, first document):", lemmatized_words_nltk[0][:20] if lemmatized_words_nltk else [])
print("First 20 lemmatized words (spaCy, first document):", lemmatized_words_spacy[0][:20] if lemmatized_words_spacy else [])

### **3. Generate word cloud**

In [None]:
print("\n### Generating Word Cloud ###")
# Flatten the spaCy lemmas of all documents into one space-separated string.
# strip(): documents without any tokens contribute empty strings, so the joined
# text can consist of spaces only. That is truthy, but WordCloud.generate()
# raises ValueError when the text contains no words.
all_lemmatized_text = " ".join(" ".join(doc) for doc in lemmatized_words_spacy).strip()

if all_lemmatized_text:
    # Word size in the cloud reflects how often the word occurs in the text.
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_lemmatized_text)

    plt.figure(figsize=(10,5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")  # the axes carry no information for an image
    plt.show()
else:
    print("No words available to generate a word cloud.")