# In [None]:
# --- Q1: Text Preprocessing & Representation ---

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the NLTK resources used below (needed only the first time).
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) make word_tokenize() load its models
# from 'punkt_tab' instead of 'punkt'; without this resource the tokenizer
# raises LookupError even though 'punkt' is installed.
nltk.download('punkt_tab')
nltk.download('stopwords')

# sample text (can be changed in exam)
texts = [
    "Natural Language Processing is fun!",
    "Language is a key to communication.",
]

# 1. Preprocess text
# Lowercase each document, tokenize it, and keep only purely alphabetic
# tokens that are not English stop words; the survivors are re-joined with
# single spaces so the vectorizers can consume plain strings.
stop_words = set(stopwords.words('english'))
clean_texts = [
    " ".join(
        token
        for token in word_tokenize(document.lower())
        if token.isalpha() and token not in stop_words
    )
    for document in texts
]
print("Preprocessed:", clean_texts)

# 2. Bag of Words
# One row per document, one column per vocabulary term (raw counts).
bow = CountVectorizer()
bow_matrix = bow.fit_transform(clean_texts)
print("\nBOW:\n", bow_matrix.toarray())

# 3. TF-IDF
# Same row/column layout as above, but counts are re-weighted by TF-IDF.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(clean_texts)
print("\nTF-IDF:\n", tfidf_matrix.toarray())

# Print the learned vocabulary so the matrix columns can be interpreted:
# column i of the BOW matrix corresponds to feature name i.
print("\nFeature Names:", bow.get_feature_names_out())

"""
Possible Errors & Fixes:
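1. LookupError: punkt/punkt_tab/stopwords not found → run nltk.download('punkt'), nltk.download('punkt_tab') (needed on NLTK 3.9+), nltk.download('stopwords')
2. ValueError: empty vocabulary → ensure texts are not empty and do not consist only of stop words
3. UnicodeDecodeError → decode input as UTF-8, e.g. open(path, encoding='utf-8') when reading text from a file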
"""


# In [None]:
# --- Q1: Text Preprocessing & Representation ---

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download dependencies (only first time)
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) make word_tokenize() load its models
# from 'punkt_tab' instead of 'punkt'; without this resource the tokenizer
# raises LookupError even though 'punkt' is installed.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample text
text1 = "Natural Language Processing is amazing!"
text2 = "Language Processing with Python is powerful."

# 1. Text Preprocessing
# Keep the English stop words in a set so each membership test in
# preprocess() is a constant-time lookup instead of a list scan.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Return *text* lowercased with non-alphabetic tokens and stop words removed.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic or that appear in the module-level ``stop_words`` set are
    dropped, and the remaining tokens are joined with single spaces.
    """
    lowered = text.lower()
    kept = (
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    )
    return ' '.join(kept)

# Clean both sample sentences with preprocess().
clean_texts = [preprocess(raw) for raw in (text1, text2)]
print("Preprocessed Texts:", clean_texts)

# Both vectorizers are created with their default settings.
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# 2. Representation - Bag of Words
# Rows are documents, columns are vocabulary terms, values are raw counts.
bow_matrix = bow_vectorizer.fit_transform(clean_texts)
print("\nBag of Words Representation:\n", bow_matrix.toarray())

# 3. Representation - TF-IDF
# Same layout, with TF-IDF weights in place of raw counts.
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_texts)
print("\nTF-IDF Representation:\n", tfidf_matrix.toarray())

# Vocabulary learned by the Bag of Words vectorizer (labels the columns).
print("\nFeature Names:", bow_vectorizer.get_feature_names_out())

"""
Possible Errors & Fixes:
1. LookupError: 'punkt' (or 'punkt_tab' on NLTK 3.9+) not found → Run nltk.download('punkt') and nltk.download('punkt_tab')
2. LookupError: 'stopwords' not found → Run nltk.download('stopwords')
3. ValueError: Empty vocabulary → Ensure the text isn't empty or made up entirely of stopwords.
4. Encoding errors in text → Read files with encoding='utf-8' and decode bytes with .decode('utf-8') rather than str().
"""
