# S05 - Text Classification: Logistic Regression & Naive Bayes
## Exercises

### Exercise 1 (Easy)
Convert texts to Bag-of-Words representation using sklearn.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["I love this movie", "This movie is terrible", "Great film!", "Waste of time"]

# Create BoW representation
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(texts)

# Convert to array
print("Bag-of-Words Matrix:")
print(bow_matrix.toarray())

# Show vocabulary
print("\nVocabulary:")
print(vectorizer.get_feature_names_out())

Bag-of-Words Matrix:
[[0 0 0 1 1 0 0 1 0 0]
 [0 0 1 0 1 0 1 1 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 1]]

Vocabulary:
['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']


### Exercise 2 (Easy)
Convert the same texts to TF-IDF representation.

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts = ["I love this movie", "This movie is terrible", "Great film!", "Waste of time"]
# Create TF-IDF representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

# Convert to array
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Show vocabulary
print("\nVocabulary:")
print(vectorizer.get_feature_names_out())

TF-IDF Matrix:
[[0.         0.         0.         0.66767854 0.52640543 0.
  0.         0.52640543 0.         0.        ]
 [0.         0.         0.55528266 0.         0.43779123 0.
  0.55528266 0.43779123 0.         0.        ]
 [0.70710678 0.70710678 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.57735027]]

Vocabulary:
['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']


### Exercise 3 (Medium)
Train a Naive Bayes classifier for sentiment analysis.

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

texts = ["I love this movie", "Great film", "Excellent acting", "Best movie ever",
         "Terrible movie", "Waste of time", "Awful acting", "Worst film"]
labels = [1, 1, 1, 1, 0, 0, 0, 0]  # 1=positive, 0=negative

# Convert text to TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
# Train Naive Bayes classifier
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.25, random_state=42
)

# Train Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Exercise 4 (Medium)
Train a Logistic Regression classifier and compare with Naive Bayes.

In [None]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression and compare accuracy with Naive Bayes


### Exercise 5 (Hard - Research)
Implement Naive Bayes from scratch (without sklearn) for text classification.

*Hint: Use log probabilities to avoid underflow. Research: P(c|d) ∝ P(c) × Π P(w|c)*

In [None]:
import math
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.word_probs = defaultdict(dict)
    
    def fit(self, texts, labels):
        # Your implementation
        pass
    
    def predict(self, text):
        # Your implementation
        pass
