In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Toy labelled corpus for a spam/ham demo: the first three messages are spam,
# the last three are ham. train_labels below is built in that same order.
train_texts = [
    "Buy now and win a prize",
    "Congratulations, you won!",
    "Limited offer just for you",
    "Normal email content",
    "Meeting tomorrow at 10am",
    "Project update attached",
]
train_labels = ["spam"] * 3 + ["ham"] * 3

# Two-stage model: raw text goes through the TF-IDF vectorizer, and the
# resulting features feed a 100-tree random forest. random_state is pinned
# so the fitted forest does not vary from run to run.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", RandomForestClassifier(random_state=42, n_estimators=100)),
    ]
)

# Fit the vectorizer and the classifier together on the labelled messages.
pipeline.fit(train_texts, train_labels)

# Messages to classify. All three are taken verbatim from the training set
# (one ham, two spam), so this run illustrates the predict API rather than
# measuring how well the model handles unseen mail.
test_texts = [train_texts[0], train_texts[3], train_texts[1]]

# One predicted label per input message, in input order.
predictions = pipeline.predict(test_texts)

# Report each message next to the label the model assigned to it.
for message, verdict in zip(test_texts, predictions):
    print(f"Text: '{message}' -> Prediction: {verdict}")


Text: 'Buy now and win a prize' -> Prediction: spam
Text: 'Normal email content' -> Prediction: ham
Text: 'Congratulations, you won!' -> Prediction: spam
