# In [1]:
# Import libraries
import re
import string
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Custom Text Preprocessor
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalizes raw text documents.

    Each document is lowercased, stripped of digits and ASCII punctuation,
    split on whitespace, and filtered against ``ENGLISH_STOP_WORDS``; the
    remaining words are re-joined with single spaces.
    """

    # Both objects are identical for every document, so they are built once
    # here instead of on every clean_text() call.
    _DIGITS_RE = re.compile(r'\d+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def clean_text(self, text):
        """Return the normalized form of a single document.

        Parameters
        ----------
        text : str
            Raw document.

        Returns
        -------
        str
            Lowercased text with digits, ASCII punctuation and English stop
            words removed, words separated by single spaces.
        """
        text = text.lower()
        text = self._DIGITS_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)

        # str.split() with no separator already collapses runs of whitespace
        # and ignores leading/trailing whitespace, so no separate
        # whitespace-normalization pass is needed before tokenizing.
        return " ".join(
            word for word in text.split() if word not in ENGLISH_STOP_WORDS
        )

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state.

        ``X`` and ``y`` are accepted but not used.
        """
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of str
            One cleaned string per input document, in input order.

        Raises
        ------
        ValueError
            If ``X`` is a single string rather than an iterable of documents.
        """
        # A bare string is itself iterable, so without this guard it would be
        # silently processed one character at a time.
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, "
                "string object received."
            )
        return [self.clean_text(text) for text in X]


# Function to build spam detection pipeline
def build_spam_pipeline():
    """Assemble a new, unfitted spam-detection pipeline.

    The steps run in this order, each constructed with default arguments:
    ``"preprocessing"`` (``TextPreprocessor``), ``"vectorizer"``
    (``TfidfVectorizer``) and ``"classifier"`` (``LogisticRegression``).

    Returns
    -------
    Pipeline
        The assembled pipeline.
    """
    steps = [
        ("preprocessing", TextPreprocessor()),
        ("vectorizer", TfidfVectorizer()),
        ("classifier", LogisticRegression()),
    ]
    return Pipeline(steps)