# 📘 ML Pipeline Reference Notebook

This notebook is a **comprehensive end-to-end guide** to solving a text classification problem using:
- `CountVectorizer` for feature extraction
- `LogisticRegression` for classification
- Basic feature engineering (e.g., question mark counts)
- `hstack` to combine features (imported for reference; in this notebook `ColumnTransformer` does the stacking for you)
- `Pipeline` and `ColumnTransformer` for clean code
- All key metrics: Accuracy, Precision, Recall, F1

✅ This is your go-to working version for interview preparation.


In [None]:
# ✅ Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer


In [None]:
# ✅ Step 2: Create a Sample Dataset
# Ten short text snippets, each paired with a binary label (same order in both lists).
sample_texts = [
    'You won’t believe what happened next!',
    'Mayor announces new housing plan',
    'How to lose weight with no effort?',
    'Breaking: Local team wins championship',
    'This one trick will save you thousands?',
    'The new update crashed my app!',
    'Scientists discover new particle',
    'Why is the app so slow?',
    'App notifications stopped working',
    'Weather report for the weekend',
]
sample_labels = [1, 0, 1, 0, 1, 1, 0, 1, 1, 0]

# Build the frame and attach the engineered feature in one expression:
# question_marks = how many '?' characters each text contains.
data = pd.DataFrame({'text': sample_texts, 'label': sample_labels}).assign(
    question_marks=[snippet.count('?') for snippet in sample_texts]
)
data

In [None]:
# ✅ Step 3: Train/Test Split
# Features: the raw text plus the engineered question-mark count. Target: the binary label.
X = data[['text', 'question_marks']]
y = data['label']

# stratify=y keeps the class ratio the same in the train and test partitions.
# With a dataset this small, an unstratified 30% hold-out can end up containing a
# single class, which makes precision / recall / F1 degenerate.
# random_state pins the shuffle so the split is reproducible on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# ✅ Step 4: ColumnTransformer + Pipeline
# CountVectorizer on 'text', passthrough on 'question_marks'

# Bag-of-words counts for the text column, with English stop words removed.
text_transform = CountVectorizer(stop_words='english')

# ColumnTransformer already slices out ['question_marks'] before handing data to this
# step, so the built-in 'passthrough' is all that is needed. Compared with wrapping a
# lambda in FunctionTransformer, it keeps the fitted pipeline picklable and allows
# get_feature_names_out() to be called on the whole preprocessor.
qmark_transform = 'passthrough'

preprocessor = ColumnTransformer(
    transformers=[
        # A scalar column name hands CountVectorizer the 1-D sequence of strings it expects.
        ('text', text_transform, 'text'),
        # A list selector keeps the count column 2-D so it can be stacked beside the text features.
        ('qmark', qmark_transform, ['question_marks'])
    ]
)

# Preprocessing and the classifier live in one estimator, so the vectorizer vocabulary
# is learned from the training rows only when fit() is called.
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression())
])

# Train pipeline
pipeline.fit(X_train, y_train)

In [None]:
# ✅ Step 5: Evaluate the Model
# Predict on the held-out rows, then report each metric with its caption.
y_pred = pipeline.predict(X_test)

# (caption, scoring function) pairs, printed in this order.
metric_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for caption, score_fn in metric_table:
    print(caption, score_fn(y_test, y_pred))

In [None]:
# ✅ Step 6: Inspect Model Coefficients (Text Only)
fitted_preprocessor = pipeline.named_steps['preprocessing']
fitted_classifier = pipeline.named_steps['classifier']

# Vocabulary learned by the CountVectorizer registered under the name 'text'.
feature_names = fitted_preprocessor.named_transformers_['text'].get_feature_names_out()

# Take the first len(feature_names) entries of the coefficient row; this relies on the
# 'text' transformer being listed first in the ColumnTransformer, so that its columns
# come before the question_marks column.
coefficients = fitted_classifier.coef_[0][:len(feature_names)]

# Rank (word, weight) pairs by absolute weight, largest first, and keep the top five.
word_weight_pairs = zip(feature_names, coefficients)
top_features = sorted(
    word_weight_pairs,
    key=lambda pair: abs(pair[1]),
    reverse=True,
)[:5]

print("Top 5 influential words:")
for word, coef in top_features:
    print(f"{word}: {coef:.4f}")