In [15]:
"""
Demo 1: Text feature extraction using TF-IDF on 20 Newsgroups
Demo 2: Image feature extraction using pixel intensities on Digits dataset
"""

import numpy as np

from sklearn.datasets import fetch_20newsgroups, load_digits
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as ImgLogReg


In [16]:
# ----------------------------
# DEMO 1: TEXT DATASET (20 Newsgroups)
# ----------------------------
def text_feature_extraction_demo(
    categories=None,
    test_size=0.25,
    random_state=42,
    max_features=20000,
    max_iter=2000,
):
    """Train and evaluate a TF-IDF + logistic-regression text classifier.

    Loads the "train" subset of 20 Newsgroups (headers, footers and quotes
    removed), holds out a stratified test split, fits a
    ``TfidfVectorizer -> LogisticRegression`` pipeline on the remainder and
    prints the dataset summary, the learned vocabulary size, the accuracy
    and a per-class classification report.

    All parameters are optional; calling the function with no arguments
    reproduces the original hard-coded configuration.

    Args:
        categories: Sequence of newsgroup names to load. ``None`` selects the
            four groups used by the original demo.
        test_size: Fraction (or absolute count) of samples held out for
            evaluation, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        max_features: Upper bound on the TF-IDF vocabulary size.
        max_iter: Iteration cap for the logistic-regression solver.

    Returns:
        None. All results are written to stdout.

    Raises:
        ValueError: If the selected categories yield no documents.
    """
    print("\n" + "=" * 70)
    print("DEMO 1: TEXT FEATURE EXTRACTION (20 Newsgroups + TF-IDF)")
    print("=" * 70)

    # Build the default inside the function to avoid a shared mutable default.
    if categories is None:
        categories = ["rec.sport.hockey", "sci.space", "talk.politics.mideast", "comp.graphics"]

    # NOTE: per the scikit-learn docs, this may download the dataset on first use.
    data = fetch_20newsgroups(
        subset="train",
        categories=categories,
        remove=("headers", "footers", "quotes")
    )

    X_text = data.data
    y = data.target

    # Fail early with a clear message instead of an IndexError on X_text[0] below.
    if len(X_text) == 0:
        raise ValueError(f"No documents found for categories: {categories!r}")

    print(f"Samples: {len(X_text)}")
    print(f"Classes: {len(data.target_names)} -> {data.target_names}")
    print("\nExample raw text snippet:")
    print(X_text[0][:400].replace("\n", " ") + " ...")

    # Stratify so every class keeps the same proportion in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X_text, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Pipeline: raw text -> TF-IDF numeric matrix -> classifier
    text_model = Pipeline(steps=[
        ("tfidf", TfidfVectorizer(
            lowercase=True,
            stop_words="english",
            ngram_range=(1, 2),  # unigrams and bigrams
            max_features=max_features
        )),
        ("clf", LogisticRegression(max_iter=max_iter))
    ])

    text_model.fit(X_train, y_train)
    preds = text_model.predict(X_test)

    print("\nTF-IDF Feature Space (after fit):")
    tfidf = text_model.named_steps["tfidf"]
    print(f"Number of features learned: {len(tfidf.get_feature_names_out())}")

    print("\nResults:")
    print("Accuracy:", round(accuracy_score(y_test, preds), 4))
    print(classification_report(y_test, preds, target_names=data.target_names))



In [19]:
# ----------------------------
# DEMO 2: IMAGE DATASET (Digits)
# ----------------------------
def image_feature_extraction_demo(test_size=0.25, random_state=42, max_iter=5000):
    """Train and evaluate a pixel-intensity logistic-regression digit classifier.

    Loads the scikit-learn Digits dataset, flattens each image into a 1-D
    vector of pixel values, holds out a stratified test split, fits a
    ``StandardScaler -> LogisticRegression`` pipeline on the remainder and
    prints the array shapes, the accuracy and a per-class classification
    report.

    All parameters are optional; calling the function with no arguments
    reproduces the original hard-coded configuration.

    Args:
        test_size: Fraction (or absolute count) of samples held out for
            evaluation, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        max_iter: Iteration cap for the logistic-regression solver.

    Returns:
        None. All results are written to stdout.
    """
    print("\n" + "=" * 70)
    print("DEMO 2: IMAGE FEATURE EXTRACTION (Digits + Pixel Features)")
    print("=" * 70)

    digits = load_digits()
    X_images = digits.images  # shape: (n_samples, 8, 8)
    y = digits.target

    print(f"Images shape: {X_images.shape} (n_samples, height, width)")
    print(f"Labels shape: {y.shape}, classes: {np.unique(y)}")

    # "Feature extraction" here: convert each 8x8 image into a 64-length numeric feature vector
    X = X_images.reshape(len(X_images), -1)  # flatten
    print(f"Flattened feature matrix shape: {X.shape} (n_samples, 64 pixel-features)")

    # Stratify so every digit keeps the same proportion in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Pipeline: numeric pixels -> scaling -> classifier
    # The scaler sits inside the pipeline, so it is fit on the training split only.
    img_model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", ImgLogReg(max_iter=max_iter))
    ])

    img_model.fit(X_train, y_train)
    preds = img_model.predict(X_test)

    print("\nResults:")
    print("Accuracy:", round(accuracy_score(y_test, preds), 4))
    print(classification_report(y_test, preds))


In [17]:
# Run Demo 1: prints the dataset summary, TF-IDF feature count, and test-set metrics.
text_feature_extraction_demo()



DEMO 1: TEXT FEATURE EXTRACTION (20 Newsgroups + TF-IDF)
Samples: 2341
Classes: 4 -> ['comp.graphics', 'rec.sport.hockey', 'sci.space', 'talk.politics.mideast']

Example raw text snippet:
: 8~> I require BGI drivers for Super VGA Displays and Super XVGA Displays. Does  : 8~> anyone know where I could obtain the relevant drivers ? (FTP sites ??)  : 	I would like to know too!  : Regards, : Dominic  garbo.uwasa.fi (or one of its many mirrors) has a file called "svgabg40" in the programming subdirectory. These are svga bgi drivers for a variety of cards.  [from the README]: "Card types ...

TF-IDF Feature Space (after fit):
Number of features learned: 20000

Results:
Accuracy: 0.9181
                       precision    recall  f1-score   support

        comp.graphics       0.92      0.93      0.93       146
     rec.sport.hockey       0.96      0.87      0.91       150
            sci.space       0.83      0.96      0.89       149
talk.politics.mideast       0.99      0.91      0.95     

In [18]:
# Run Demo 2: prints the image/feature array shapes and test-set metrics.
image_feature_extraction_demo()


DEMO 2: IMAGE FEATURE EXTRACTION (Digits + Pixel Features)
Images shape: (1797, 8, 8) (n_samples, height, width)
Labels shape: (1797,), classes: [0 1 2 3 4 5 6 7 8 9]
Flattened feature matrix shape: (1797, 64) (n_samples, 64 pixel-features)

Results:
Accuracy: 0.9778
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        45
           1       0.93      0.93      0.93        46
           2       1.00      1.00      1.00        44
           3       1.00      1.00      1.00        46
           4       0.98      1.00      0.99        45
           5       1.00      0.98      0.99        46
           6       1.00      0.98      0.99        45
           7       0.98      1.00      0.99        45
           8       0.91      0.93      0.92        43
           9       0.98      0.96      0.97        45

    accuracy                           0.98       450
   macro avg       0.98      0.98      0.98       450
weighted avg       0.98    