In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import numpy as np

# Load dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Convert sparse to dense for PCA
X_dense = X.toarray()

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_dense, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}")
print(f"Number of classes: {len(set(y))}")

# Build pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Important for SVM
    ('pca', PCA(n_components=500, random_state=42)),
    ('svm', SVC(kernel='rbf', random_state=42))
])

# Parameter grid for SVM
param_grid = {
    'svm__C': [0.1, 1, 10],
    'svm__gamma': ['scale', 0.01, 0.001]
}

# Grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Results
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV accuracy: {grid_search.best_score_:.4f}")

# Test performance
test_score = grid_search.score(X_test, y_test)
print(f"Test accuracy: {test_score:.4f}")
# Show explained variance
explained_var = np.sum(grid_search.best_estimator_.named_steps['pca'].explained_variance_ratio_)
print(f"Explained variance by PCA: {explained_var:.4f}")

Training set size: 15076
Test set size: 3770
Number of features: 3000
Number of classes: 20
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters: {'svm__C': 1, 'svm__gamma': 'scale'}
Best CV accuracy: 0.6396
Test accuracy: 0.6629
Explained variance by PCA: 0.3381
