## Task completed individually by Yasmin Ebrahimi -  69.95% accuracy achieved with 150 features

## Goal
- Build a machine learning pipeline with the minimum number of features  
- Achieve over 62.5% accuracy on the test dataset  

## Result
- Test accuracy: **69.95%**  
- Threshold message: “Accuracy target (>=68%) achieved with 150 PCA components!”  
- Features used: 150 principal components  
- Model: Soft-voting ensemble of Logistic Regression, Random Forest, SVM, and XGBoost (weights 1:2:3:3)


In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV,train_test_split

# Set an environment variable for this process (and any child processes it spawns).
# NOTE(review): presumably a macOS workaround that relaxes the Objective-C
# fork-safety check so forked worker processes (e.g. n_jobs=-1 further down)
# do not crash — confirm it is still needed on the target platform.
os.environ["OBJC_DISABLE_INITIALIZE_FORK_SAFETY"] = "YES"

In [34]:
# Fetch the complete 20 Newsgroups corpus (train + test subsets), dropping
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# TF-IDF encode the raw posts: English stop words are removed and the
# vocabulary is capped at 7000 terms, so X has at most 7000 columns.
vectorizer = TfidfVectorizer(stop_words='english', max_features=7000)
X = vectorizer.fit_transform(newsgroups.data)  # sparse matrix, one row per post

# Integer class label for each post (one id per newsgroup)
y = newsgroups.target

In [35]:
# Convert the sparse TF-IDF matrix to a dense array for the scaling/PCA steps below.
# NOTE(review): this materialises every zero entry; with the whole corpus and
# up to 7000 columns that is presumably on the order of 1 GB as float64 —
# confirm it fits in memory on the target machine.
X_dense = X.toarray()

In [36]:
# Hold out 20% of the samples as the test set. Stratifying on y keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_dense,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [37]:
# Report the dimensions of the training features and training labels
for split_name, split_array in (("X_train", X_train), ("y_train", y_train)):
    print(f"{split_name} shape:", split_array.shape)

X_train shape: (15076, 7000)
y_train shape: (15076,)


In [38]:
# Sparsity check: fraction of entries in the training matrix that are non-zero
total_entries = X_train.size  # rows * columns
nonzero_ratio = np.count_nonzero(X_train) / total_entries
print(f"Non-zero Ratio: {nonzero_ratio:.4f}")

Non-zero Ratio: 0.0068


In [None]:
# View the distribution of TF-IDF feature values (currently commented out)
# plt.figure(figsize=(8,4))
# sns.histplot(X_train.flatten(), bins=50, kde=True)
# plt.title("Distribution of feature values (TF-IDF)")
# plt.xlabel("Feature value")
# plt.ylabel("Count")
# plt.show()

In [39]:
# Per-class sample counts in the training split
unique, counts = np.unique(y_train, return_counts=True)
for position, label in enumerate(unique):
    count = counts[position]
    print(f"Label {label}: {count} samples")

Label 0: 639 samples
Label 1: 778 samples
Label 2: 788 samples
Label 3: 786 samples
Label 4: 770 samples
Label 5: 790 samples
Label 6: 780 samples
Label 7: 792 samples
Label 8: 797 samples
Label 9: 795 samples
Label 10: 799 samples
Label 11: 793 samples
Label 12: 787 samples
Label 13: 792 samples
Label 14: 790 samples
Label 15: 798 samples
Label 16: 728 samples
Label 17: 752 samples
Label 18: 620 samples
Label 19: 502 samples


In [40]:
# Standardise each feature column. The scaler's statistics are learned from
# the training split only and then applied unchanged to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA: project onto the first 150 principal components. The projection is
# fitted on the scaled training data and reused for the test data.
pca = PCA(n_components=150, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [41]:
# Base classifiers with hand-picked hyperparameters. All are seeded with
# random_state=42 so repeated runs build the same models.
log_reg = LogisticRegression(C=1.5, solver='saga', class_weight='balanced', max_iter=3000, random_state=42)

rf = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=5, random_state=42)

# probability=True is required so the SVM can supply class probabilities to
# the soft-voting ensemble below.
svm = SVC(C=2, gamma=0.005, kernel='rbf', probability=True, random_state=42)

xgb = XGBClassifier(n_estimators=250, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.8, random_state=42, n_jobs=-1)

# Weighted soft-voting ensemble: predicted class probabilities are combined
# with weights 1 (LR), 2 (RF), 3 (SVM), 3 (XGBoost).
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf), ('svm', svm), ('xgb', xgb)],
    voting='soft',
    weights=[1, 2, 3, 3]  # Prioritize stronger models
)

# Train on the PCA-reduced training data
voting_clf.fit(X_train_pca, y_train)

# Predict & evaluate on the held-out test split
y_pred = voting_clf.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Single source of truth for the pass/fail threshold, so both messages always
# agree with the comparison (previously the failure message said 66% while
# the check used 68%).
TARGET_ACCURACY = 0.68
# Report the number of components actually fed to the model rather than a
# hard-coded figure.
n_components_used = X_train_pca.shape[1]

if accuracy >= TARGET_ACCURACY:
    print(f"Accuracy target (>={TARGET_ACCURACY:.0%}) achieved with {n_components_used} PCA components!")
else:
    print(f"Accuracy <{TARGET_ACCURACY:.0%}. Consider further tuning.")


Test Accuracy: 69.95%
Accuracy target (>=68%) achieved with 150 PCA components!
