In [2]:
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_rcv1
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import hamming_loss
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the RCV1 dataset through scikit-learn's fetch_rcv1 helper.
# NOTE: fetch_rcv1 is already imported in the first cell; this re-import is
# redundant but harmless.
from sklearn.datasets import fetch_rcv1
rcv1 = fetch_rcv1()

In [4]:
# Pull the feature matrix (X) and the label matrix (y) out of the loaded dataset
X, y = rcv1.data, rcv1.target

In [5]:
# Fixed sizes for the two subsets
num_train_samples = 7000
num_test_samples = 3000

# Split without shuffling so the original row order is kept and the split is deterministic
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=num_train_samples,
    test_size=num_test_samples,
    shuffle=False,
)

In [6]:
# Densify the feature matrices (train, then test)
X_train, X_test = X_train.toarray(), X_test.toarray()
# Densify the label matrices (train, then test)
y_train, y_test = y_train.toarray(), y_test.toarray()

In [7]:
# Column labels: synthetic names for the features, dataset-provided names for the targets.
# Train and test come from the same matrix, so one list of feature names serves both.
feature_names = [f"feature_{idx}" for idx in range(X_test.shape[1])]
target_names = rcv1.target_names

# Feature DataFrames (train / test)
feature_df_train = pd.DataFrame(data=X_train, columns=feature_names)
feature_df_test = pd.DataFrame(data=X_test, columns=feature_names)

# Target DataFrames (train / test)
target_df_train = pd.DataFrame(data=y_train, columns=target_names)
target_df_test = pd.DataFrame(data=y_test, columns=target_names)

In [8]:
# Features that are zero for every training row carry no information for fitting
is_all_zero = (feature_df_train == 0).all(axis=0)
zero_columns = feature_df_train.columns[is_all_zero]

# Remove those columns from both splits so train and test keep identical layouts
feature_df_train = feature_df_train.drop(columns=zero_columns)
feature_df_test = feature_df_test.drop(columns=zero_columns)

In [9]:
# Labels that never occur in the training rows (column is zero everywhere)
is_all_zero = (target_df_train == 0).all(axis=0)
zero_columns = target_df_train.columns[is_all_zero]

# Remove those labels from both splits so train and test targets stay aligned
target_df_train = target_df_train.drop(columns=zero_columns)
target_df_test = target_df_test.drop(columns=zero_columns)

In [10]:
# Min-max scaling: learn the per-feature range on the training split only,
# then apply that same mapping to both splits.
scaler = MinMaxScaler().fit(feature_df_train)
X_train_scaled = scaler.transform(feature_df_train)
X_test_scaled = scaler.transform(feature_df_test)

In [11]:
# Dimensionality reduction: keep as many principal components as are needed
# to explain 95% of the variance of the scaled training features.
pca = PCA(
    n_components=0.95,
)
# Fit on the training split and project it; the test split reuses the fitted projection
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [13]:
# Linear SVM on the PCA-transformed features.
# MultiOutputClassifier wraps the estimator so that one linear-kernel SVC is
# fitted per target column.
svm_model = MultiOutputClassifier(SVC(kernel='linear'))

# Train the model on the training set
svm_model.fit(X_train_pca, target_df_train)

# Make predictions on the testing set
y_pred = svm_model.predict(X_test_pca)

# Evaluate the performance.
# Precision and recall are MICRO-averaged (average='micro'), so the printed
# labels below say "Micro" rather than "Weighted".
hamming_loss_value = hamming_loss(target_df_test, y_pred)
precision = precision_score(target_df_test, y_pred, average='micro')
recall = recall_score(target_df_test, y_pred, average='micro')
# With multilabel targets accuracy_score is the subset accuracy, i.e. the
# exact match ratio, which is why both lines below report the same value.
accuracy = accuracy_score(target_df_test, y_pred)
jaccard_similarity = sklearn.metrics.jaccard_score(target_df_test, y_pred, average='samples')

# Print the evaluation metrics
print("Metrics with PCA-transformed features:")
print(f"Hamming Loss: {hamming_loss_value * 100:.2f}%")
print(f"Micro Precision: {precision * 100:.2f}%")
print(f"Micro Recall: {recall * 100:.2f}%")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Exact Match Ratio: {accuracy * 100:.2f}%")
print(f"Jaccard Similarity: {jaccard_similarity * 100:.2f}%")


Metrics with PCA-transformed features:
Hamming Loss: 1.14%
Weighted Precision: 88.53%
Weighted Recall: 75.32%
Accuracy: 52.23%
Exact Match Ratio: 52.23%
Jaccard Similarity: 74.32%


In [14]:
# Decision Tree with PCA
# All names used here (MultiOutputClassifier, DecisionTreeClassifier, the metric
# functions and sklearn.metrics) are imported in the notebook's first cell.

# One decision tree is fitted per target column. random_state is fixed so the
# trees, and therefore the reported metrics, are reproducible across runs.
clf = MultiOutputClassifier(DecisionTreeClassifier(random_state=42))

# Train the classifier
clf.fit(X_train_pca, target_df_train)

# Predict on the test set
y_pred = clf.predict(X_test_pca)

# Evaluate the performance.
# Precision and recall use micro averaging, the same averaging the SVC and
# random-forest cells compute, so the three models are compared on one metric.
hamming_loss_value = hamming_loss(target_df_test, y_pred)
precision = precision_score(target_df_test, y_pred, average='micro')
recall = recall_score(target_df_test, y_pred, average='micro')
# With multilabel targets accuracy_score is the subset accuracy, i.e. the
# exact match ratio, which is why both lines below report the same value.
accuracy = accuracy_score(target_df_test, y_pred)
jaccard_similarity = sklearn.metrics.jaccard_score(target_df_test, y_pred, average='samples')

# Print the evaluation metrics, all as percentages
print("Metrics with PCA-transformed features:")
print(f"Hamming Loss: {hamming_loss_value * 100:.2f}%")
print(f"Micro Precision: {precision * 100:.2f}%")
print(f"Micro Recall: {recall * 100:.2f}%")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Exact Match Ratio: {accuracy * 100:.2f}%")
print(f"Jaccard Similarity: {jaccard_similarity * 100:.2f}%")


In [12]:
# Random forest with PCA
# RandomForestClassifier is fitted directly on the multi-column target frame;
# random_state is fixed for reproducibility.
rf_classifier_pca = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier on the training data with PCA-transformed features
rf_classifier_pca.fit(X_train_pca, target_df_train)

# Make predictions on the test set with PCA-transformed features
predictions_pca = rf_classifier_pca.predict(X_test_pca)

# Evaluate the performance of the model with PCA-transformed features.
# With multilabel targets accuracy_score is the subset accuracy, i.e. the
# exact match ratio, which is why both lines below report the same value.
accuracy_pca = accuracy_score(target_df_test, predictions_pca)
# Precision and recall are MICRO-averaged (average='micro'), so the printed
# labels below say "Micro" rather than "Weighted".
precision_pca = precision_score(target_df_test, predictions_pca, average='micro')
recall_pca = recall_score(target_df_test, predictions_pca, average='micro')

# Hamming loss and per-sample Jaccard similarity with PCA-transformed features
hamming_loss_value_pca = hamming_loss(target_df_test, predictions_pca)
jaccard_similarity_pca = sklearn.metrics.jaccard_score(target_df_test, predictions_pca, average='samples')

# Print the evaluation metrics with PCA-transformed features
print("Metrics with PCA-transformed features:")
print(f"Hamming Loss: {hamming_loss_value_pca * 100:.2f}%")
print(f"Micro Precision: {precision_pca * 100:.2f}%")
print(f"Micro Recall: {recall_pca * 100:.2f}%")
print(f"Accuracy: {accuracy_pca * 100:.2f}%")
print(f"Exact Match Ratio: {accuracy_pca * 100:.2f}%")
print(f"Jaccard Similarity: {jaccard_similarity_pca * 100:.2f}%")

Metrics with PCA-transformed features:
Hamming Loss: 2.57%
Weighted Precision: 89.29%
Weighted Recall: 25.38%
Accuracy: 11.03%
Exact Match Ratio: 11.03%
Jaccard Similarity: 26.17%
