In [57]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score,classification_report,pairwise_distances_argmin_min,silhouette_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances,pairwise_distances
from sklearn.cluster import KMeans
from collections import Counter



In [68]:
# Load the data
data = preprocess_data()

data['num_genres'] = data['genre'].apply(len)
data = data[data['num_genres'] == 1]

descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [69]:
# Encode the genres as numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')


  y = column_or_1d(y, warn=True)




In [70]:
# Convert descriptions to vector embeddings
X = model.encode(descriptions, show_progress_bar=True)

# Split the data into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the embeddings for cosine similarity
X_train_full_normalized = normalize(X_train_full, axis=1, norm='l2')
X_test_normalized = normalize(X_test, axis=1, norm='l2')

# Active Learning parameters
initial_train_size = 50 
iterations = 20
sample_size = 1000
uncertainty_threshold = 0.2
budget_per_iteration = 500    

#Selecting initial training set randomly
np.random.seed(42)
pool_indices = np.random.choice(len(X_train_full_normalized), initial_train_size, replace=False)
X_train = X_train_full_normalized[pool_indices]
y_train = np.array(y_train_full)[pool_indices]


# Initialize the FAISS Index for Cosine Similarity
faiss.omp_set_num_threads(12)
embedding_dim = X_train_full_normalized.shape[1]
index = faiss.IndexFlatIP(embedding_dim)  # Inner product index for cosine similarity
index.add(X_train_full_normalized)  # Add all normalized vectors to the index

# Remaining pool of indices
remaining_indices = list(set(range(len(X_train_full_normalized))) - set(pool_indices))

# Initialize the classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


Batches: 100%|██████████| 762/762 [01:04<00:00, 11.80it/s]


In [51]:
# Active Learning Loop
for iteration in range(iterations):
    # Define number of clusters
    num_clusters = min(int(np.sqrt(len(remaining_indices))), len(remaining_indices))
    
    # Initialize FAISS clustering
    clustering = faiss.Clustering(embedding_dim, num_clusters)
    clustering.verbose = False
    clustering.niter = 50  # Number of iterations for clustering
    
    # Convert remaining indices to the appropriate format
    remaining_data = np.array([X_train_full_normalized[i] for i in remaining_indices]).astype('float32')
    index_flat = faiss.IndexFlatL2(embedding_dim)  # Initialize FAISS index for clustering
    clustering.train(remaining_data, index_flat)
    
    # Get cluster assignments
    D, cluster_assignments = index_flat.search(remaining_data, 1)
    
    # Convert FAISS centroids to numpy array
    centroids = faiss.vector_to_array(clustering.centroids).reshape(num_clusters, embedding_dim)

    # Select samples from each cluster using a hybrid strategy
    selected_indices = []
    for cluster in range(num_clusters):
        cluster_indices = [i for i, label in zip(remaining_indices, cluster_assignments) if label == cluster]
        if cluster_indices:
            # Find the closest sample to the cluster center
            cluster_center = centroids[cluster].reshape(1, -1)
            distances, _ = index.search(cluster_center, len(cluster_indices))
            closest_sample_index = cluster_indices[distances[0].argmin()]
            selected_indices.append(closest_sample_index)

            # Diversity Sampling: Select most diverse samples in the cluster
            cluster_data = X_train_full_normalized[cluster_indices]
            pairwise_distances_matrix = pairwise_distances(cluster_data)
            diversity_scores = pairwise_distances_matrix.mean(axis=1)
            most_diverse_index = cluster_indices[np.argmax(diversity_scores)]
            selected_indices.append(most_diverse_index)

    # Uncertainty sampling
    if len(remaining_indices) > 0:
        # Predict probabilities for remaining samples
        probs = clf.predict_proba(X_train_full_normalized[remaining_indices])
        # Calculate uncertainty as 1 - max probability
        uncertainty = 1 - np.max(probs, axis=1)
        
        # Determine the number of uncertain samples to select
        num_uncertain_samples = int(sample_size * uncertainty_threshold)
        if num_uncertain_samples > len(remaining_indices):
            num_uncertain_samples = len(remaining_indices)
        
        # Select the most uncertain samples
        uncertain_indices = np.argsort(-uncertainty)[:num_uncertain_samples]
        
        # Add uncertain samples to selected indices
        selected_indices.extend([remaining_indices[i] for i in uncertain_indices])

    # Ensure selected indices are unique and limited to the budget per iteration
    selected_indices = list(set(selected_indices))
    if len(selected_indices) > budget_per_iteration:
        selected_indices = selected_indices[:budget_per_iteration]

    # Add selected samples to the training set
    X_train = np.vstack((X_train, X_train_full_normalized[selected_indices]))
    y_train = np.concatenate((y_train, np.array(y_train_full)[selected_indices]))

    # Remove selected samples from the pool
    remaining_indices = list(set(remaining_indices) - set(selected_indices))

    # Update FAISS Index with new training data
    index.add(X_train_full_normalized[selected_indices])

    # Train the classifier on the updated training set
    clf.fit(X_train, y_train)

    # Evaluate the classifier on the test set after each iteration
    y_pred = clf.predict(X_test_normalized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy:.4f}")


Iteration 1: Test Accuracy = 0.5120
Iteration 2: Test Accuracy = 0.5313
Iteration 3: Test Accuracy = 0.5290
Iteration 4: Test Accuracy = 0.5405
Iteration 5: Test Accuracy = 0.5502
Iteration 6: Test Accuracy = 0.5483
Iteration 7: Test Accuracy = 0.5506
Iteration 8: Test Accuracy = 0.5516
Iteration 9: Test Accuracy = 0.5580
Iteration 10: Test Accuracy = 0.5561
Iteration 11: Test Accuracy = 0.5555
Iteration 12: Test Accuracy = 0.5586
Iteration 13: Test Accuracy = 0.5617
Iteration 14: Test Accuracy = 0.5629
Iteration 15: Test Accuracy = 0.5590
Iteration 16: Test Accuracy = 0.5535
Iteration 17: Test Accuracy = 0.5699
Iteration 18: Test Accuracy = 0.5672
Iteration 19: Test Accuracy = 0.5687
Iteration 20: Test Accuracy = 0.5639


In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.48      0.12      0.19       134
           1       0.00      0.00      0.00        47
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00        10
           4       0.56      0.37      0.44      1371
           5       0.00      0.00      0.00        35
           6       0.56      0.89      0.68      2233
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00        15
           9       0.00      0.00      0.00         5
          10       0.68      0.44      0.53       406
          11       0.00      0.00      0.00         5
          12       0.00      0.00      0.00        16
          13       0.00      0.00      0.00        23
          14       0.00      0.00      0.00        80
          15       0.80      0.29      0.43        55
          16       0.00      0.00      0.00         3
          17       0.30    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
minority_focus_ratio = 0.3 
dynamic_clusters = 5

In [73]:
# Active Learning Loop
for iteration in range(iterations):
    # Adjust the number of clusters dynamically
    num_clusters = dynamic_clusters + iteration  # Increase clusters gradually

    # Initialize FAISS clustering
    clustering = faiss.Clustering(embedding_dim, num_clusters)
    clustering.verbose = False
    clustering.niter = 50

    # Convert remaining indices to the appropriate format
    remaining_data = np.array([X_train_full_normalized[i] for i in remaining_indices]).astype('float32')
    index_flat = faiss.IndexFlatL2(embedding_dim)
    clustering.train(remaining_data, index_flat)

    # Get cluster assignments
    D, cluster_assignments = index_flat.search(remaining_data, 1)
    
    # Convert FAISS centroids to numpy array
    centroids = faiss.vector_to_array(clustering.centroids).reshape(num_clusters, embedding_dim)

    # Select samples from each cluster using a refined strategy
    selected_indices = []
    for cluster in range(num_clusters):
        cluster_indices = [i for i, label in zip(remaining_indices, cluster_assignments) if label == cluster]
        if cluster_indices:
            # Select the most uncertain samples within each cluster
            cluster_data = np.array([X_train_full_normalized[i] for i in cluster_indices]).astype('float32')
            probs = clf.predict_proba(cluster_data)
            uncertainty = 1 - np.max(probs, axis=1)

            # Calculate diversity by distance from the centroid
            distances_from_centroid = np.linalg.norm(cluster_data - centroids[cluster], axis=1)
            combined_scores = uncertainty + distances_from_centroid

            # Prioritize samples with high uncertainty and diversity
            sorted_indices = np.argsort(-combined_scores)
            num_samples = min(int(len(cluster_indices) * uncertainty_threshold), len(cluster_indices))
            selected_cluster_indices = [cluster_indices[i] for i in sorted_indices[:num_samples]]
            selected_indices.extend(selected_cluster_indices)

    # Ensure unique samples and limit to the budget
    selected_indices = list(set(selected_indices))
    if len(selected_indices) > budget_per_iteration:
        selected_indices = selected_indices[:budget_per_iteration]

    # Add selected samples to the training set
    X_train = np.vstack((X_train, X_train_full_normalized[selected_indices]))
    y_train = np.concatenate((y_train, np.array(y_train_full)[selected_indices]))

    # Remove selected samples from the pool
    remaining_indices = list(set(remaining_indices) - set(selected_indices))

    # Update FAISS Index with new training data
    index.add(X_train_full_normalized[selected_indices])

    # Train the classifier on the updated training set
    clf.fit(X_train, y_train)

    # Evaluate the classifier on the test set after each iteration
    y_pred = clf.predict(X_test_normalized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy:.4f}")

Iteration 1: Test Accuracy = 0.5085
Iteration 2: Test Accuracy = 0.5295
Iteration 3: Test Accuracy = 0.5385
Iteration 4: Test Accuracy = 0.5516
Iteration 5: Test Accuracy = 0.5512
Iteration 6: Test Accuracy = 0.5488
Iteration 7: Test Accuracy = 0.5535
Iteration 8: Test Accuracy = 0.5621
Iteration 9: Test Accuracy = 0.5600
Iteration 10: Test Accuracy = 0.5642
Iteration 11: Test Accuracy = 0.5603
Iteration 12: Test Accuracy = 0.5600
Iteration 13: Test Accuracy = 0.5625
Iteration 14: Test Accuracy = 0.5639
Iteration 15: Test Accuracy = 0.5619
Iteration 16: Test Accuracy = 0.5617
Iteration 17: Test Accuracy = 0.5672
Iteration 18: Test Accuracy = 0.5642
Iteration 19: Test Accuracy = 0.5668
Iteration 20: Test Accuracy = 0.5676
