In [31]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances

In [4]:
# Load the data
data = preprocess_data()

data['num_genres'] = data['genre'].apply(len)
data = data[data['num_genres'] == 1]

descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [5]:
# Encode the genres as numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')


  y = column_or_1d(y, warn=True)


In [33]:
# Convert descriptions to vector embeddings
X = model.encode(descriptions, show_progress_bar=True)

# Split the data into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the embeddings for cosine similarity
X_train_full_normalized = normalize(X_train_full, axis=1, norm='l2')
X_test_normalized = normalize(X_test, axis=1, norm='l2')

# Active Learning parameters
initial_train_size = 50 
iterations = 20
sample_size = 1000

#Selecting initial training set randomly
np.random.seed(42)
pool_indices = np.random.choice(len(X_train_full_normalized), initial_train_size, replace=False)
X_train = X_train_full_normalized[pool_indices]
y_train = np.array(y_train_full)[pool_indices]


# Initialize the FAISS Index for Cosine Similarity
faiss.omp_set_num_threads(12)
embedding_dim = X_train_full_normalized.shape[1]
index = faiss.IndexFlatIP(embedding_dim)  # Inner product index for cosine similarity
index.add(X_train_full_normalized)  # Add all normalized vectors to the index

# Remaining pool of indices
remaining_indices = list(set(range(len(X_train_full_normalized))) - set(pool_indices))

# Initialize the classifier
clf = RandomForestClassifier(random_state=42)

Batches:   0%|          | 0/762 [00:00<?, ?it/s]

Batches: 100%|██████████| 762/762 [01:03<00:00, 12.05it/s]


In [34]:
clf.fit(X_train, y_train)

# Active Learning Loop
for iteration in range(iterations):
    # Define number of clusters
    num_clusters = min(int(np.sqrt(len(remaining_indices))), len(remaining_indices))
    
    # Initialize FAISS clustering
    clustering = faiss.Clustering(embedding_dim, num_clusters)
    clustering.verbose = False
    clustering.niter = 50  # Number of iterations for clustering
    
    # Convert remaining indices to the appropriate format
    remaining_data = np.array([X_train_full_normalized[i] for i in remaining_indices]).astype('float32')
    index_flat = faiss.IndexFlatL2(embedding_dim)  # Initialize FAISS index for clustering
    clustering.train(remaining_data, index_flat)
    
    # Get cluster assignments
    D, cluster_assignments = index_flat.search(remaining_data, 1)  
    
    # Convert FAISS centroids to numpy array
    centroids = faiss.vector_to_array(clustering.centroids).reshape(num_clusters, embedding_dim)

    # Select samples from each cluster using a hybrid strategy
    selected_indices = []
    for cluster in range(num_clusters):
        cluster_indices = [i for i, label in zip(remaining_indices, cluster_assignments) if label[0] == cluster]
        if cluster_indices:
            # Find the closest sample to the cluster center
            cluster_center = centroids[cluster].reshape(1, -1)
            distances, _ = index.search(cluster_center, len(cluster_indices))
            closest_sample_index = cluster_indices[distances[0].argmin()]
            selected_indices.append(closest_sample_index)

            # Diversity Sampling: Select most diverse samples in the cluster
            cluster_data = X_train_full_normalized[cluster_indices]
            pairwise_distances = cosine_distances(cluster_data)
            diversity_scores = pairwise_distances.mean(axis=1)
            most_diverse_index = cluster_indices[np.argmax(diversity_scores)]
            selected_indices.append(most_diverse_index)

    # Uncertainty sampling
    if len(remaining_indices) > 0:
        # Predict probabilities for remaining samples
        probs = clf.predict_proba(X_train_full_normalized[remaining_indices])
        # Calculate uncertainty as 1 - max probability
        uncertainty = 1 - np.max(probs, axis=1)
        uncertain_indices = np.argsort(-uncertainty)[:len(selected_indices)]

        # Add uncertain samples to selected indices
        selected_indices.extend([remaining_indices[i] for i in uncertain_indices])

    # Add selected samples to the training set
    X_train = np.vstack((X_train, X_train_full_normalized[selected_indices]))
    y_train = np.concatenate((y_train, np.array(y_train_full)[selected_indices]))

    # Update remaining indices
    remaining_indices = list(set(remaining_indices) - set(selected_indices))

    # Update FAISS Index with new training data
    index.add(X_train_full_normalized[selected_indices])


    # Train the classifier on the updated training set
    clf.fit(X_train, y_train)

    # Evaluate the classifier on the test set after each iteration
    y_pred = clf.predict(X_test_normalized)
    print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy_score(y_test, y_pred):.4f}")


Iteration 1: Test Accuracy = 0.5194
Iteration 2: Test Accuracy = 0.5410
Iteration 3: Test Accuracy = 0.5442
Iteration 4: Test Accuracy = 0.5492
Iteration 5: Test Accuracy = 0.5488
Iteration 6: Test Accuracy = 0.5551
Iteration 7: Test Accuracy = 0.5570
Iteration 8: Test Accuracy = 0.5623
Iteration 9: Test Accuracy = 0.5559
Iteration 10: Test Accuracy = 0.5660
Iteration 11: Test Accuracy = 0.5607
Iteration 12: Test Accuracy = 0.5662
Iteration 13: Test Accuracy = 0.5625
Iteration 14: Test Accuracy = 0.5580
Iteration 15: Test Accuracy = 0.5666
Iteration 16: Test Accuracy = 0.5664
Iteration 17: Test Accuracy = 0.5656
Iteration 18: Test Accuracy = 0.5672
Iteration 19: Test Accuracy = 0.5672
Iteration 20: Test Accuracy = 0.5676


In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.39      0.08      0.14       134
           1       0.00      0.00      0.00        47
           2       0.00      0.00      0.00        19
           3       0.00      0.00      0.00        10
           4       0.56      0.36      0.44      1371
           5       0.00      0.00      0.00        35
           6       0.56      0.90      0.69      2233
           7       0.00      0.00      0.00        53
           8       0.00      0.00      0.00        15
           9       0.00      0.00      0.00         5
          10       0.70      0.46      0.55       406
          11       0.00      0.00      0.00         5
          12       0.00      0.00      0.00        16
          13       0.00      0.00      0.00        23
          14       0.00      0.00      0.00        80
          15       0.89      0.31      0.46        55
          16       0.00      0.00      0.00         3
          17       0.14    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
