In [3]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize


In [4]:
# Load the data
data = preprocess_data()

data['num_genres'] = data['genre'].apply(len)
data = data[data['num_genres'] == 1]

descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [5]:
# Encode the genres as numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')


  y = column_or_1d(y, warn=True)


In [16]:
# Convert descriptions to vector embeddings
X = model.encode(descriptions, show_progress_bar=True)

# Split the data into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the embeddings for cosine similarity
X_train_full_normalized = normalize(X_train_full, axis=1, norm='l2')
X_test_normalized = normalize(X_test, axis=1, norm='l2')

# Active Learning parameters
initial_train_size = 50 
iterations = 38  
sample_size = 500 

#Selecting initial training set randomly
np.random.seed(42)
pool_indices = np.random.choice(len(X_train_full_normalized), initial_train_size, replace=False)
X_train = X_train_full_normalized[pool_indices]
y_train = np.array(y_train_full)[pool_indices]


# Initialize the FAISS Index for Cosine Similarity
faiss.omp_set_num_threads(12)
embedding_dim = X_train_full_normalized.shape[1]
index = faiss.IndexFlatIP(embedding_dim)  # Inner product index for cosine similarity
index.add(X_train_full_normalized)  # Add all normalized vectors to the index

# Remaining pool of indices
remaining_indices = list(set(range(len(X_train_full_normalized))) - set(pool_indices))

# Initialize the classifier
clf = RandomForestClassifier(n_estimators=200,max_depth=10,min_samples_split=5,  random_state=42)

Batches: 100%|██████████| 762/762 [01:03<00:00, 12.07it/s]


In [15]:
# Active Learning Loop
for iteration in range(iterations):
    # Define number of clusters
    num_clusters = min(int(np.sqrt(len(remaining_indices))), len(remaining_indices))  
    
    # Initialize FAISS clustering
    clustering = faiss.Clustering(embedding_dim, num_clusters)
    clustering.verbose = False
    clustering.niter = 50  # Number of iterations for clustering
    
    # Convert remaining indices to the appropriate format
    remaining_data = np.array([X_train_full_normalized[i] for i in remaining_indices]).astype('float32')
    index_flat = faiss.IndexFlatL2(embedding_dim)  # Initialize FAISS index for clustering
    clustering.train(remaining_data, index_flat)
    
    # Get cluster assignments
    D, cluster_assignments = index_flat.search(remaining_data, 1)  
    
    # Convert FAISS centroids to numpy array
    centroids = faiss.vector_to_array(clustering.centroids).reshape(num_clusters, embedding_dim)

    # Select samples from each cluster
    selected_indices = []
    for cluster in range(num_clusters):
        cluster_indices = [i for i, label in zip(remaining_indices, cluster_assignments) if label[0] == cluster]
        if cluster_indices:
            # Find the closest sample to the cluster center
            cluster_center = centroids[cluster].reshape(1, -1)
            distances, _ = index.search(cluster_center, len(cluster_indices))
            closest_sample_index = cluster_indices[distances[0].argmin()]
            selected_indices.append(closest_sample_index)

    # Add selected samples to the training set
    X_train = np.vstack((X_train, X_train_full_normalized[selected_indices]))
    y_train = np.concatenate((y_train, np.array(y_train_full)[selected_indices]))

    # Update remaining indices
    remaining_indices = list(set(remaining_indices) - set(selected_indices))

    # Update FAISS Index with new training data
    index.reset()
    index.add(X_train)

    # Train the classifier on the updated training set
    clf.fit(X_train, y_train)

    # Evaluate the classifier on the test set after each iteration
    y_pred = clf.predict(X_test_normalized)
    print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy_score(y_test, y_pred):.4f}")

Iteration 1: Test Accuracy = 0.4925
Iteration 2: Test Accuracy = 0.4937
Iteration 3: Test Accuracy = 0.5011
Iteration 4: Test Accuracy = 0.5073
Iteration 5: Test Accuracy = 0.5208
Iteration 6: Test Accuracy = 0.5221
Iteration 7: Test Accuracy = 0.5262
Iteration 8: Test Accuracy = 0.5155
Iteration 9: Test Accuracy = 0.5223
Iteration 10: Test Accuracy = 0.5327
Iteration 11: Test Accuracy = 0.5338
Iteration 12: Test Accuracy = 0.5266
Iteration 13: Test Accuracy = 0.5340
Iteration 14: Test Accuracy = 0.5428
Iteration 15: Test Accuracy = 0.5338
Iteration 16: Test Accuracy = 0.5389
Iteration 17: Test Accuracy = 0.5377
Iteration 18: Test Accuracy = 0.5525
Iteration 19: Test Accuracy = 0.5447
Iteration 20: Test Accuracy = 0.5383
Iteration 21: Test Accuracy = 0.5455
Iteration 22: Test Accuracy = 0.5453
Iteration 23: Test Accuracy = 0.5451
Iteration 24: Test Accuracy = 0.5416
Iteration 25: Test Accuracy = 0.5463
Iteration 26: Test Accuracy = 0.5496
Iteration 27: Test Accuracy = 0.5481
Iteration 