In [1]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
# Precompute the pairwise distance matrix (upper triangular)
from scipy.spatial.distance import pdist, squareform
import pandas as pd
from sklearn.model_selection import train_test_split


  from tqdm.autonotebook import tqdm, trange
2024-09-11 17:05:07.657399: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-11 17:05:07.657451: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-11 17:05:07.658760: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-11 17:05:07.667099: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_da

   index                              description_processed  \
0      0   true story notorious australian outlaw ned kelly   
1      1  two men high rank wooing beautiful famous eque...   
2      2  fabled queen egypt affair roman general marc a...   
3      3  loosely adapted dante divine comedy inspired i...   
4      4  account life jesus christ based books new test...   

                         genre  
0    [Biography, Crime, Drama]  
1                      [Drama]  
2             [Drama, History]  
3  [Adventure, Drama, Fantasy]  
4           [Biography, Drama]  
(78843, 3)
Index(['index', 'description_processed', 'genre'], dtype='object')


In [2]:
# Load your data
data = preprocess_data()

data['num_genres'] = data['genre'].apply(len)
data = data[data['num_genres'] == 1]

# Assuming data is a pandas DataFrame with 'description_processed' and 'genre' columns
descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [3]:
# Encode the genres as numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model = SentenceTransformer('bert-base-nli-mean-tokens')


# Convert descriptions to vector embeddings
X = model.encode(descriptions, show_progress_bar=True)

# Split the data into train and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Active Learning parameters
initial_train_size = 50  # Initial training set size
iterations = 38  # Number of iterations for active learning
sample_size = 500  # Samples to add per iteration

# Start by selecting the furthest points from each other for initial training
np.random.seed(42)
pool_indices = np.random.choice(len(X_train_full), initial_train_size, replace=False)
X_train = X_train_full[pool_indices]
y_train = y_train_full[pool_indices]

# Initialize FAISS Index for ANN search
# res = faiss.StandardGpuResources() 
faiss.omp_set_num_threads(12)
embedding_dim = X_train.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(X_train)  # Add only the training data to the index
# index = faiss.index_cpu_to_gpu(res, 0, index)

# Remaining pool of indices
remaining_indices = list(set(range(len(X_train_full))) - set(pool_indices))


  y = column_or_1d(y, warn=True)
Batches: 100%|██████████| 762/762 [01:04<00:00, 11.85it/s]


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import faiss

# Initialize classifier for single-label classification
clf = RandomForestClassifier()

# Active Learning Loop using FAISS for selecting distant points
for iteration in range(iterations):
    # Train the classifier on the current training set
    clf.fit(X_train, y_train)
    
    # Optionally evaluate the classifier on the test set after each iteration (can be commented out)
    y_pred = clf.predict(X_test)
    print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy_score(y_test, y_pred):.4f}")

    # Stop if there are not enough remaining samples or if the max training set size is reached
    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= len(X_train_full):
        break

    # Search for the most distant points from the current training set
    D_pool, _ = index.search(X_train_full[remaining_indices], len(X_train))

    # With GPU
    # batch_size = 2048  # Batch size for ANN search

    # D_pool_list = []  # List to store distance results
    # for i in range(0, len(remaining_indices), batch_size):
    #     # Get the current batch of remaining indices
    #     batch_indices = remaining_indices[i:i + batch_size]
    #     # Search for the closest points in the current batch
    #     D_batch, _ = index.search(X_train_full[batch_indices], len(X_train))
    #     D_pool_list.extend(D_batch.tolist())  # Convert np array to list and extend it
    
    # # Concatenate all numpy arrays in D_pool_list along axis 0 (rows)
    # D_pool = np.vstack(D_pool_list)  # Using np.vstack to concatenate numpy arrays along rows
    
    # Compute the mean distance to the current training set for all remaining points
    avg_distances = D_pool.mean(axis=1)

    # Select the indices of the most distant points in the remaining pool
    furthest_indices_within_remaining = np.argsort(avg_distances)[-sample_size:]

    # Map the selected indices back to the original dataset indices
    selected_indices = [remaining_indices[i] for i in furthest_indices_within_remaining]

    # index.add(X_train)  # Add the current training set to FAISS

    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X_train_full[selected_indices]])
    y_train = np.concatenate([y_train, y_train_full[selected_indices]], axis=0)

    # Remove the selected indices from the pool
    remaining_indices = list(set(remaining_indices) - set(selected_indices))
    
    index.add(X_train_full[selected_indices])


# Final evaluation on the test set
y_pred_final = clf.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")


Iteration 1: Test Accuracy = 0.4124
Iteration 2: Test Accuracy = 0.4315


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 3: Test Accuracy = 0.5260


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 4: Test Accuracy = 0.5317


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 5: Test Accuracy = 0.5342


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 6: Test Accuracy = 0.5356


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 7: Test Accuracy = 0.5477


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 8: Test Accuracy = 0.5518


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 9: Test Accuracy = 0.5502


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 10: Test Accuracy = 0.5580


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 11: Test Accuracy = 0.5500


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 12: Test Accuracy = 0.5504


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 13: Test Accuracy = 0.5522


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 14: Test Accuracy = 0.5588


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 15: Test Accuracy = 0.5559


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 16: Test Accuracy = 0.5681


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 17: Test Accuracy = 0.5549


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 18: Test Accuracy = 0.5619


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 19: Test Accuracy = 0.5648


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 20: Test Accuracy = 0.5613


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 21: Test Accuracy = 0.5631


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 22: Test Accuracy = 0.5635


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 23: Test Accuracy = 0.5679


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 24: Test Accuracy = 0.5666


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 25: Test Accuracy = 0.5631


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 26: Test Accuracy = 0.5623


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


Iteration 27: Test Accuracy = 0.5689


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

# Assuming y is your true labels and y_pred_final is the predicted labels
print(classification_report(y_test, y_pred_final, target_names=label_encoder.classes_))


In [None]:
# Compute pairwise distances between all vectors
distance_matrix = squareform(pdist(X, metric='euclidean'))

In [None]:
clf = MultiOutputClassifier(RandomForestClassifier())

# Active Learning Loop with precomputed distances
for iteration in range(iterations):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")
    
    # Compute distances between the current training set and the rest of the dataset using precomputed matrix
    D = distance_matrix[initial_indices, :]  # Use the indices from the current training set
    
    # Select the furthest points from the training set
    furthest_indices = np.argsort(D.mean(axis=0))[-sample_size:]
    
    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[furthest_indices]])
    y_train = np.concatenate([y_train, y[furthest_indices]], axis=0)
    
    # Remove the selected indices from the pool
    remaining_indices = list(set(remaining_indices) - set(furthest_indices))
    
    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= 600:
        break


In [None]:
# Final evaluation
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")