In [1]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
# Precompute the pairwise distance matrix (upper triangular)
from scipy.spatial.distance import pdist, squareform


  from tqdm.autonotebook import tqdm, trange
2024-08-17 21:19:47.594231: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 21:19:47.594342: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 21:19:47.658994: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-17 21:19:47.811941: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_da

   index                              description_processed  \
0      0   true story notorious australian outlaw ned kelly   
1      1  two men high rank wooing beautiful famous eque...   
2      2  fabled queen egypt affair roman general marc a...   
3      3  loosely adapted dante divine comedy inspired i...   
4      4  account life jesus christ based books new test...   

                         genre  
0    [Biography, Crime, Drama]  
1                      [Drama]  
2             [Drama, History]  
3  [Adventure, Drama, Fantasy]  
4           [Biography, Drama]  
(78843, 3)
Index(['index', 'description_processed', 'genre'], dtype='object')


In [2]:
# Load your data
data = preprocess_data()

# Assuming data is a pandas DataFrame with 'description_processed' and 'genre' columns
descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [3]:
# Encode the genres as numerical labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert descriptions to vector embeddings
X = model.encode(descriptions, show_progress_bar=True)

# Initialize FAISS Index for ANN search
embedding_dim = X.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(X)  # Adding all vectors to the index

# Active Learning parameters
initial_train_size = 50  # Initial training set size
iterations = 100  # Number of iterations for active learning
sample_size = 1000  # Samples to add per iteration

# Start by selecting the furthest points from each other for initial training
np.random.seed(42)
initial_indices = np.random.choice(len(X), initial_train_size, replace=False)
X_train = X[initial_indices]
y_train = y[initial_indices]

# Remaining pool of indices
remaining_indices = list(set(range(len(X))) - set(initial_indices))

Batches: 100%|██████████| 2464/2464 [00:35<00:00, 68.67it/s] 


In [4]:
# Precompute distances from the current training set to all other points in the dataset
# Note: FAISS search returns distances of `k` closest points. We set `k = len(X)` to cover all points.
D_all, I_all = index.search(X, len(X))

In [5]:
# Initialize classifier
clf = MultiOutputClassifier(RandomForestClassifier())

# Active Learning Loop using precomputed FAISS distances
for iteration in range(iterations):
    clf.fit(X_train, y_train)

    # Evaluate classifier on the entire dataset
    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")

    # Compute mean distances of all remaining points to the current training set using precomputed FAISS results
    avg_distances = D_all[initial_indices, :].mean(axis=0)

    # Get the valid indices from remaining_indices, and find their furthest points
    valid_avg_distances = np.array([avg_distances[i] for i in remaining_indices])

    # Select the furthest points from the current training set based on valid remaining points
    furthest_indices_within_remaining = np.argsort(valid_avg_distances)[-sample_size:]
    
    # Map these furthest indices back to the actual dataset indices
    selected_indices = [remaining_indices[i] for i in furthest_indices_within_remaining]
    
    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[selected_indices]])
    y_train = np.concatenate([y_train, y[selected_indices]], axis=0)

    # Remove the selected indices from the pool
    remaining_indices = list(set(remaining_indices) - set(selected_indices))
    
    # Update initial_indices to include the new training samples
    initial_indices = np.concatenate([initial_indices, selected_indices])

    # Stop if there are not enough remaining samples or if the max training set size is reached
    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= 60000:
        break

# Final evaluation on the entire dataset
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")

Iteration 1: Accuracy = 0.1269
Iteration 2: Accuracy = 0.1155
Iteration 3: Accuracy = 0.1347
Iteration 4: Accuracy = 0.1501
Iteration 5: Accuracy = 0.1625
Iteration 6: Accuracy = 0.1753
Iteration 7: Accuracy = 0.1880
Iteration 8: Accuracy = 0.1987
Iteration 9: Accuracy = 0.2121
Iteration 10: Accuracy = 0.2236
Iteration 11: Accuracy = 0.2362
Iteration 12: Accuracy = 0.2471
Iteration 13: Accuracy = 0.2592
Iteration 14: Accuracy = 0.2715
Iteration 15: Accuracy = 0.2821
Iteration 16: Accuracy = 0.2940
Iteration 17: Accuracy = 0.3060


In [None]:
from sklearn.metrics import classification_report


print(classification_report(y, y_pred_final, target_names=mlb.classes_))

In [None]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize classifier
clf = MultiOutputClassifier(RandomForestClassifier())

# Active Learning Loop with Random Sampling
for iteration in range(iterations):
    # Train the classifier
    clf.fit(X_train, y_train)

    # Evaluate the classifier on the entire dataset
    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")

    # Randomly select points from the remaining pool
    sample_size = min(sample_size, len(remaining_indices))
    selected_indices = np.random.choice(remaining_indices, size=sample_size, replace=False)

    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[selected_indices]])
    y_train = np.concatenate([y_train, y[selected_indices]], axis=0)

    # Remove the selected indices from the pool
    remaining_indices = list(set(remaining_indices) - set(selected_indices))

    # Stop if there are not enough remaining samples or if the max training set size is reached
    if len(remaining_indices) == 0 or len(X_train) >= 60000:
        break

# Final evaluation on the entire dataset
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")


In [None]:
from sklearn.metrics import classification_report


print(classification_report(y, y_pred_final, target_names=mlb.classes_))

In [None]:
# Compute pairwise distances between all vectors
distance_matrix = squareform(pdist(X, metric='euclidean'))

In [None]:
clf = MultiOutputClassifier(RandomForestClassifier())

# Active Learning Loop with precomputed distances
for iteration in range(iterations):
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")
    
    # Compute distances between the current training set and the rest of the dataset using precomputed matrix
    D = distance_matrix[initial_indices, :]  # Use the indices from the current training set
    
    # Select the furthest points from the training set
    furthest_indices = np.argsort(D.mean(axis=0))[-sample_size:]
    
    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[furthest_indices]])
    y_train = np.concatenate([y_train, y[furthest_indices]], axis=0)
    
    # Remove the selected indices from the pool
    remaining_indices = list(set(remaining_indices) - set(furthest_indices))
    
    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= 600:
        break


In [None]:
# Final evaluation
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")