In [1]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
# Precompute the pairwise distance matrix (upper triangular)
from scipy.spatial.distance import pdist, squareform

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yarden\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   index                              description_processed  \
0      0   true story notorious australian outlaw ned kelly   
1      1  two men high rank wooing beautiful famous eque...   
2      2  fabled queen egypt affair roman general marc a...   
3      3  loosely adapted dante divine comedy inspired i...   
4      4  account life jesus christ based books new test...   

                         genre  
0    [Biography, Crime, Drama]  
1                      [Drama]  
2             [Drama, History]  
3  [Adventure, Drama, Fantasy]  
4           [Biography, Drama]  
(78843, 3)
Index(['index', 'description_processed', 'genre'], dtype='object')


In [2]:
# Run the project's preprocessing step to obtain the dataset.
data = preprocess_data()

# Pull the text and label columns out as plain Python lists, in row order.
# (Presumably `data` is a pandas DataFrame exposing both columns — confirm against preprocess_data.)
descriptions, genres = (
    data[column].tolist() for column in ('description_processed', 'genre')
)

In [3]:
# Binarize the per-sample genre lists into a multi-label indicator matrix
# (one row per description, one column per distinct genre).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert descriptions to vector embeddings (the recorded run took ~12.5 minutes).
# FAISS expects C-contiguous float32 input, so normalise dtype/layout once here
# instead of relying on whatever the encoder happens to return.
X = np.ascontiguousarray(
    model.encode(descriptions, show_progress_bar=True), dtype=np.float32
)

# Initialize an exact (brute-force) FAISS L2 index over all embeddings
embedding_dim = X.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(X)  # Adding all vectors to the index

# Active Learning parameters
initial_train_size = 50  # Initial training set size
iterations = 10  # Number of iterations for active learning
sample_size = 10  # Samples to add per iteration

# Seed the training set with a uniformly random sample of rows.
# (No diversity / "furthest apart" criterion is applied at this stage; the
# fixed seed only makes the random draw reproducible.)
np.random.seed(42)
initial_indices = np.random.choice(len(X), initial_train_size, replace=False)
X_train = X[initial_indices]
y_train = y[initial_indices]

# Remaining pool of unlabelled row indices. Sorted so that positional lookups
# into this list are deterministic (raw set iteration order is unspecified).
remaining_indices = sorted(set(range(len(X))) - set(initial_indices.tolist()))

Batches: 100%|██████████| 2464/2464 [12:37<00:00,  3.25it/s]


In [4]:
# Query the FAISS index with every row of X, asking for k = len(X) neighbours per query,
# i.e. an all-pairs search of the dataset against itself (not just the training set).
# NOTE(review): this asks for a len(X) x len(X) float32 distance array. The recorded run of
# this cell failed with "MemoryError: Unable to allocate 23.2 GiB" for shape (78843, 78843),
# so D_all / I_all were never created in that session.
# NOTE(review): FAISS search results are presumably ordered by distance within each query
# row, so column j of D_all would be the j-th nearest neighbour rather than point j; the
# matching point ids would be in I_all — confirm before indexing D_all by point id.
D_all, I_all = index.search(X, len(X))


MemoryError: Unable to allocate 23.2 GiB for an array with shape (78843, 78843) and data type float32

In [None]:
# Initialize classifier: one random forest per label column of `y`.
# A fixed random_state keeps the printed accuracies reproducible between runs.
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Upper bound on the labelled set; the loop stops once it is reached.
MAX_TRAIN_SIZE = 600

# Active learning loop with diversity sampling.
# Each round: fit, report accuracy, then move the `sample_size` pool points with the
# largest mean distance to the current training set from the pool into the training set.
# Distances are computed on the fly (pool x training set), so no N x N matrix is needed.
for iteration in range(iterations):
    clf.fit(X_train, y_train)

    # Evaluate classifier on the entire dataset.
    # Note: this includes the training rows, and accuracy_score on a multi-label
    # indicator matrix counts a sample as correct only if every label matches.
    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")

    # Candidate rows are exactly the ones not yet in the training set.
    pool = np.asarray(remaining_indices, dtype=np.int64)
    if pool.size == 0:
        break

    # Squared L2 distance (the metric IndexFlatL2 uses) from every pool point to every
    # training point, via ||p - t||^2 = ||p||^2 + ||t||^2 - 2 p.t
    X_pool = X[pool]
    pool_sq_norms = np.einsum('ij,ij->i', X_pool, X_pool)
    train_sq_norms = np.einsum('ij,ij->i', X_train, X_train)
    sq_dists = pool_sq_norms[:, None] + train_sq_norms[None, :] - 2.0 * (X_pool @ X_train.T)
    np.maximum(sq_dists, 0.0, out=sq_dists)  # clip tiny negatives caused by round-off

    # Mean distance of each pool point to the current training set
    avg_distances = sq_dists.mean(axis=1)

    # Positions (within `pool`) of the furthest points, mapped back to dataset row ids
    furthest_positions = np.argsort(avg_distances)[-sample_size:]
    selected_indices = pool[furthest_positions]

    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[selected_indices]])
    y_train = np.concatenate([y_train, y[selected_indices]], axis=0)

    # Remove the selected rows from the pool (kept as a sorted-order list)
    remaining_indices = np.delete(pool, furthest_positions).tolist()

    # Keep initial_indices in step with the rows of X_train
    initial_indices = np.concatenate([initial_indices, selected_indices])

    # Stop if there are not enough remaining samples or if the max training set size is reached
    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= MAX_TRAIN_SIZE:
        break

# The loop adds a batch after its last fit, so refit once more to make the final
# score reflect the complete training set.
clf.fit(X_train, y_train)

# Final evaluation on the entire dataset
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y, y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")

In [5]:
# Compute Euclidean distances between all pairs of rows of X: pdist produces the condensed
# (one entry per unordered pair) vector and squareform expands it into a full square matrix.
# NOTE(review): the recorded run of this cell failed inside this statement with
# "MemoryError: Unable to allocate 23.2 GiB" for a float64 array of shape (3108069903,),
# i.e. already at the condensed vector; the expanded square matrix would need roughly
# twice that again. distance_matrix was therefore never created in that session.
distance_matrix = squareform(pdist(X, metric='euclidean'))

MemoryError: Unable to allocate 23.2 GiB for an array with shape (3108069903,) and data type float64

In [None]:
# One random forest per label column of `y`; seeded so printed accuracies are reproducible.
clf = MultiOutputClassifier(RandomForestClassifier(random_state=42))

# Upper bound on the labelled set; the loop stops once it is reached.
MAX_TRAIN_SIZE = 600

# Active learning loop with diversity sampling on Euclidean distance.
# Note: this continues from whatever X_train / y_train / remaining_indices /
# initial_indices currently hold, so it extends any training set built by earlier cells.
# Distances are computed per round between the pool and the training set only, which
# avoids needing a full N x N distance matrix.
for iteration in range(iterations):
    clf.fit(X_train, y_train)

    # Accuracy over the whole dataset (training rows included); on a multi-label
    # indicator matrix a sample counts as correct only if every label matches.
    y_pred = clf.predict(X)
    print(f"Iteration {iteration + 1}: Accuracy = {accuracy_score(y, y_pred):.4f}")

    # Only rows still in the pool are candidates, so a training row can never be re-selected.
    pool = np.asarray(remaining_indices, dtype=np.int64)
    if pool.size == 0:
        break

    # Euclidean distance from every pool point to every training point, via
    # ||p - t||^2 = ||p||^2 + ||t||^2 - 2 p.t followed by a square root.
    X_pool = X[pool]
    pool_sq_norms = np.einsum('ij,ij->i', X_pool, X_pool)
    train_sq_norms = np.einsum('ij,ij->i', X_train, X_train)
    D = pool_sq_norms[:, None] + train_sq_norms[None, :] - 2.0 * (X_pool @ X_train.T)
    np.maximum(D, 0.0, out=D)  # clip tiny negatives caused by round-off before sqrt
    np.sqrt(D, out=D)

    # Select the pool points with the largest mean distance to the training set
    furthest_positions = np.argsort(D.mean(axis=1))[-sample_size:]
    furthest_indices = pool[furthest_positions]

    # Add the selected samples to the training set
    X_train = np.vstack([X_train, X[furthest_indices]])
    y_train = np.concatenate([y_train, y[furthest_indices]], axis=0)

    # Remove the selected rows from the pool and record them as training rows,
    # so the next round measures distance to the updated training set.
    remaining_indices = np.delete(pool, furthest_positions).tolist()
    initial_indices = np.concatenate([initial_indices, furthest_indices])

    if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= MAX_TRAIN_SIZE:
        break

# The loop adds a batch after its last fit; refit so later evaluation cells score a
# model trained on the complete training set.
clf.fit(X_train, y_train)


In [None]:
# Score the current classifier against the labels of every row in the dataset
# and report the result to four decimal places.
y_pred_final = clf.predict(X)
final_accuracy = accuracy_score(y_true=y, y_pred=y_pred_final)
print(f"Final Accuracy: {final_accuracy:.4f}")