In [2]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from preprocess_data import preprocess_data
import pandas as pd
import pickle

In [None]:
# test git here

In [18]:

# Load your data
data = preprocess_data()

# Assuming data is a pandas DataFrame with 'description_processed' and 'genre' columns
descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()

In [23]:
# Define the sample size
sample_size = 40000  # Total number of samples to take from the full dataset

# Randomly sample indices for the subset
np.random.seed(42)
total_samples = len(descriptions)
sample_indices = np.random.choice(total_samples, sample_size, replace=False)

# Create the sampled descriptions and genres
sampled_descriptions = [descriptions[i] for i in sample_indices]
sampled_genres = [genres[i] for i in sample_indices]

# Encode the genres as numerical labels
mlb = MultiLabelBinarizer()
y_sampled = mlb.fit_transform(sampled_genres)

# Load a pre-trained sentence-transformer model to convert text to embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert descriptions to vector embeddings
X_sampled = model.encode(sampled_descriptions, show_progress_bar=True)

# Split the sampled data into train and test sets
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42
)

# List of metrics to try
metrics = ['L2', 'InnerProduct', 'Cosine']

Batches: 100%|██████████| 1250/1250 [00:17<00:00, 71.39it/s] 


In [24]:

for metric in metrics:
    print(f"\nUsing metric: {metric}")

    # Initialize FAISS Index for the current metric
    embedding_dim = X_train_sampled.shape[1]
    
    if metric == 'L2':
        index = faiss.IndexFlatL2(embedding_dim)
    elif metric == 'InnerProduct':
        index = faiss.IndexFlatIP(embedding_dim)
    elif metric == 'Cosine':
        index = faiss.IndexFlatIP(embedding_dim)
        # Normalize the embeddings for cosine similarity
        X_train_sampled = X_train_sampled / np.linalg.norm(X_train_sampled, axis=1, keepdims=True)
        X_test_sampled = X_test_sampled / np.linalg.norm(X_test_sampled, axis=1, keepdims=True)
    else:
        raise ValueError(f"Unsupported metric: {metric}")
    
    index.add(X_train_sampled)  # Adding all vectors to the index

    # Active Learning parameters
    initial_train_size = 50  # Initial training set size
    iterations = 100  # Number of iterations for active learning
    sample_size = 1000  # Samples to add per iteration

    # Start by selecting the furthest points from each other for initial training
    np.random.seed(42)
    initial_indices = np.random.choice(len(X_train_sampled), initial_train_size, replace=False)
    X_train = X_train_sampled[initial_indices]
    y_train = y_train_sampled[initial_indices]

    # Remaining pool of indices
    remaining_indices = list(set(range(len(X_train_sampled))) - set(initial_indices))

    # Precompute distances from the current training set to all other points in the sampled training dataset
    D_all, I_all = index.search(X_train_sampled, len(X_train_sampled))

    # Initialize classifier
    clf = MultiOutputClassifier(RandomForestClassifier())

    # Active Learning Loop using precomputed FAISS distances
    for iteration in range(iterations):
        clf.fit(X_train, y_train)

        # Evaluate classifier on the test set
        y_pred_test = clf.predict(X_test_sampled)
        print(f"Iteration {iteration + 1}: Test Accuracy = {accuracy_score(y_test_sampled, y_pred_test):.4f}")

        # Compute mean distances of all remaining points to the current training set using precomputed FAISS results
        avg_distances = D_all[initial_indices, :].mean(axis=0)

        # Get the valid indices from remaining_indices, and find their furthest points
        valid_avg_distances = np.array([avg_distances[i] for i in remaining_indices])

        # Select the furthest points from the current training set based on valid remaining points
        furthest_indices_within_remaining = np.argsort(valid_avg_distances)[-sample_size:]
        
        # Map these furthest indices back to the actual dataset indices
        selected_indices = [remaining_indices[i] for i in furthest_indices_within_remaining]
        
        # Add the selected samples to the training set
        X_train = np.vstack([X_train, X_train_sampled[selected_indices]])
        y_train = np.concatenate([y_train, y_train_sampled[selected_indices]], axis=0)

        # Remove the selected indices from the pool
        remaining_indices = list(set(remaining_indices) - set(selected_indices))
        
        # Update initial_indices to include the new training samples
        initial_indices = np.concatenate([initial_indices, selected_indices])

        # Stop if there are not enough remaining samples or if the max training set size is reached
        if len(remaining_indices) == 0 or len(remaining_indices) < sample_size or len(X_train) >= 60000:
            break

    # Final evaluation on the test set
    y_pred_final = clf.predict(X_test_sampled)
    final_accuracy = accuracy_score(y_test_sampled, y_pred_final)
    print(f"Final Test Accuracy using {metric}: {final_accuracy:.4f}")



Using metric: L2
Iteration 1: Test Accuracy = 0.0732
Iteration 2: Test Accuracy = 0.1261
Iteration 3: Test Accuracy = 0.1263
Iteration 4: Test Accuracy = 0.1259
Iteration 5: Test Accuracy = 0.1295
Iteration 6: Test Accuracy = 0.1338
Iteration 7: Test Accuracy = 0.1345
Iteration 8: Test Accuracy = 0.1339
Iteration 9: Test Accuracy = 0.1320
Iteration 10: Test Accuracy = 0.1353
Iteration 11: Test Accuracy = 0.1318
Iteration 12: Test Accuracy = 0.1335
Iteration 13: Test Accuracy = 0.1358
Iteration 14: Test Accuracy = 0.1368
Iteration 15: Test Accuracy = 0.1325
Iteration 16: Test Accuracy = 0.1338
Iteration 17: Test Accuracy = 0.1353
Iteration 18: Test Accuracy = 0.1343
Iteration 19: Test Accuracy = 0.1354
Iteration 20: Test Accuracy = 0.1341


KeyboardInterrupt: 