In [21]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score,classification_report,pairwise_distances_argmin_min,silhouette_score, hamming_loss, f1_score, precision_score, recall_score, jaccard_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from preprocess_data import preprocess_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_distances,pairwise_distances
from sklearn.cluster import KMeans
from collections import Counter
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier


In [3]:
data = preprocess_data()
descriptions = data['description_processed'].tolist()
genres = data['genre'].tolist()
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(genres)
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
X = model.encode(descriptions, show_progress_bar=True)

Batches: 100%|██████████| 2464/2464 [03:27<00:00, 11.87it/s]


In [101]:
initial_train_size = 1000
iterations = 10
uncertainty_threshold = 0.2
budget_per_iteration = 1000    

In [6]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_full_normalized = normalize(X_train_full, axis=1, norm='l2')
X_test_normalized = normalize(X_test, axis=1, norm='l2')
np.random.seed(42)
initial_indices = np.random.choice(len(X_train_full_normalized), initial_train_size, replace=False)

In [20]:
classes = [list(range(y_item.max() + 1)) for y_item in y.T]
clf = MultiOutputClassifier(SGDClassifier(loss='log_loss', random_state=42))

In [104]:
methods = ['random', 'cluster']

In [100]:
def sample(method, X_train, X_pool, remaining_indices, budget):
    if method == 'random':
        return np.random.choice(remaining_indices, budget, replace=False)
    
    elif method == 'cluster':
        num_clusters = 20
        clustering = faiss.Clustering(X_train.shape[1], num_clusters)
        clustering.verbose = False
        clustering.niter = 50
        remaining_data = np.array([X_pool[i] for i in remaining_indices]).astype('float32')
        index_flat = faiss.IndexFlatIP(X_train.shape[1])
        clustering.train(remaining_data, index_flat)
        D, cluster_assignments = index_flat.search(remaining_data, 1)
        centroids = faiss.vector_to_array(clustering.centroids).reshape(num_clusters, X_train.shape[1])

        selected_indices = []
        for cluster in range(num_clusters):
            cluster_indices = [i for i, label in zip(remaining_indices, cluster_assignments) if label == cluster]
            if cluster_indices:
                num_to_select_from_cluster = min((int((500/num_clusters)/2)), int(len(cluster_indices)/2))
                cluster_center = centroids[cluster].reshape(1, -1)
                distances, _ = index_flat.search(cluster_center, len(cluster_indices))
                closest_samples = [cluster_indices[i] for i in distances[0].argsort()[:num_to_select_from_cluster]]
                selected_indices.extend(closest_samples)

                cluster_data = X_pool[cluster_indices]
                pairwise_distances_matrix = pairwise_distances(cluster_data)
                diversity_scores = pairwise_distances_matrix.mean(axis=1)
                most_diverse_indices = np.argsort(-diversity_scores)[:num_to_select_from_cluster]
                diverse_samples = [cluster_indices[i] for i in most_diverse_indices]
                selected_indices.extend(diverse_samples)

        return selected_indices

In [105]:
acc_dict, f1_dict, loss_dict, jaccard_dict = {}, {}, {}, {}
for method in methods:
    print(f'Running active learning with {method} sampling')
    clf = MultiOutputClassifier(SGDClassifier(loss='log_loss', random_state=42))
    accuracies, f1_scores, losses, jaccards = [], [], [], []
    X_train = X_train_full_normalized[initial_indices]
    y_train = np.array(y_train_full)[initial_indices]
    remaining_indices = list(set(range(len(X_train_full_normalized))) - set(initial_indices))
    for iteration in range(iterations):
        clf.partial_fit(X_train, y_train, classes=classes)
        y_pred = clf.predict(X_test_normalized)
        acc = accuracy_score(y_test, y_pred)
        loss = hamming_loss(y_test, y_pred)
        jaccard = jaccard_score(y_test, y_pred, average='samples')
        f1 = f1_score(y_test, y_pred, average='micro')
        accuracies.append(acc)   
        losses.append(loss)
        jaccards.append(jaccard)
        f1_scores.append(f1)
        print(f'Iteration {iteration + 1}/{iterations} - Accuracy: {acc}, Hamming Loss: {loss}, Jaccard: {jaccard}, F1: {f1}')
        selected_indices = sample(method, X_train, X_train_full_normalized, remaining_indices, budget_per_iteration)
        X_train = np.vstack((X_train, X_train_full_normalized[selected_indices]))
        y_train = np.concatenate((y_train, np.array(y_train_full)[selected_indices]))
        remaining_indices = list(set(remaining_indices) - set(selected_indices))
        
    print(classification_report(y_test, y_pred, target_names=mlb.classes_))
    acc_dict[method] = accuracies
    f1_dict[method] = f1_scores
    loss_dict[method] = losses
    jaccard_dict[method] = jaccards
        

Running active learning with random sampling
Iteration 1/10 - Accuracy: 0.0025366224871583485, Hamming Loss: 0.1457948165095099, Jaccard: 0.18969627972101177, F1: 0.29690532481738946
Iteration 2/10 - Accuracy: 0.038302999556091066, Hamming Loss: 0.10351858807689869, Jaccard: 0.24768480351744987, F1: 0.3634400215976243
Iteration 3/10 - Accuracy: 0.1250554886169066, Hamming Loss: 0.0745962136031259, Jaccard: 0.36679259185442203, F1: 0.4764985793023176
Iteration 4/10 - Accuracy: 0.16044137231276556, Hamming Loss: 0.06734488797397035, Jaccard: 0.3873369691588982, F1: 0.4898094938930875
Iteration 5/10 - Accuracy: 0.1644999682922189, Hamming Loss: 0.06623999375600619, Jaccard: 0.4176517217325131, F1: 0.5149317710938058
Iteration 6/10 - Accuracy: 0.16304141036210287, Hamming Loss: 0.06498387781284604, Jaccard: 0.35818166444712196, F1: 0.4598479472883933
Iteration 7/10 - Accuracy: 0.17515378273828397, Hamming Loss: 0.06386922735454666, Jaccard: 0.39905405119749715, F1: 0.4976596071209331
Itera

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Iteration 1/10 - Accuracy: 0.0025366224871583485, Hamming Loss: 0.1457948165095099, Jaccard: 0.18969627972101177, F1: 0.29690532481738946
Iteration 2/10 - Accuracy: 0.07546451899296087, Hamming Loss: 0.09163548734859535, Jaccard: 0.29287873434617046, F1: 0.3999361124421019
Iteration 3/10 - Accuracy: 0.11192846724586214, Hamming Loss: 0.07890603277121129, Jaccard: 0.39095067477177947, F1: 0.4936215505501902
Iteration 4/10 - Accuracy: 0.10279662629209207, Hamming Loss: 0.07996214578749933, Jaccard: 0.3723459832280937, F1: 0.4764946346448646
Iteration 5/10 - Accuracy: 0.1368507831821929, Hamming Loss: 0.07317911969443455, Jaccard: 0.35381444606506435, F1: 0.4537062325886273
Iteration 6/10 - Accuracy: 0.1345678229437504, Hamming Loss: 0.06946930930696547, Jaccard: 0.3251928890016276, F1: 0.4308380959993605
Iteration 7/10 - Accuracy: 0.15308516710000633, Hamming Loss: 0.06837417132933653, Jaccard: 0.40570951444817893, F1: 0.5046910613636765
Iteration 8/10 - Accuracy: 0.1466801953199315, Ham

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
