In [None]:
import pickle
import pandas as pd

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Unpickling can execute arbitrary code, so only point this at files that
    you created yourself or otherwise trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Data the learner is fitted on in the next cell.
X_training = _load_pickle('x_training.pkl')
y_training = _load_pickle('y_training.pkl')

# Held-out data used for the accuracy scores in the next cell.
evaluation_x = _load_pickle('evaluation_x.pkl')
evaluation_y = _load_pickle('evaluation_y.pkl')

# Pool the learner queries from, and the label array that is filled in as
# the annotator answers.
Y_pool = _load_pickle('Y_pool.pkl')

# NOTE: unlike the files above, the next two paths carry no '.pkl' extension.
X_pool = _load_pickle('X_pool')

# Table with the 'combined_text' column shown to the annotator.
embeddings_unlabeled = _load_pickle('embeddings_unlabeled')


Set up the active learner and run the labeling loop

In [None]:
import sys
from sklearn.neural_network import MLPClassifier
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from typing import Callable
import numpy as np
from matplotlib import pyplot as plt 
from joblib import dump, load


# Lift pandas' display limits: show every column and never truncate the
# contents of a cell when a DataFrame/Series is printed.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

class DeepActiveLearner(ActiveLearner):
    """modAL ``ActiveLearner`` typed for an ``MLPClassifier`` estimator.

    Adds no behaviour of its own; it only forwards its arguments to
    ``ActiveLearner.__init__``.
    """

    def __init__(self,
                 estimator: MLPClassifier,
                 query_strategy: Callable = uncertainty_sampling,
                 on_transformed: bool = False,
                 X_training=None,
                 y_training=None,
                 **fit_kwargs
                 ) -> None:
        """Create the learner.

        Args:
            estimator: The classifier wrapped by the learner.
            query_strategy: Callable used to pick the next instance(s) to label.
            on_transformed: Forwarded to ``ActiveLearner`` under the same name.
            X_training: Optional initial training samples.
            y_training: Optional labels for ``X_training``.
            **fit_kwargs: Extra keyword arguments forwarded to ``ActiveLearner``.
        """
        # Forward by keyword. In ActiveLearner.__init__ the positional slot
        # after y_training is `bootstrap_init`, so passing `on_transformed`
        # positionally would silently toggle bootstrapping instead.
        super().__init__(
            estimator=estimator,
            query_strategy=query_strategy,
            X_training=X_training,
            y_training=y_training,
            on_transformed=on_transformed,
            **fit_kwargs
        )

 
# NOTE(review): `mlp` is not used anywhere in this cell -- the learner is
# restored from 'active_learner.joblib' just below. Kept in case another cell
# references it.
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)

# Restore the learner saved by an earlier run (the loop below re-saves it
# after every answered query).
deep_active_learner = load('active_learner.joblib')

# NOTE(review): per the modAL documentation, ActiveLearner.fit() with a
# scikit-learn estimator replaces the training data the learner has stored, so
# labels taught in earlier sessions are not carried over by this call --
# confirm that restarting from X_training / y_training is intended.
deep_active_learner.fit(X_training, y_training)

# Labels the annotator is allowed to enter.
VALID_LABELS = (0, 1, 2)

# query for labels
n_queries = 25
accuracy_scores = [deep_active_learner.score(evaluation_x, evaluation_y)]
for idx in range(n_queries):
    if len(X_pool) == 0:
        print("The pool is empty. Stopping.")
        break

    # Query for labels from the pool of unlabeled instances.
    # `query_idx` holds *positions* in the current X_pool, not index labels.
    query_idx, query_instance = deep_active_learner.query(X_pool)
    query_idx = np.atleast_1d(query_idx).astype(int).flatten()

    # embeddings_unlabeled is treated as row-aligned with X_pool (the
    # prediction cell relies on the same alignment when it stores
    # predict(X_pool) as a column). Rows are therefore addressed by position:
    # the frame keeps its original index labels after rows are removed, so a
    # label lookup would show the annotator the wrong text from the second
    # query on.
    if query_idx.max() >= len(embeddings_unlabeled):
        # Stop instead of `continue`: the pool is unchanged here, so the next
        # query would select the very same instance again.
        print(f"Index {query_idx} not found in embeddings_unlabeled. Stopping...")
        break

    print(embeddings_unlabeled.iloc[query_idx]['combined_text'])
    sys.stdout.flush()

    # Get the label from the human annotator; re-prompt until it is valid.
    while True:
        user_input = input("Enter the label (0, 1, 2): ")
        try:
            label = int(user_input)
        except ValueError:
            print("Invalid input. Please enter a numeric value.")
            continue
        if label in VALID_LABELS:
            break
        print("Label must be one of the following: 0, 1, 2")

    # Update Y_pool with the new label
    Y_pool[query_idx] = label

    # Supply labels for queried instances
    deep_active_learner.teach(X_pool[query_idx], Y_pool[query_idx])

    # Remove the labeled instance from the pool and, by the same positions,
    # from the text table so the three stay aligned.
    keep_rows = np.ones(len(embeddings_unlabeled), dtype=bool)
    keep_rows[query_idx] = False
    X_pool = np.delete(X_pool, query_idx, axis=0)
    Y_pool = np.delete(Y_pool, query_idx, axis=0)
    embeddings_unlabeled = embeddings_unlabeled.iloc[keep_rows]

    accuracy_scores.append(deep_active_learner.score(evaluation_x, evaluation_y))
    print(accuracy_scores)

    # save active learner
    dump(deep_active_learner, 'active_learner.joblib')

# Learning curve: one point per entry in accuracy_scores (the first entry is
# the score recorded before any query was answered).
query_counts = range(len(accuracy_scores))

fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title('Accuracy of the classifier during the active learning')
ax.plot(query_counts, accuracy_scores)
ax.scatter(query_counts, accuracy_scores)
ax.set_xlabel('number of queries')
ax.set_ylabel('accuracy')
plt.show()

# Persist the scores as a single-column CSV, without the row index.
accuracy_df = pd.DataFrame(accuracy_scores)
accuracy_df.to_csv('accuracies_MLP3.csv', index=False)


Predict labels for the remaining pool and export them to CSV

In [None]:
# Predict a label for every instance still in the pool and store it next to
# the matching text row (requires embeddings_unlabeled to be row-aligned with
# X_pool and of the same length).
predicted_labels = deep_active_learner.predict(X_pool)
embeddings_unlabeled['predicted_label'] = predicted_labels

# Keep only the rows predicted as class 1. The comparison yields a boolean
# mask; indexing with the raw integer column would be read as a list of column
# names and raise a KeyError.
label_of_interest = 1
filtered_df = embeddings_unlabeled[embeddings_unlabeled['predicted_label'] == label_of_interest]

for text, predicted in zip(filtered_df['combined_text'], filtered_df['predicted_label']):
    print(f"Text: {text}\nPredicted Label: {predicted}\n")

# Save the filtered DataFrame to a CSV file
filtered_df.to_csv('predictions_test.csv', columns=['combined_text', 'predicted_label'], index=False)


