In [None]:
import json
import random
import time

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from keras.layers import Embedding, Flatten, Dense, Conv1D, MaxPooling1D, SimpleRNN, LSTM, TextVectorization
from keras.losses import CategoricalCrossentropy
from keras.models import Sequential
from keras.utils import plot_model
from scipy import sparse
from scipy.stats import entropy
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

In [None]:
# Seed for Python's `random` module inside active_learning(); also passed as `random_state`
# to scikit-learn models built by Trainer.
random_seed = 41
# Run configuration and result accumulator. The whole dict is serialized to result_v1.json
# by write_result_file().
result = {
    "dataset": "go_emotions",  # dbpedia | imdb_reviews | tweet_eval | sst-2 | go_emotions | wine_reviews
    # Seed and number of examples kept for the shuffled training split.
    "train_seed": 43,
    "train_count": 10000,
    # Seed and number of examples kept for the shuffled test/validation split.
    "test_seed": 43,
    "test_count": 2000,
    "stopword_removal": True,  # True | False
    "embedding": "tfidf",  # glove | tfidf
    "model": "SVM",  # CNN | RNN | LSTM | SVM | LR | D-TREE | R-FOREST | NBC
    # Active-learning schedule: size of the initial random labeled set, labels added per step,
    # and number of steps after the seed round.
    "seed_size": 100,
    "batch_size": 100,
    "num_steps": 29,
    # NOTE(review): not read by any code visible in this notebook; presumably documents which
    # strategies were run - confirm.
    "al_methods": ["rs", "lf", "lc", "ms", "es"],
    # Filled by active_learning(), keyed by query-strategy name.
    "durations": {},
    "accuracies": {},
    "f1_scores": {}
}

In [None]:
import nltk

# Fetch the NLTK 'stopwords' resource, then keep the English word list in the module-level
# `stopwords` name that remove_stop_words() reads.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Print the list and its length for inspection.
print(stopwords)
print(len(stopwords))


# https://spotintelligence.com/2022/12/10/stop-words-removal
def remove_stop_words(sentence, additional_stopwords=None):
    """Return `sentence` with stop words removed.

    The sentence is split on whitespace. Every token found in the module-level `stopwords`
    list or in `additional_stopwords` is dropped, and the surviving tokens are joined back
    with single spaces. Matching is exact, so it is case- and punctuation-sensitive.

    Args:
        sentence: Text to filter.
        additional_stopwords: Optional iterable of extra tokens to drop (e.g. ``["@user"]``).

    Returns:
        The filtered sentence as one string.
    """
    # Build a single set per call so each token costs one hash lookup instead of a linear
    # scan through both lists.
    excluded = set(stopwords)
    if additional_stopwords is not None:
        excluded.update(additional_stopwords)
    return ' '.join(word for word in sentence.split() if word not in excluded)

In [None]:
# https://huggingface.co/datasets/DeveloperOats/DBPedia_Classes
def dataset_dbpedia():
    """Load a shuffled train/test subset of the DBPedia Classes dataset.

    Seeds, subset sizes and the stopword flag come from the module-level `result` dict.
    The `l1` column is used as the label.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are NumPy arrays; texts are a NumPy array,
        or a list of strings when stopword removal is enabled.
    """
    print("Loading DBPedia Dataset")
    splits = load_dataset("DeveloperOats/DBPedia_Classes")
    n_train, n_test = result["train_count"], result["test_count"]

    train_split = splits['train'].shuffle(seed=result["train_seed"])
    train_texts = np.array(train_split['text'][:n_train])
    train_labels = np.array(train_split['l1'][:n_train])

    test_split = splits['test'].shuffle(seed=result["test_seed"])
    test_texts = np.array(test_split['text'][:n_test])
    test_labels = np.array(test_split['l1'][:n_test])

    if not result["stopword_removal"]:
        return train_texts, train_labels, test_texts, test_labels

    print("Stopwords Removing")
    train_texts = [remove_stop_words(text) for text in train_texts]
    test_texts = [remove_stop_words(text) for text in test_texts]
    return train_texts, train_labels, test_texts, test_labels

In [None]:
# https://huggingface.co/datasets/stanfordnlp/imdb
def dataset_imdb_reviews():
    """Load a shuffled train/test subset of the IMDB reviews dataset.

    Seeds, subset sizes and the stopword flag come from the module-level `result` dict.
    ``<br />`` markup is stripped from every review before optional stopword removal.

    Returns:
        (train_x, train_y, test_x, test_y). Texts are lists of strings; labels are NumPy arrays.
    """
    print("Loading Imdb Reviews Dataset")
    dataset = load_dataset("stanfordnlp/imdb")
    imdb_train = dataset['train'].shuffle(seed=result["train_seed"])
    imdb_train_x = np.array(imdb_train['text'][:result["train_count"]])
    imdb_train_y = np.array(imdb_train['label'][:result["train_count"]])
    imdb_test = dataset['test'].shuffle(seed=result["test_seed"])
    imdb_test_x = np.array(imdb_test['text'][:result["test_count"]])
    imdb_test_y = np.array(imdb_test['label'][:result["test_count"]])

    # Replace the HTML line-break markup with a space rather than nothing, so the words on
    # either side of it are not glued together into one bogus token (e.g. "it<br />The" must
    # not become "itThe").
    imdb_train_x = [i.replace("<br />", " ") for i in imdb_train_x]
    imdb_test_x = [i.replace("<br />", " ") for i in imdb_test_x]

    # Remove stopwords
    if result["stopword_removal"]:
        print("Stopwords Removing")
        imdb_train_x = [remove_stop_words(i) for i in imdb_train_x]
        imdb_test_x = [remove_stop_words(i) for i in imdb_test_x]
    return imdb_train_x, imdb_train_y, imdb_test_x, imdb_test_y

In [None]:
# https://huggingface.co/datasets/cardiffnlp/tweet_eval
def dataset_tweet_eval():
    """Load a shuffled train/test subset of the TweetEval "emotion" configuration.

    Seeds, subset sizes and the stopword flag come from the module-level `result` dict.
    When stopword removal is on, the ``@user`` token is dropped as well.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are NumPy arrays; texts are a NumPy array,
        or a list of strings when stopword removal is enabled.
    """
    print("Loading Tweet Eval Dataset")
    splits = load_dataset("tweet_eval", "emotion")
    n_train, n_test = result["train_count"], result["test_count"]

    train_split = splits['train'].shuffle(seed=result["train_seed"])
    train_texts = np.array(train_split['text'][:n_train])
    train_labels = np.array(train_split['label'][:n_train])

    test_split = splits['test'].shuffle(seed=result["test_seed"])
    test_texts = np.array(test_split['text'][:n_test])
    test_labels = np.array(test_split['label'][:n_test])

    if not result["stopword_removal"]:
        return train_texts, train_labels, test_texts, test_labels

    print("Stopwords Removing")
    train_texts = [remove_stop_words(text, ["@user"]) for text in train_texts]
    test_texts = [remove_stop_words(text, ["@user"]) for text in test_texts]
    return train_texts, train_labels, test_texts, test_labels

In [None]:
# https://huggingface.co/datasets/stanfordnlp/sst2
def dataset_sst2():
    """Load a shuffled subset of SST-2, using its `validation` split as the test set.

    Seeds, subset sizes and the stopword flag come from the module-level `result` dict.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are NumPy arrays; texts are a NumPy array,
        or a list of strings when stopword removal is enabled.
    """
    print("Loading SST-2 Dataset")
    splits = load_dataset("stanfordnlp/sst2")
    n_train, n_test = result["train_count"], result["test_count"]

    train_split = splits['train'].shuffle(seed=result["train_seed"])
    train_texts = np.array(train_split['sentence'][:n_train])
    train_labels = np.array(train_split['label'][:n_train])

    held_out_split = splits['validation'].shuffle(seed=result["test_seed"])
    test_texts = np.array(held_out_split['sentence'][:n_test])
    test_labels = np.array(held_out_split['label'][:n_test])

    if not result["stopword_removal"]:
        return train_texts, train_labels, test_texts, test_labels

    print("Stopwords Removing")
    train_texts = [remove_stop_words(text) for text in train_texts]
    test_texts = [remove_stop_words(text) for text in test_texts]
    return train_texts, train_labels, test_texts, test_labels

In [None]:
# google-research-datasets/go_emotions
def dataset_go_emotions():
    """Load a shuffled train/test subset of the single-label GoEmotions examples.

    The "simplified" configuration is loaded and filtered to rows that carry exactly one
    label, so each example maps to one class id. Seeds, subset sizes and the stopword flag
    come from the module-level `result` dict.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are NumPy arrays of class ids; texts are a
        NumPy array, or a list of strings when stopword removal is enabled.
    """
    print("Loading Go Emotions Dataset")
    dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
    dataset = dataset.filter(lambda x: len(x['labels']) == 1)
    go_emotions_train = dataset['train'].shuffle(seed=result["train_seed"])
    go_emotions_train_x = np.array(go_emotions_train['text'][:result["train_count"]])
    # Slice first, then unwrap the single label: only the rows that are kept get iterated,
    # not the whole column.
    go_emotions_train_y = np.array([i[0] for i in go_emotions_train['labels'][:result["train_count"]]])
    go_emotions_test = dataset['test'].shuffle(seed=result["test_seed"])
    go_emotions_test_x = np.array(go_emotions_test['text'][:result["test_count"]])
    go_emotions_test_y = np.array([i[0] for i in go_emotions_test['labels'][:result["test_count"]]])
    # Remove stopwords
    if result["stopword_removal"]:
        print("Stopwords Removing")
        go_emotions_train_x = [remove_stop_words(i) for i in go_emotions_train_x]
        go_emotions_test_x = [remove_stop_words(i) for i in go_emotions_test_x]
    return go_emotions_train_x, go_emotions_train_y, go_emotions_test_x, go_emotions_test_y

In [None]:
# https://huggingface.co/datasets/james-burton/wine_reviews
def dataset_wine_reviews():
    """Load a shuffled train/test subset of the Wine Reviews dataset.

    The `description` column is the text and the `points` column is used as the label.
    Seeds, subset sizes and the stopword flag come from the module-level `result` dict.

    Returns:
        (train_x, train_y, test_x, test_y). Labels are NumPy arrays; texts are a NumPy array,
        or a list of strings when stopword removal is enabled.
    """
    print("Loading Wine Reviews Dataset")
    splits = load_dataset("james-burton/wine_reviews")
    n_train, n_test = result["train_count"], result["test_count"]

    train_split = splits['train'].shuffle(seed=result["train_seed"])
    train_texts = np.array(train_split['description'][:n_train])
    train_labels = np.array(train_split['points'][:n_train])

    test_split = splits['test'].shuffle(seed=result["test_seed"])
    test_texts = np.array(test_split['description'][:n_test])
    test_labels = np.array(test_split['points'][:n_test])

    if not result["stopword_removal"]:
        return train_texts, train_labels, test_texts, test_labels

    print("Stopwords Removing")
    train_texts = [remove_stop_words(text) for text in train_texts]
    test_texts = [remove_stop_words(text) for text in test_texts]
    return train_texts, train_labels, test_texts, test_labels

In [None]:
# Lookup table from the configured dataset name to its loader. The chosen loader returns the
# train texts/labels and the held-out texts/labels used by every later cell.
DATASET_LOADERS = {
    "dbpedia": dataset_dbpedia,
    "imdb_reviews": dataset_imdb_reviews,
    "tweet_eval": dataset_tweet_eval,
    "sst-2": dataset_sst2,
    "go_emotions": dataset_go_emotions,
    "wine_reviews": dataset_wine_reviews,
}

if result["dataset"] not in DATASET_LOADERS:
    raise Exception("Unknown Database")
train_x, train_y, val_x, val_y = DATASET_LOADERS[result["dataset"]]()

print("Shape TrainX:%s , TrainY:%s" % (np.shape(train_x), np.shape(train_y)))
print("Shape ValX:%s , ValY:%s" % (np.shape(val_x), np.shape(val_y)))

In [None]:
def embedding_layer_glove(train_data, val_data):
    """Vectorize the texts into token-id sequences and build a frozen GloVe embedding layer.

    Side effect: sets ``result["embedding"]`` to ``"glove"``.

    Args:
        train_data: Training texts; the vocabulary is adapted on these only.
        val_data: Validation texts, vectorized with the vocabulary learned from `train_data`.

    Returns:
        (train_x_seq, val_x_seq, embedding_layer): the two token-id arrays (each row has
        `sequence_length` entries) and a non-trainable Keras `Embedding` layer initialised from
        ``./glove.6B.100d.txt``. Vocabulary words missing from GloVe keep an all-zero vector.
    """
    print("Loading GloVe Embedding")
    result["embedding"] = "glove"

    embedding_dim = 100  # Dimension of each embedding vector (matches the 100d GloVe file)
    sequence_length = 100  # Fixed number of tokens per vectorized sentence

    # Text Vectorization process for both train and validation data
    text_vectorization = TextVectorization(output_sequence_length=sequence_length)
    text_vectorization.adapt(train_data)
    vocabulary = text_vectorization.get_vocabulary()
    word_index = dict(zip(vocabulary, range(len(vocabulary))))
    vocabulary_size = len(word_index)
    print("Vocabulary Size: ", vocabulary_size)
    train_x_seq = text_vectorization(train_data).numpy()
    val_x_seq = text_vectorization(val_data).numpy()

    # Reading All Embedding Vectors. The context manager closes the file even when a line
    # fails to parse.
    embedding_index = {}
    with open('./glove.6B.100d.txt', encoding='utf8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:  # tolerate blank lines instead of failing on values[0]
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('GloVe has %s word vectors ' % len(embedding_index))

    # Mapping Text Vectorization to Embedding. The matrix has one more row than the
    # vocabulary has indices; that extra row stays all-zero.
    embedding_size = vocabulary_size + 1
    embedding_matrix = np.zeros((embedding_size, embedding_dim))
    for word, idx in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return train_x_seq, val_x_seq, Embedding(embedding_size, embedding_dim, input_length=sequence_length,
                                             weights=[embedding_matrix], trainable=False)

In [None]:
# Turn the raw texts into model inputs according to the configured embedding.
if result["embedding"] not in ("glove", "tfidf"):
    raise Exception("Unknown Embedding")

if result["embedding"] == "tfidf":
    # Fit the TF-IDF vocabulary on the training texts only, then densify both matrices.
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    train_X = vectorizer.fit_transform(train_x).toarray()
    val_X = vectorizer.transform(val_x).toarray()
else:
    # GloVe path: token-id sequences plus the shared frozen embedding layer.
    train_X, val_X, embedding_layer = embedding_layer_glove(train_x, val_x)

In [None]:
# One-hot encode the labels. The encoder is fitted on the training labels only, and the same
# mapping is then applied to the validation labels.
train_label_column = train_y.reshape(-1, 1)
val_label_column = val_y.reshape(-1, 1)
one_hot_encoder = OneHotEncoder().fit(train_label_column)
train_Y = one_hot_encoder.transform(train_label_column).toarray()
val_Y = one_hot_encoder.transform(val_label_column).toarray()

In [None]:
class Trainer:
    """Create, fit and evaluate the classifier named by ``result["model"]``.

    A fresh model is built on every `train_model` call. Evaluation always uses the
    module-level validation data (`val_X`, `val_Y`, `val_y`).

    Args:
        class_count: Number of output classes; sizes the softmax layer of the Keras models.
    """

    # Model names served by the scikit-learn code path; every other name goes through Keras.
    _SKLEARN_MODELS = ("SVM", "LR", "D-TREE", "R-FOREST", "NBC")

    def __init__(self, class_count):
        self.class_count = class_count

    def __create_model(self):
        """Instantiate ``self.model`` for the configured model name.

        Raises:
            Exception: if ``result["model"]`` is not one of the supported names.
        """
        builders = {
            "CNN": self.__create_conv_model,
            "RNN": self.__create_rnn_model,
            "LSTM": self.__create_lstm_model,
            "SVM": self.__create_svm_model,
            "LR": self.__create_logistic_regression_model,
            "D-TREE": self.__create_decision_tree_model,
            "R-FOREST": self.__create_random_forest_model,
            "NBC": self.__create_naive_bayes_model,
        }
        builder = builders.get(result["model"])
        if builder is None:
            raise Exception("Unknown Model")
        self.model = builder()

    # The three Keras builders below use the module-level `embedding_layer`, which is only
    # assigned when the GloVe embedding was selected.

    def __create_conv_model(self):
        """Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dense(20) -> softmax."""
        print("Creating CNN Model")
        model = Sequential()
        model.add(embedding_layer)
        model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(20, activation="sigmoid"))
        model.add(Dense(self.class_count, activation="softmax"))
        model.compile(optimizer="Adam", loss=CategoricalCrossentropy(), metrics=['accuracy'])
        return model

    def __create_rnn_model(self):
        """Embedding -> SimpleRNN(10) -> softmax."""
        print("Creating RNN Model")
        model = Sequential()
        model.add(embedding_layer)
        model.add(SimpleRNN(units=10))
        model.add(Dense(self.class_count, activation="softmax"))
        model.compile(optimizer="Adam", loss=CategoricalCrossentropy(), metrics=['accuracy'])
        return model

    def __create_lstm_model(self):
        """Embedding -> LSTM(10) -> softmax."""
        print("Creating LSTM Model")
        model = Sequential()
        model.add(embedding_layer)
        model.add(LSTM(units=10))
        model.add(Dense(self.class_count, activation="softmax"))
        model.compile(optimizer="Adam", loss=CategoricalCrossentropy(), metrics=['accuracy'])
        return model

    def __create_svm_model(self):
        """Linear-kernel SVC with probability estimates enabled."""
        print("Creating SVM Model")
        # probability=True shuffles data internally to calibrate the probabilities. Pin the
        # seed so the probabilities that drive uncertainty sampling are reproducible, as is
        # already done for the tree and forest models.
        model = svm.SVC(kernel='linear', probability=True, random_state=random_seed)
        return model

    def __create_logistic_regression_model(self):
        """Logistic regression with scikit-learn defaults."""
        print("Creating LR Model")
        model = LogisticRegression()
        return model

    def __create_decision_tree_model(self):
        """Decision tree seeded with `random_seed`."""
        print("Creating D-TREE Model")
        model = DecisionTreeClassifier(random_state=random_seed)
        return model

    def __create_random_forest_model(self):
        """Random forest seeded with `random_seed`."""
        print("Creating R-FOREST Model")
        model = RandomForestClassifier(random_state=random_seed)
        return model

    def __create_naive_bayes_model(self):
        """Multinomial naive Bayes with scikit-learn defaults."""
        print("Creating NBC Model")
        model = MultinomialNB()
        return model

    def train_model(self, X, Y, pool=None):
        """Build a fresh model, fit it on (X, Y) and evaluate it on the validation data.

        Args:
            X: Training features.
            Y: One-hot encoded training labels.
            pool: Optional features to score after training.

        Returns:
            (val_acc, val_f1, pool_predictions): validation accuracy, weighted validation F1
            and the per-class probabilities for `pool` (None when `pool` is None).
        """
        self.__create_model()
        if result["model"] in self._SKLEARN_MODELS:
            return self.train_sklearn(X, Y, pool)
        return self.train_keras(X, Y, pool)

    def train_keras(self, X, Y, pool=None):
        """Fit the Keras model for 20 epochs; see `train_model` for arguments and return value."""
        self.model.fit(X, Y, epochs=20, verbose=0)

        # Compare class indices: argmax over the one-hot targets and over the softmax output.
        val_y_classes = np.argmax(val_Y, axis=1)
        val_predict_classes = np.argmax(self.model.predict(val_X), axis=1)
        val_acc = accuracy_score(val_y_classes, val_predict_classes)
        val_f1 = f1_score(val_y_classes, val_predict_classes, average='weighted')
        pool_predictions = self.model.predict(pool) if pool is not None else None
        return val_acc, val_f1, pool_predictions

    def train_sklearn(self, X, Y, pool=None):
        """Fit the scikit-learn model; see `train_model` for arguments and return value.

        The one-hot `Y` is mapped back to the original labels with the module-level
        `one_hot_encoder`, so predictions are compared against the raw `val_y` labels.
        """
        sparse_X = sparse.csr_matrix(X)
        y_classes = one_hot_encoder.inverse_transform(Y).ravel()
        self.model.fit(sparse_X, y_classes)

        sparse_val_X = sparse.csr_matrix(val_X)
        val_predict = self.model.predict(sparse_val_X)
        val_acc = accuracy_score(val_y, val_predict)
        val_f1 = f1_score(val_y, val_predict, average='weighted')
        pool_predictions = self.model.predict_proba(pool) if pool is not None else None
        return val_acc, val_f1, pool_predictions

    def plot_model(self):
        """Write the Keras architecture diagram to ``model.png``; no-op for scikit-learn models."""
        if result["model"] in self._SKLEARN_MODELS:
            return
        # Inside a method body this name resolves to the module-level keras.utils.plot_model.
        plot_model(
            self.model,
            to_file="model.png",
            show_shapes=True,
            show_dtype=True,
            show_layer_names=True,
            rankdir="TB",
            expand_nested=False,
            dpi=96,
            layer_range=None,
            show_layer_activations=False,
            show_trainable=False,
        )

In [None]:
class Dataset:
    """Pool of examples plus a boolean mask recording which ones have been labeled.

    Each sampling method marks up to `batch_count` additional pool items as labeled and then
    records the newly labeled indices as one entry of `selections`.

    Args:
        X: Feature matrix of the whole pool (indexable with a boolean mask; rows are arrays).
        Y: Targets aligned with `X`.
    """

    def __init__(self, X, Y):
        self._X = X
        self._Y = Y
        self._labeled = np.zeros(len(self._X), dtype=bool)
        self._selections = []
        self._prev_predictions = np.zeros(np.shape(Y))
        self._long_first_sorted = None  # cache for long_first(); the pool never changes

    @property
    def pool(self):
        """All examples, labeled or not."""
        return self._X

    @property
    def labeled(self):
        """Boolean mask over the pool; True where the example has been labeled."""
        return self._labeled

    @property
    def X(self):
        """Features of the labeled examples."""
        return self._X[self._labeled]

    @property
    def Y(self):
        """Targets of the labeled examples."""
        return self._Y[self._labeled]

    @property
    def selections(self):
        """One list of newly labeled pool indices per sampling call, in call order."""
        return self._selections

    def random_sampling(self, batch_count):
        """Label `batch_count` unlabeled items drawn with Python's `random` module.

        The draw sequence is the same rejection loop as before, so a given seed still yields
        the same picks. The request is capped at the number of unlabeled items: asking for
        more than remain used to loop forever.
        """
        not_labeled = np.where(~self._labeled)[0]
        target = min(batch_count, len(not_labeled))
        new_labels = []
        chosen = set()  # O(1) duplicate check; new_labels keeps the draw order
        while len(new_labels) < target:
            candidate = not_labeled[random.randrange(0, len(not_labeled))]
            if candidate not in chosen:
                chosen.add(candidate)
                new_labels.append(candidate)
        if new_labels:
            self._labeled[new_labels] = True
        self._iteration_selections()

    def long_first(self, batch_count):
        """Label the unlabeled items with the most positive entries in their feature row."""
        # Tokens are stored as numbers in the data, so counting the entries greater than
        # zero gives the token count. The pool never changes, so the ranking is computed once.
        # np.count_nonzero replaces a Python-level sum over every feature of every row.
        if self._long_first_sorted is None:
            token_counts = [(np.count_nonzero(row > 0), i) for i, row in enumerate(self._X)]
            self._long_first_sorted = sorted(token_counts, reverse=True)
        self._label_batch(self._long_first_sorted, batch_count)
        self._iteration_selections()

    # Least Confidence
    def lc_sampling(self, batch_count, predictions):
        """Label the items whose most likely class has the lowest probability."""
        # The probability of the most likely class is subtracted from 1.
        lc = sorted([(1 - np.max(p), i) for i, p in enumerate(predictions)], reverse=True)
        self._label_batch(lc, batch_count)
        self._iteration_selections()

    # Margin
    def margin_sampling(self, batch_count, predictions):
        """Label the items with the smallest gap between their two most likely classes."""
        # The difference between the two highest class probabilities is taken.
        ms = sorted([(self._margin(p), i) for i, p in enumerate(predictions)])
        self._label_batch(ms, batch_count)
        self._iteration_selections()

    # Entropy
    def entropy_sampling(self, batch_count, predictions):
        """Label the items whose predicted distribution has the highest entropy."""
        # The entropy of the probability values is computed.
        es = sorted([(entropy(p), i) for i, p in enumerate(predictions)], reverse=True)
        self._label_batch(es, batch_count)
        self._iteration_selections()

    @staticmethod
    def _margin(probabilities):
        """Top probability minus the runner-up (runner-up is 0 when there is only one class)."""
        ordered = np.sort(probabilities)
        runner_up = ordered[-2] if len(ordered) > 1 else 0.0
        return ordered[-1] - runner_up

    def _label_batch(self, sorted_candidates, batch_count):
        """Label the first `batch_count` not-yet-labeled items of `sorted_candidates`.

        `sorted_candidates` is an iterable of ``(score, pool_index)`` pairs in priority order.
        Fewer items are labeled if the candidates run out. The budget is checked before
        labeling, so a non-positive `batch_count` labels nothing.
        """
        remaining = batch_count
        for _, j in sorted_candidates:
            if remaining <= 0:
                break
            if not self._labeled[j]:  # if not already labeled
                self._labeled[j] = True
                remaining -= 1

    def _iteration_selections(self):
        """Append the indices labeled since the previous call as a new `selections` entry."""
        # One set of everything recorded so far replaces a scan of every earlier list for
        # every labeled index.
        already_recorded = {idx for batch in self._selections for idx in batch}
        iteration_selections = [idx for idx in self.labeled.nonzero()[0] if idx not in already_recorded]
        self._selections.append(iteration_selections)

In [None]:
def write_result_file():
    """Write the module-level `result` dict to ``result_v1.json``, replacing any previous file.

    Raises:
        TypeError: if `result` contains a value that json cannot encode; the existing file is
            left untouched in that case.
    """
    # Serialize before opening: mode 'w' truncates the file immediately, so encoding straight
    # into it would wipe earlier results whenever a value turns out not to be serializable.
    serialized = json.dumps(result)
    with open('result_v1.json', 'w') as file:
        file.write(serialized)


# Persist the run configuration before any experiment has produced results.
write_result_file()

In [None]:
def active_learning(query_strategy, seed_size, batch_size, num_steps):
    """Run one pool-based active-learning experiment and record its metrics.

    A random seed set is labeled first and a model is trained on it. Each of the `num_steps`
    rounds then labels `batch_size` more pool items with the chosen strategy and trains a
    fresh model on everything labeled so far. Accuracies, F1 scores and the elapsed time are
    stored in the module-level `result` dict under `query_strategy` and written to disk.

    Args:
        query_strategy: 'lc' for Least confidence sampling,
                        'ms' for Margin sampling,
                        'es' for Entropy sampling,
                        'rs' for Random sampling,
                        'lf' for Long First sampling.
        seed_size: Number of items labeled at random before the first training.
        batch_size: Number of items labeled per round.
        num_steps: Number of rounds after the seed round.

    Returns:
        (accuracies, selections): validation accuracy after the seed round and after each
        round, and the per-round lists of newly labeled pool indices.

    Raises:
        AssertionError: if `query_strategy` is not one of the names above.
    """
    # "ms2" used to pass this check although no branch handles it; such a run labeled nothing
    # new and just retrained on the seed set at every step.
    assert query_strategy in ["lc", "ms", "es", "rs", "lf"], "Unknown query strategy"
    random.seed(random_seed)  # same seed set for every strategy
    start_time = time.time()
    accuracies = []
    f1_scores = []
    class_count = len(np.unique(train_y))
    t = Trainer(class_count)
    d = Dataset(train_X, train_Y)
    d.random_sampling(seed_size)
    # Predictions cover the whole pool; the samplers skip items that are already labeled.
    acc, f1, predictions = t.train_model(d.X, d.Y, d.pool)
    accuracies.append(acc)
    f1_scores.append(f1)
    t.plot_model()
    for _ in tqdm(range(0, num_steps)):
        if query_strategy == "lc":
            d.lc_sampling(batch_size, predictions)
        elif query_strategy == "ms":
            d.margin_sampling(batch_size, predictions)
        elif query_strategy == "es":
            d.entropy_sampling(batch_size, predictions)
        elif query_strategy == "rs":
            d.random_sampling(batch_size)
        elif query_strategy == "lf":
            d.long_first(batch_size)

        acc, f1, predictions = t.train_model(d.X, d.Y, d.pool)
        print("Accuracy: %f" % acc)
        accuracies.append(acc)
        f1_scores.append(f1)

    end_time = time.time()
    result["accuracies"][query_strategy] = accuracies
    result["f1_scores"][query_strategy] = f1_scores
    result["durations"][query_strategy] = end_time - start_time
    write_result_file()
    return accuracies, d.selections

In [None]:
# Pull the active-learning schedule out of the run configuration.
seed_size, batch_size, num_steps = (
    result[key] for key in ("seed_size", "batch_size", "num_steps")
)

In [None]:
# Reference run: train the configured model on the entire training set (no pool is passed, so
# the third return value is None) and report its validation accuracy and weighted F1.
full_class_count = len(np.unique(train_y))
full_trainer = Trainer(full_class_count)
full_acc, full_f1, ful_prob = full_trainer.train_model(train_X, train_Y)
print("Full Accuracy: ", full_acc)
print("Full F1 Score: ", full_f1)

In [None]:
# Random sampling run ("rs").
random_accuracies, random_selections = active_learning("rs", seed_size, batch_size, num_steps)

In [None]:
# Least confidence sampling run ("lc").
lc_accuracies, lc_selections = active_learning("lc", seed_size, batch_size, num_steps)

In [None]:
# Margin sampling run ("ms").
ms_accuracies, ms_selections = active_learning("ms", seed_size, batch_size, num_steps)

In [None]:
# Entropy sampling run ("es").
es_accuracies, es_selections = active_learning("es", seed_size, batch_size, num_steps)

In [None]:
# Long First sampling run ("lf").
longfirst_accuracies, longfirst_selections = active_learning("lf", seed_size, batch_size, num_steps)

In [None]:
# Accuracy-vs-labeled-data curves for every query strategy.
# Shared x-axis: labeled-set size after the seed round and after each active-learning step.
labeled_counts = np.arange(seed_size, seed_size + (num_steps + 1) * batch_size, batch_size)
accuracy_curves = [
    (random_accuracies, "b", "Random Sampling"),
    (lc_accuracies, "g", "Least Confidence Sampling"),
    (ms_accuracies, "r", "Margin Sampling"),
    (es_accuracies, "y", "Entropy Sampling"),
    (longfirst_accuracies, "#000000", "Long First Sampling"),
]

plt.figure(figsize=(10, 8))
for curve, curve_color, curve_label in accuracy_curves:
    plt.plot(labeled_counts, curve, color=curve_color, label=curve_label)
plt.legend(loc="lower right")
# Name the dataset that was actually used; the title was hard-coded to DBPedia before.
plt.title("Active Learning on %s Dataset" % result["dataset"])
plt.ylabel('Accuracy')
plt.xlabel('Labeled data')
plt.grid()