In [None]:
# The PyPI name `sklearn` is a deprecated alias; the real distribution is `scikit-learn`.
# `%pip` installs into the environment of the running kernel.
%pip install transformers numpy torch scikit-learn

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 7.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 59.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.3 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.8 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from pandas.core.base import NoNewAttributesMixin

class DataLoader():
    """Load the train / validation / test CSV splits and group rows by class.

    Every CSV is read with ``latin-1`` encoding and is expected to contain a
    ``class`` column (used by :meth:`GetDFListsPerClass`).

    Parameters
    ----------
    train_path : str
        Path of the training CSV (required).
    valid_path, test_path : str or None
        Paths of the optional validation / test CSVs. When a path is omitted
        the corresponding split is ``None`` and its per-class list is empty.
    """

    def __init__(self, train_path, valid_path = None, test_path = None):
        self.train_path = train_path
        self.valid_path = valid_path
        self.test_path = test_path

        # Pre-declare the splits so optional ones are ``None`` instead of
        # missing attributes when their path was not supplied.
        self.train_raw = None
        self.valid = None
        self.test = None

    @staticmethod
    def _read_csv(path):
        """Read one split, or return ``None`` when no path was given."""
        if path is None:
            return None
        return pd.read_csv(path, encoding='latin-1')

    @staticmethod
    def _split_by_class(frame):
        """Return one sub-frame per distinct ``class`` value (``[]`` for ``None``)."""
        if frame is None:
            return []
        return [frame.loc[frame['class'] == val, :] for val in frame['class'].unique()]

    def LoadData(self):
        """Read all configured CSVs.

        Returns
        -------
        tuple
            ``(train, valid, test)``; ``valid`` / ``test`` are ``None`` when
            their path was not provided.
        """
        self.train_raw = self._read_csv(self.train_path)
        self.valid = self._read_csv(self.valid_path)
        self.test = self._read_csv(self.test_path)

        return self.train_raw, self.valid, self.test

    def PrintDataShapes(self):
        """Print the shape of every split (``None`` for a split that is not loaded)."""
        def shape_of(frame):
            return None if frame is None else frame.shape

        print("Shape of training data:", shape_of(self.train_raw))
        print("Shape of validaton data:", shape_of(self.valid))
        print("Shape of test data:", shape_of(self.test))

    def GetDFListsPerClass(self):
        """Split every loaded frame into a list with one frame per class.

        Returns
        -------
        tuple
            ``(train_dfs, valid_dfs, test_dfs)``; the list of a split that is
            not loaded is empty.
        """
        self.train_dfs = self._split_by_class(self.train_raw)
        self.valid_dfs = self._split_by_class(self.valid)
        self.test_dfs = self._split_by_class(self.test)

        return self.train_dfs, self.valid_dfs, self.test_dfs

In [None]:

import pandas as pd
from pandas.core.base import NoNewAttributesMixin
from sklearn.utils import shuffle
import random
import math
import shutil


class IncrementalDataOrganizer():
    """Build class-incremental training / validation batches.

    The organizer hands out classes a few at a time (in an order fixed by
    ``train_strategy``) and assembles, for each step, a training frame and a
    validation frame restricted to the classes seen so far.

    Parameters
    ----------
    train, valid, test : pandas.DataFrame
        Full splits; each must contain a ``class`` column.
    train_dfs, valid_dfs, test_dfs : list of pandas.DataFrame
        The same splits, pre-split into one frame per class.
    train_strategy : {"random", "large_first", "small_first"}
        ``"random"`` draws new classes at random; ``"large_first"`` /
        ``"small_first"`` walk the classes by descending / ascending number
        of training rows.

    Raises
    ------
    ValueError
        If ``train_strategy`` is not one of the supported values.
    """

    _STRATEGIES = ("random", "large_first", "small_first")

    def __init__(self, train, valid, test, train_dfs, valid_dfs, test_dfs, train_strategy = "random"):

        if train_strategy not in self._STRATEGIES:
            raise ValueError(
                "train_strategy must be one of %r, got %r" % (self._STRATEGIES, train_strategy)
            )

        self.train = train
        self.valid = valid
        self.test = test

        self.train_dfs = train_dfs
        self.valid_dfs = valid_dfs
        self.test_dfs = test_dfs
        self.train_strategy = train_strategy

        self.all_classes = self.train["class"].unique().tolist()

        if self.train_strategy == "random":
            # Copy so the work list never aliases ``all_classes``.
            self.remaining_classes = list(self.all_classes)
        else:
            # value_counts() orders classes from most to least frequent.
            by_size = self.train["class"].value_counts().index.tolist()
            if self.train_strategy == "small_first":
                by_size = by_size[::-1]
            self.remaining_classes = by_size

        print("=============== All Class List ===============")
        print(self.remaining_classes)
        print("=============== All Class List ===============")

        self.classes_already_in_training = []
        self.record_indexes_already_in_training = []
        self.weak_classes = []
        self.strong_classes = []
        self.normal_classes = []

    @staticmethod
    def _frames_for_classes(frames, classes):
        """Keep the per-class frames whose (single) class is in ``classes``."""
        wanted = set(classes)
        return [df for df in frames if len(df) > 0 and df['class'].iloc[0] in wanted]

    def select_new_training_classes(self, num_classes):
        """Take up to ``num_classes`` not-yet-used classes and return their rows.

        Returns
        -------
        tuple
            ``(selected_classes, per_class_frames, shuffled_rows)``.

        Raises
        ------
        ValueError
            If ``num_classes`` is not positive or no classes remain.
        """
        if num_classes < 1:
            raise ValueError("num_classes must be a positive integer, got %r" % (num_classes,))
        if not self.remaining_classes:
            raise ValueError("no remaining classes to select from")

        if len(self.remaining_classes) < num_classes:
            # Fewer classes left than requested: take them all.
            self.num_classes = len(self.remaining_classes)
            self.selected_classes = list(self.remaining_classes)
        elif self.train_strategy == "random":
            self.selected_classes = random.sample(self.remaining_classes, num_classes)
        else:
            # remaining_classes is already ordered for large_first / small_first.
            self.selected_classes = self.remaining_classes[:num_classes]

        self.filtered_dfs = self._frames_for_classes(self.train_dfs, self.selected_classes)

        # Keep the shuffled frame: the original discarded the shuffle result,
        # so the returned rows stayed grouped by class.
        self.all_record_df = shuffle(pd.concat(self.filtered_dfs))

        self.classes_already_in_training = self.classes_already_in_training + self.selected_classes

        used = set(self.classes_already_in_training)
        self.diff = [c for c in self.all_classes if c not in used]
        # Filtering (rather than recomputing) preserves the strategy ordering.
        self.remaining_classes = [c for c in self.remaining_classes if c not in used]

        return self.selected_classes, self.filtered_dfs, self.all_record_df

    def select_old_training_classes(self,class_list, num_prev_records, ratio = None):
        """Sample up to ``num_prev_records`` rows from each class in ``class_list``.

        ``ratio`` is accepted for interface compatibility and is not used.

        Returns
        -------
        tuple
            ``(per_class_frames, shuffled_sampled_rows)``. The second item is
            an empty frame with the training columns when nothing matches.
        """
        self.old_filtered_dfs = self._frames_for_classes(self.train_dfs, class_list)

        self.partial_dfs = [
            self.get_partial_data(df, num_prev_records) for df in self.old_filtered_dfs
        ]

        if self.partial_dfs:
            self.all__old_record_df = shuffle(pd.concat(self.partial_dfs))
        else:
            # pd.concat([]) raises; hand back an empty frame instead.
            self.all__old_record_df = self.train.iloc[0:0]

        return self.old_filtered_dfs, self.all__old_record_df

    def select_validation_classes(self, records = None, ratio = None):
        """Return the validation rows of every class already used for training.

        ``records`` and ``ratio`` are accepted for interface compatibility and
        are not used.

        Returns
        -------
        tuple
            ``(classes, per_class_frames, shuffled_rows)``. The rows are an
            empty frame with the validation columns when no trained class is
            present in the validation split.
        """
        self.all_valid_classes = self.valid["class"].unique().tolist()

        self.selected_val_classes  = list(set(self.all_valid_classes) & set(self.classes_already_in_training))

        self.filtered__val_dfs = self._frames_for_classes(self.valid_dfs, self.selected_val_classes)

        if self.filtered__val_dfs:
            self.all_val_record_df = shuffle(pd.concat(self.filtered__val_dfs))
        else:
            # pd.concat([]) raises; hand back an empty frame instead.
            self.all_val_record_df = self.valid.iloc[0:0]

        return self.selected_val_classes, self.filtered__val_dfs, self.all_val_record_df

    def get_partial_data(self, dataset, num_prev_records):
        """Return ``dataset`` itself if it has fewer than ``num_prev_records``
        rows, otherwise a random sample of exactly that many rows."""
        if len(dataset) < num_prev_records:
            self.partial_dataset = dataset
        else:
            self.partial_dataset = dataset.sample(n = num_prev_records)

        return self.partial_dataset

    def GetDataBatches(self, num_classes, num_prev_records, num_base_classes):
        """Build incremental batches that replay a sample of old classes.

        The first batch holds ``num_base_classes`` classes. Every following
        batch holds up to ``num_classes`` new classes plus at most
        ``num_prev_records`` rows from each previously seen class. Batches
        are produced until no class remains.

        Returns
        -------
        tuple
            ``(training_frames, validation_frames)``, index-aligned lists.
        """
        self.number_of_classes = num_classes
        self.num_base_classes = num_base_classes

        base_classes, base_dfs, base_records = self.select_new_training_classes(self.num_base_classes)
        base_classes_val, base_dfs_val, base_records_val = self.select_validation_classes()

        self.training_dfs = [base_records]
        self.Validation_dfs = [base_records_val]

        # Iterate until every class is consumed instead of deriving the batch
        # count from a hard-coded total of 47 classes.
        while self.remaining_classes:
            self.new_selected_classes, self.filtered_train_dfs, self.all_filt_train_records_df = self.select_new_training_classes(num_classes)
            self.old_classes = list(set(self.classes_already_in_training) - set(self.new_selected_classes))
            self.filtered_old_train_dfs, self.all_old_filt_train_records_df = self.select_old_training_classes(self.old_classes, num_prev_records)

            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            self.all_filt_train_records_df = shuffle(
                pd.concat([self.all_filt_train_records_df, self.all_old_filt_train_records_df])
            )

            self.training_dfs.append(self.all_filt_train_records_df)

            self.classes_for_validation = self.new_selected_classes + self.old_classes
            self.base_classes_val, self.base_dfs_val, self.base_records_val = self.select_validation_classes(self.classes_for_validation)
            self.Validation_dfs.append(self.base_records_val)

            print("New Training Classes:" + str(self.new_selected_classes))

            print("Unique Classes in New train  Dataframe:" + str(self.all_filt_train_records_df["class"].nunique()))
            print("Unique Classes in New val Dataframe:" + str(self.base_records_val["class"].nunique()))

            print("Remaining classes Classes:" + str(self.remaining_classes))
            print("Number of remaining classes:" + str(len(self.remaining_classes)))

        return self.training_dfs, self.Validation_dfs

    def GetSampleData(self, training_dfs, validation_dfs, num_batches, instances_per_batch):
        """Return the first ``num_batches`` batches, each cut to its first
        ``instances_per_batch`` rows (for quick smoke runs)."""
        training_dfs_short = training_dfs[:num_batches]
        validation_dfs_short = validation_dfs[:num_batches]

        self.training_dfs_short = [df.head(n = instances_per_batch) for df in training_dfs_short]
        self.validation_dfs_short = [df.head(n = instances_per_batch) for df in validation_dfs_short]

        return self.training_dfs_short, self.validation_dfs_short

    def GetTrainingBatchesOnly(self, num_classes):
        """Build incremental batches that contain only the new classes.

        Returns
        -------
        tuple
            ``(training_frames, validation_frames, previous_classes)`` where
            ``previous_classes[i]`` lists the classes seen before batch ``i``
            (empty for the first batch).
        """
        self.number_of_classes = num_classes
        base_classes, base_dfs, base_records = self.select_new_training_classes(self.number_of_classes)
        base_classes_val, base_dfs_val, base_records_val = self.select_validation_classes()

        self.training_dfs = [base_records]
        self.Validation_dfs = [base_records_val]

        self.previous_classes_list = [[]]

        print("Number of Unique Classes in New train  Dataframe:" + str(base_records["class"].nunique()))
        print("Unique Classes in New train  Dataframe:" + str(base_records["class"].unique()))

        print("Number of Unique Classes in New val Dataframe:" + str(base_records_val["class"].nunique()))
        print("Unique Classes in New val Dataframe:" + str(base_records_val["class"].unique()))

        print("Old Training Classes:" + "Nill")

        print("===========================================================================")

        # Iterate until every class is consumed instead of deriving the batch
        # count from a hard-coded total of 47 classes.
        while self.remaining_classes:
            self.new_selected_classes, self.filtered_train_dfs, self.all_filt_train_records_df = self.select_new_training_classes(num_classes)
            self.training_dfs.append(self.all_filt_train_records_df)

            self.all_training_records_df = pd.concat(self.training_dfs)

            self.classes_for_validation = self.all_training_records_df['class'].unique()
            self.base_classes_val, self.base_dfs_val, self.base_records_val = self.select_validation_classes(self.classes_for_validation)
            self.Validation_dfs.append(self.base_records_val)

            print("Number of Unique Classes in New train  Dataframe:" + str(self.all_filt_train_records_df["class"].nunique()))
            print("Unique Classes in New train  Dataframe:" + str(self.all_filt_train_records_df["class"].unique()))

            print("Number of Unique Classes in New val Dataframe:" + str(self.base_records_val["class"].nunique()))
            print("Unique Classes in New val Dataframe:" + str(self.base_records_val["class"].unique()))

            self.old_classes_only = list(set(self.all_training_records_df['class'].unique()) - set (self.all_filt_train_records_df["class"].unique()))

            self.previous_classes_list.append(self.old_classes_only)
            print("Old Training Classes:" + str(self.previous_classes_list[-1]))

            print("===========================================================================")

        return self.training_dfs, self.Validation_dfs, self.previous_classes_list

    def GetFixedDataBatches(self, training_dfs, previous_class_list, num_prev_records):
        """Add replay rows from old classes to pre-built new-class batches.

        The first batch is passed through untouched. Every later batch gets
        up to ``num_prev_records`` rows per class listed in the matching entry
        of ``previous_class_list`` and is then shuffled.

        Returns
        -------
        list of pandas.DataFrame
            The augmented training batches.
        """
        self.training_dfs = []

        for position, (tdf, prev_class_list) in enumerate(zip(training_dfs, previous_class_list)):

            if position == 0:
                self.training_dfs.append(tdf)
                continue

            self.filtered_old_train_dfs, self.all_old_filt_train_records_df = self.select_old_training_classes(prev_class_list, num_prev_records)

            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            self.all_filt_train_records_df = shuffle(
                pd.concat([tdf, self.all_old_filt_train_records_df])
            )

            self.training_dfs.append(self.all_filt_train_records_df)

            print("*******************************")
            print("*******************************")
            print("*******************************")
            print("*******************************")

            print("Number of Unique Classes in New train  Dataframe:" + str(tdf["class"].nunique()))
            print("Unique Classes in New train  Dataframe:" + str(tdf["class"].unique()))

            print("Number of Unique Classes in old dataframe:" + str(self.all_old_filt_train_records_df["class"].nunique()))
            print("Unique Classes in old dataframe:" + str(self.all_old_filt_train_records_df["class"].unique()))

        return self.training_dfs

In [None]:
import torch
from transformers import FlaubertTokenizer, FlaubertModel, FlaubertConfig, FlaubertForSequenceClassification
from sklearn.metrics import accuracy_score
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import os
import numpy as np

class FlaubertDataset(torch.utils.data.Dataset):
    """Torch dataset wrapping a frame with ``text`` and ``class`` columns.

    All texts are tokenized once at construction time (truncated and padded,
    ``max_length=512``); items are served from the cached encodings.
    """

    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        texts = dataset["text"].tolist()
        self.texts_list = texts
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = dataset["class"].tolist()

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors of example ``idx`` plus a one-element ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample


class ModelInitializer():
    """Fine-tune one Flaubert classifier sequentially over incremental batches.

    Parameters
    ----------
    max_length : int
        Stored on the instance; the dataset wrapper used during training
        applies its own truncation length, so this value is informational.
    """

    def __init__(self, max_length):

        self.max_length = max_length
        # NOTE(review): `report_to=None` may still enable every installed
        # logging integration depending on the transformers version; if the
        # intent is to disable reporting, confirm whether "none" is required.
        self.training_args = TrainingArguments(
                                                report_to=None,
                                                output_dir='./results',          # output directory
                                                num_train_epochs = 5,              # total number of training epochs
                                                per_device_train_batch_size=8,   # batch size per device during training
                                                per_device_eval_batch_size=8,    # batch size for evaluation
                                                learning_rate = 2e-5,
                                                warmup_steps=500,                # number of warmup steps for learning rate scheduler
                                                weight_decay=0.01,               # strength of weight decay
                                                load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
                                                save_strategy = "epoch",         # checkpoint at the end of every epoch
                                                evaluation_strategy="epoch",     # evaluate at the end of every epoch
                                            )


    @staticmethod
    def compute_metrics(pred):
        """Trainer callback: accuracy of the arg-max predictions."""
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        # calculate accuracy using sklearn's function
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
        }


    def softmax(self, X, theta = 1.0, axis = None):
        """Numerically stable softmax.

        Parameters
        ----------
        X : array-like
            Scores; lists are accepted as well as arrays.
        theta : float
            Multiplier applied to the scores before exponentiation.
        axis : int or None
            Axis to normalise over. ``None`` picks the first axis of the
            2-D view whose length exceeds 1 (the last axis if there is none).

        Returns
        -------
        numpy.ndarray
            Probabilities; flattened back to 1-D when ``X`` was 1-D.
        """
        # make X at least 2d
        y = np.atleast_2d(X)

        # find axis; fall back to the last axis when every dimension is 1
        if axis is None:
            axis = next((i for i, size in enumerate(y.shape) if size > 1), y.ndim - 1)

        # multiply y against the theta parameter,
        y = y * float(theta)

        # subtract the max for numerical stability
        y = y - np.expand_dims(np.max(y, axis = axis), axis)

        # exponentiate y
        y = np.exp(y)

        # take the sum along the specified axis
        ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)

        # finally: divide elementwise
        p = y / ax_sum

        # flatten if X was 1D (np.ndim also works for plain lists)
        if np.ndim(X) == 1:
            p = p.flatten()

        return p

    def StartTraining (self,  model_name, model_type, num_labels_p, training_dfs, validation_dfs, instances_from_old_classes, test_df = None):
        """Train on every batch in turn, then predict on the test split.

        Parameters
        ----------
        model_name : str
            Pretrained checkpoint name passed to ``from_pretrained``.
        model_type : str
            Not used; kept for interface compatibility.
        num_labels_p : int
            Size of the classification head.
        training_dfs, validation_dfs : list of pandas.DataFrame
            Index-aligned batches with ``text`` and ``class`` columns.
        instances_from_old_classes : int
            Only printed in the progress banner.
        test_df : pandas.DataFrame or None
            Test split. When omitted, the notebook-level variable ``test`` is
            used if it exists (the original implicit behaviour).

        Returns
        -------
        tuple
            ``(predictions_per_batch, labels_per_batch, test_result)``;
            ``test_result`` is ``None`` when no test data is available.

        Raises
        ------
        ValueError
            If ``training_dfs`` or ``validation_dfs`` is empty.
        """
        # Local import: `shutil` is otherwise only in scope when an earlier
        # notebook cell happened to import it.
        import shutil

        if len(training_dfs) == 0 or len(validation_dfs) == 0:
            raise ValueError("training_dfs and validation_dfs must each contain at least one batch")

        predictions_array = []
        labels = []

        # Fall back to CPU so the notebook also runs on a machine without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.tokenizer = FlaubertTokenizer.from_pretrained(model_name)
        self.model = FlaubertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels_p).to(device)


        print("==========================================")
        print("==========================================")
        print ("Total Batches:", len(training_dfs))
        print("==========================================")
        print("==========================================")

        trainer = None

        for batch_no, (train, val) in enumerate(zip(training_dfs, validation_dfs), start=1):

            print("==========================================")
            print("==========================================")
            print ("Number of old Instances:", (instances_from_old_classes))
            print ("Processing Batch Number:", batch_no)
            print ("Total Batches:", len(training_dfs))
            print("==========================================")
            print("==========================================")

            train_dataset = FlaubertDataset(train, self.tokenizer)
            valid_dataset = FlaubertDataset(val, self.tokenizer)

            # A fresh Trainer per batch; the same model instance keeps training.
            trainer = Trainer(
                model= self.model,                         # the instantiated Transformers model to be trained
                args= self.training_args,                  # training arguments, defined above
                train_dataset = train_dataset,         # training dataset
                eval_dataset = valid_dataset,          # evaluation dataset
                compute_metrics= ModelInitializer.compute_metrics,     # the callback that computes metrics of interest
            )

            trainer.train()

            print("==============  training complete  ==================")

            pred = trainer.predict(valid_dataset)
            predictions_array.append(pred)
            y_test = val["class"].tolist()
            labels.append(y_test)

            print("==============  evaluation complete  ==================")

            predictions = self.softmax(pred.predictions, axis = 1)
            y_pred = list((np.argmax(predictions,axis= 1)))

            print(accuracy_score(y_test, y_pred))

            # Remove the checkpoints of this batch from the directory the
            # Trainer actually wrote to (was a hard-coded absolute path).
            shutil.rmtree(self.training_args.output_dir, ignore_errors=True)

            print("===========================================")
            print("Checkpoints deleted")
            print("===========================================")


        if test_df is None:
            # Backward compatibility: earlier callers relied on a global `test`.
            test_df = globals().get("test")

        if test_df is None:
            result = None
        else:
            test_dataset = FlaubertDataset(test_df, self.tokenizer)
            result = trainer.predict(test_dataset)

        return predictions_array, labels, result

In [None]:
import pandas as pd
import logging


# Locations of the three dataset splits on the mounted Drive.
file_train_path = "/content/drive/MyDrive/datasets/text_classification/data_train.csv"
file_test_path = "/content/drive/MyDrive/datasets/text_classification/data_test.csv"
file_val_path = "/content/drive/MyDrive/datasets/text_classification/data_valid.csv"


def _read_latin1(csv_path):
    """Read one split; the CSVs are read with latin-1 encoding."""
    return pd.read_csv(csv_path, encoding='latin-1')


train_raw = _read_latin1(file_train_path)
valid = _read_latin1(file_val_path)
test = _read_latin1(file_test_path)

In [None]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import FlaubertTokenizer, FlaubertModel, FlaubertConfig, FlaubertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.utils import shuffle, validation
import pdb
import os


# Dataset CSV locations on the mounted Drive (same values as assigned in the
# earlier loading cell; repeated here so this cell can run on its own).
file_train_path = "/content/drive/MyDrive/datasets/text_classification/data_train.csv"
file_test_path = "/content/drive/MyDrive/datasets/text_classification/data_test.csv"
file_val_path = "/content/drive/MyDrive/datasets/text_classification/data_valid.csv"


In [None]:
# Note the argument order: DataLoader takes (train, valid, test).
data_loader = DataLoader(file_train_path, file_val_path, file_test_path)

# Read the three CSVs, then split each frame into one sub-frame per class.
train, valid, test = data_loader.LoadData()
train_dfs, valid_dfs, test_dfs = data_loader.GetDFListsPerClass()


print("************** document loaded ****************")

************** document loaded ****************


In [None]:
import pickle

# Number of replayed rows per old class to try; one full training run each.
instance_list = [50]
# NOTE(review): the file name says "random" but the organizer below is built
# with train_strategy="small_first" - confirm which one is intended.
result_path = "/content/drive/MyDrive/datasets/text_classification/Flaubert models/random_5_5_50_instances"

# Experiment settings that do not change between runs.
classes_per_batch = 5
num_base_classes = 5
model_name = 'flaubert/flaubert_base_cased'
num_labels = 47
max_length = 512

result_list = []

for instances_from_old_class in instance_list:
    print("==========================================")
    print("==========================================")
    print("==========================================")
    print ("Instances from old class:", instances_from_old_class)
    print("==========================================")
    print("==========================================")
    print("==========================================")

    incr_data_organizer = IncrementalDataOrganizer(train,valid,test,train_dfs,valid_dfs,test_dfs, train_strategy= "small_first")

    training_dfs, validation_dfs = incr_data_organizer.GetDataBatches(classes_per_batch, instances_from_old_class, num_base_classes)

    print(len(training_dfs))
    print(training_dfs[0].shape)

    incremental_model = ModelInitializer( max_length)
    predictions_array, labels, test_results = incremental_model.StartTraining(model_name, "flaubert", num_labels, training_dfs, validation_dfs, instances_from_old_class)

    result = (instances_from_old_class, predictions_array, labels, test_results)
    result_list.append(result)

    # Re-write the cumulative results after every run so a later crash does
    # not lose finished work; `with` guarantees the handle is closed.
    with open(result_path, 'wb') as outfile:
        pickle.dump(result_list, outfile)

print("Total batches processed:",len(result_list))