In [None]:
import spacy

In [None]:
# Download these spaCy models before running the notebook (run the commands below in a shell):

"""

python -m spacy download en_core_web_sm
python -m spacy download es_core_news_sm
python -m spacy download fr_core_news_sm

"""

In [None]:
import nltk

# `stopwords` backs the stop-word lists built below. `punkt` / `punkt_tab` hold
# the tokenizer data that `word_tokenize` looks up when it is first called
# (which of the two names is needed depends on the installed NLTK version).
# NOTE(review): a name unknown to the installed NLTK index is expected to be
# reported by `nltk.download` and skipped rather than raised - confirm.
for _resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(_resource)

In [None]:
import os
import re
import base64
import io
import json
import requests
import time
import random
import ast
import string
import spacy
import transformers
import string
import fasttext
import pandas as pd
import numpy as np
import torch
import tensorflow as tf

from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from skmultilearn.model_selection import iterative_train_test_split

from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from tqdm import tqdm, trange

In [None]:
# Label columns of the HumSet frame; the rest of the notebook loops over them
# and treats each one as its own multi-label target.
COLUMNS = ['sectors', 'pillars_1d', 'pillars_2d', 'subpillars_2d', 'subpillars_1d']

# De-duplicated NLTK stop-word lists, one per language.
ENG_STOP, ES_STOP, FR_STOP = (
    list(set(stopwords.words(language)))
    for language in ('english', 'spanish', 'french')
)

# Single combined list (English, then French, then Spanish) used for text cleaning.
stop_words = ENG_STOP + FR_STOP + ES_STOP


# spaCy pipelines keyed by the language code used to look them up.
nlp_models = {
    code: spacy.load(package)
    for code, package in (
        ("en", "en_core_web_sm"),
        ("fr", "fr_core_news_sm"),
        ("es", "es_core_news_sm"),
    )
}

In [None]:
# Load the HumSet dataset from CSV into a DataFrame.
# The path is relative to the notebook's current working directory.

data = pd.read_csv("HumSet/humset_data.csv")

### Data Preparation

In [None]:
def remove_stop(x, stop_words):
    """
    Tokenize `x` with NLTK's `word_tokenize`, drop every token found in
    `stop_words` (exact, case-sensitive match) and return the remaining
    tokens joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(x))
    return " ".join(kept_tokens)

def text_preprocess(x, lang):
    """
    Clean one excerpt: remove stop words, run the spaCy pipeline registered
    for `lang` in `nlp_models`, keep the lemma of every token that is neither
    punctuation nor purely numeric, and return the lemmas as one stripped,
    lower-cased string.

    Raises KeyError when `lang` has no entry in `nlp_models`.
    """
    doc = nlp_models[lang](remove_stop(x, stop_words))

    lemmas = []
    for token in doc:
        if token.is_punct or token.text.isnumeric():
            continue
        lemmas.append(token.lemma_)

    return " ".join(lemmas).strip().lower()

def pre_process(df):

    """
    Normalise the label columns and add a cleaned-text column.

    For every column listed in COLUMNS each cell is turned into a Python list
    of labels, with the placeholder values "None", "NOT_MAPPED", "UNKNOWN" and
    None removed. A `clean_excerpt` column is then added, holding the output of
    `text_preprocess` (stop-word removal + spaCy lemmatization) applied to the
    `excerpt` column with the language given in the `lang` column.

    The frame is modified in place and also returned. Cells that are already
    lists are accepted, so calling the function again on its own output does
    not fail on the label columns.

    """

    placeholders = ["None", "NOT_MAPPED", "UNKNOWN", None]

    def clean_and_convert(col):
        if isinstance(col, (list, tuple)):
            # already converted (e.g. the cell was re-run on processed data)
            values = list(col)
        elif col is None or str(col) == "nan" or col == "":
            # missing value or empty string -> no labels
            values = []
        elif isinstance(col, str) and col[0] == "[" and col[-1] == "]":
            # textual list as stored in the CSV, e.g. "['Health', 'Food']"
            values = ast.literal_eval(col)
        else:
            # a single bare label
            values = [col]
        return [a for a in values if a not in placeholders]

    for c in COLUMNS:
        # column-wise apply: same result as a row-wise apply, without building
        # a row object for every record
        df[c] = df[c].apply(clean_and_convert)

    df["clean_excerpt"] = [
        text_preprocess(excerpt, lang)
        for excerpt, lang in zip(df["excerpt"], df["lang"])
    ]

    return df

In [None]:
def iterative_splitting(df):

    """
    Multi-label split of `df` into train, validation and test frames.

    Based on: http://scikit.ml/ library (`iterative_train_test_split`).

    The label columns in COLUMNS are binarized and concatenated column-wise so
    that the split takes all of them into account at once. 10% of the rows are
    held out for test first, then 10% of the remaining rows for validation,
    i.e. roughly 81% / 9% / 10% train / validation / test.

    Rows are selected through `df.index`, so the index values are expected to
    be unique.

    Returns:
        (df_train, df_val, df_test)
    """

    binarizer = preprocessing.MultiLabelBinarizer()
    label_blocks = [binarizer.fit_transform(df[c]) for c in COLUMNS]

    row_ids = df.index.to_numpy().reshape(-1, 1)
    label_matrix = np.concatenate(label_blocks, axis=1)

    # first cut: hold out the test rows
    X_rest, y_rest, X_test, _ = iterative_train_test_split(row_ids,
                                                           label_matrix,
                                                           test_size=0.1)

    # second cut: carve the validation rows out of what is left
    X_train, _, X_val, _ = iterative_train_test_split(X_rest,
                                                      y_rest,
                                                      test_size=0.1)

    df_train = df[df.index.isin(X_train.reshape(-1))]
    df_val = df[df.index.isin(X_val.reshape(-1))]
    df_test = df[df.index.isin(X_test.reshape(-1))]

    return df_train, df_val, df_test

In [None]:
def lead_splitted_recombination(df, df_train, df_val, df_test, seed=None):

    """
    Re-assign rows so that no `lead_id` is shared between two splits.

    The three splits are reconciled pairwise (train/val, train/test, val/test).
    For each pair, all rows whose lead occurs in both splits are grouped by
    lead, the groups are shuffled, and the first `proportions` percent of the
    groups go to the first split while the rest go to the second one.
    The proportions (80, 85, 75) are manually fine-tuned.

    Args:
        df: full frame that the three splits were taken from; rows are looked
            up through its index.
        df_train, df_val, df_test: the splits to reconcile.
        seed: optional seed for the shuffling. With the default (None) the
            global `random` state is used.

    Returns:
        (df_train, df_val, df_test) with pairwise disjoint `lead_id` sets.

    Raises:
        AssertionError: if two of the resulting splits still share a lead.
    """

    rng = random.Random(seed) if seed is not None else random

    def recombine_by_lead_id(df1, df2, proportions):

        shared = set(df1.lead_id).intersection(set(df2.lead_id))
        in_df1 = df1.lead_id.isin(shared)
        in_df2 = df2.lead_id.isin(shared)

        # one group of row labels per shared lead (rows of df1 first, then df2);
        # sort=False keeps first-appearance order so a seeded shuffle is repeatable
        both = pd.concat([df1[in_df1], df2[in_df2]])
        groups = [rows.index.tolist() for _, rows in both.groupby("lead_id", sort=False)]

        rng.shuffle(groups)
        cut = int((len(groups) * proportions) / 100)
        indexes_1 = [i for group in groups[:cut] for i in group]
        indexes_2 = [i for group in groups[cut:] for i in group]

        # rows whose lead was never shared stay where they were
        new_df1 = pd.concat([df[df.index.isin(indexes_1)], df1[~in_df1]])
        new_df2 = pd.concat([df[df.index.isin(indexes_2)], df2[~in_df2]])

        return new_df1, new_df2

    # train is reconciled with validation first; passing df_test here would
    # silently throw away the incoming validation split
    df_train, df_val = recombine_by_lead_id(df_train, df_val, 80)
    df_train, df_test = recombine_by_lead_id(df_train, df_test, 85)
    df_val, df_test = recombine_by_lead_id(df_val, df_test, 75)

    assert len(set(df_train.lead_id).intersection(set(df_val.lead_id))) == 0
    assert len(set(df_train.lead_id).intersection(set(df_test.lead_id))) == 0
    assert len(set(df_val.lead_id).intersection(set(df_test.lead_id))) == 0

    return df_train, df_val, df_test

In [None]:
# Run the cleaning pipeline; `data` is rebound to the frame returned by pre_process.
data = pre_process(data)

In [None]:
# Iterative multi-label split of the pre-processed frame into train / validation / test.
df_train, df_val, df_test = iterative_splitting(data)

In [None]:
# Make the splits disjoint on lead_id. The third result must be bound to
# `df_test`: the cells below read the test split under that name.
df_train, df_val, df_test = lead_splitted_recombination(
    data,
    df_train,
    df_val,
    df_test
)
# Alias kept because a later cell of this notebook still reads the test split as `df_text`.
df_text = df_test

In [None]:
# Shapes of the train / validation / test frames currently bound in the notebook.
df_train.shape, df_val.shape, df_test.shape

### Random Baseline

In [None]:
def get_random_baseline():
    """
    Random baseline, as an example of the one described in the paper.

    For every label column a stratified `DummyClassifier` is fitted on the
    binarized labels of `df_train` (hardcoded) and asked to predict for the
    same rows. The per-class classification report is written to
    ./metrics/random_baseline_<COLUMN>.csv, one file per category.
    """

    out_dir = "./metrics"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    for column in COLUMNS:

        binarizer = preprocessing.MultiLabelBinarizer()
        binarizer.fit(df_train[column])

        # any column can act as the features: the dummy classifier ignores them
        features = df_train.excerpt.tolist()
        targets = binarizer.transform(df_train[column])

        baseline = DummyClassifier(strategy="stratified")
        baseline.fit(features, targets)
        guesses = baseline.predict(features)

        report = classification_report(
            targets,
            guesses,
            target_names=list(binarizer.classes_),
            zero_division=False,
            output_dict=True,
        )

        pd.DataFrame(report).transpose().to_csv(
            f"./metrics/random_baseline_{column.upper()}.csv"
        )

In [None]:
# get_random_baseline()

### fastText

In [None]:
def prepare_fasttext_data(df, column, filename=None):

    """
    Write `df` to ./fast_data/<filename> in fastText supervised format.

    The "clean_excerpt" column is used as the text to encode: fastText is a
    vocabulary based model, so a lemmatized text without stop words is wanted.

    Each output line is "__label__a __label__b ... text". Labels are stripped
    and lower-cased, and their inner spaces are replaced by "*" so that one
    label stays one token. Rows without labels get the single label "NEGATIVE".
    In the text, newlines become spaces and ASCII punctuation is removed.

    Args:
        df: frame with a `clean_excerpt` column and the label column `column`.
        column: name of the column holding a list of labels per row.
        filename: name of the file to create inside ./fast_data (required).

    Raises:
        ValueError: if `filename` is not given.
    """

    if filename is None:
        raise ValueError("filename is required: it names the file created under ./fast_data")

    os.makedirs("./fast_data", exist_ok=True)

    strip_punctuation = str.maketrans("", "", string.punctuation)

    lines = []
    for raw_text, raw_labels in zip(df.clean_excerpt, df[column].tolist()):
        text = raw_text.strip().lower().replace("\n", " ").translate(strip_punctuation)
        if raw_labels:
            labels = [label.strip().lower().replace(" ", "*") for label in raw_labels]
        else:
            labels = ["NEGATIVE"]
        prefix = " ".join(f"__label__{label}" for label in labels)
        lines.append(" ".join([prefix, text]))

    # explicit UTF-8: the excerpts are multilingual and the platform default
    # encoding is not guaranteed to be UTF-8
    with open(f"./fast_data/{filename}", "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
        
        
def get_pred(filename, model, thres = 0.5):

    """
    Predict labels for every line of a fastText-formatted file.

    Each line of `filename` is stripped of its "__label__" tokens and the
    remaining text is passed to `model.predict(text, k=-1, threshold=thres)`.
    The predicted labels are returned without the "__label__" prefix and with
    "*" turned back into spaces; labels containing "NEGATIVE" are dropped.

    Args:
        filename: path of a UTF-8 text file, one sample per line.
        model: object exposing `predict(text, k=..., threshold=...)` whose
            result has the predicted labels as its first element.
        thres: threshold forwarded to `model.predict`.

    Returns:
        A list with one list of label strings per line of the file.
    """

    # context manager so the handle is closed even if prediction fails
    with open(filename, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    predictions = []
    for line in lines:
        text = " ".join(token for token in line.split() if "__label__" not in token).strip()
        predicted = model.predict(text, k=-1, threshold=thres)[0]
        predictions.append([
            label.replace("__label__", "").replace("*", " ")
            for label in predicted
            if "NEGATIVE" not in label
        ])

    return predictions

In [None]:
def train_fasttext(return_model=False):

    """
    Train and evaluate one fastText classifier per label column.

    For every column in COLUMNS the train / validation / test frames
    (`df_train`, `df_val`, `df_test`) are written in fastText format, a
    supervised model is trained with the one-vs-all loss and fastText's
    automatic hyper-parameter tuning on the validation file, and the
    classification report on the test file is saved to
    ./metrics/fasttext_<COLUMN>.csv.

    Args:
        return_model: when True, return a dict {column: trained model}.

    Returns:
        The dict of models if `return_model` is True, otherwise None.
    """

    os.makedirs("./metrics", exist_ok=True)

    models = {}
    for column in COLUMNS:

        prepare_fasttext_data(df_train, column, "fasttextdata.train")
        prepare_fasttext_data(df_val, column, "fasttextdata.val")
        prepare_fasttext_data(df_test, column, "fasttextdata.test")

        model = fasttext.train_supervised(
              input="./fast_data/fasttextdata.train",
              autotuneValidationFile="./fast_data/fasttextdata.val", # automatic hyperparameters tuning provided in fasttext
              thread=5,
              loss="ova" # one-vs-all loss for multi-label output
          )

        if return_model:
            models.update({column: model})

        predictions = get_pred("./fast_data/fasttextdata.test", model)

        # gold labels normalised the same way prepare_fasttext_data writes them
        gold = [[c.strip().lower() for c in a] for a in df_test[column]]

        # fit on gold AND predicted labels: fitting on the predictions alone
        # drops every class the model never predicts from the report
        mlb = preprocessing.MultiLabelBinarizer()
        mlb.fit(gold + predictions)
        y_pred = mlb.transform(predictions)
        y_test = mlb.transform(gold)

        report = classification_report(y_test,
                                       y_pred,
                                       target_names=list(mlb.classes_),
                                       zero_division=False,
                                       output_dict=True)

        results = pd.DataFrame(report).transpose()
        results.to_csv(f"./metrics/fasttext_{column.upper()}.csv")

    if return_model: return models

In [None]:
# Train and evaluate the fastText models; reports are written under ./metrics.
train_fasttext()

### Pre-trained Language Models

In [None]:
# A GPU is needed for the fine-tuning below; this cell itself falls back to
# the CPU so that it does not fail on a machine without CUDA.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises when no CUDA device is present, so only query it
# when one is available
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
# Checkpoint names of the pre-trained language models that can be fine-tuned below.
XTREME_DISTL = "microsoft/xtremedistil-l6-h256-uncased"
XML_ROBERT = "xlm-roberta-base"

In [None]:
def _build_dataloader(tokenizer, texts, labels, max_length, batch_size, shuffle):
    """Tokenize `texts` and bundle input ids, attention masks and labels in a DataLoader."""
    encodings = tokenizer(
        texts,
        max_length=max_length,
        padding="max_length",   # every sequence padded to max_length
        truncation=True,        # and cut to it, so the tensors are rectangular
    )
    dataset = TensorDataset(
        torch.tensor(encodings["input_ids"]),
        torch.tensor(encodings["attention_mask"]),
        torch.tensor(labels),
    )
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


def _predict_multilabel(model, dataloader, threshold=0.5):
    """Run `model` over `dataloader` and return (y_true, y_pred) as 0/1 integer arrays."""
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
            # sigmoid per label, then threshold: labels are predicted independently
            y_pred.append((torch.sigmoid(logits) > threshold).int().cpu().numpy())
            y_true.append((b_labels == 1).int().cpu().numpy())
    return np.concatenate(y_true), np.concatenate(y_pred)


def train_transformer(backbone:str = XTREME_DISTL,
                      epochs:int = 3,
                      max_length:int = 100,
                      batch_size:int = 32,
                      learning_rate:float = 2e-5):
    """
    Fine-tune a pre-trained language model for every label column and report
    its test metrics.

    For each column in COLUMNS a fresh `backbone` checkpoint is loaded with a
    classification head sized to the labels seen in `df_train`, trained on the
    raw `excerpt` text of `df_train` with a binary cross-entropy loss on the
    logits, evaluated on `df_val` after every epoch (macro F1 at threshold 0.5)
    and finally evaluated on `df_test`. The test classification report is saved
    to ./metrics/<backbone>_<COLUMN>.csv, with "/" in the checkpoint name
    replaced by "_" so the name cannot point into a missing sub-directory.

    Args:
        backbone: checkpoint name used for both the tokenizer and the model.
        epochs: number of passes over the training data.
        max_length: number of tokens every excerpt is padded / truncated to.
        batch_size: batch size for training and evaluation.
        learning_rate: learning rate of the AdamW optimizer.
    """

    os.makedirs("./metrics", exist_ok=True)
    report_prefix = backbone.replace("/", "_")

    for column in COLUMNS:

        mlb = preprocessing.MultiLabelBinarizer()
        mlb.fit(df_train[column])
        label_names = list(mlb.classes_)
        num_labels = len(label_names)

        # `create_token_type_ids_from_sequences` is the name of a tokenizer
        # method, not a loading option, so it is not passed here
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            backbone,
            do_lower_case=True
        )

        model = transformers.AutoModelForSequenceClassification.from_pretrained(
            backbone,
            num_labels=num_labels
        )
        model.to(device)

        # no weight decay on biases and layer-norm parameters; the group key
        # must be `weight_decay` for the optimizer to read it
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        named_parameters = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

        train_dataloader = _build_dataloader(
            tokenizer, df_train.excerpt.tolist(), mlb.transform(df_train[column]),
            max_length, batch_size, shuffle=True
        )
        validation_dataloader = _build_dataloader(
            tokenizer, df_val.excerpt.tolist(), mlb.transform(df_val[column]),
            max_length, batch_size, shuffle=False
        )
        test_dataloader = _build_dataloader(
            tokenizer, df_test.excerpt.tolist(), mlb.transform(df_test[column]),
            max_length, batch_size, shuffle=False
        )

        loss_fn = BCEWithLogitsLoss()

        for _ in trange(epochs, desc="Epoch"):

            model.train()
            tr_loss, nb_tr_steps = 0.0, 0

            for batch in train_dataloader:

                b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
                optimizer.zero_grad()

                logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
                loss = loss_fn(
                    logits.view(-1, num_labels),
                    b_labels.type_as(logits).view(-1, num_labels)
                )

                loss.backward()
                optimizer.step()

                # accumulated per batch so the printed value is the epoch mean
                tr_loss += loss.item()
                nb_tr_steps += 1

            print("loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

            val_true, val_pred = _predict_multilabel(model, validation_dataloader, threshold=0.50)
            val_f1 = f1_score(val_true, val_pred, average='macro')

            print('Macro-Average F1-score: ', val_f1)

        y_test, y_pred = _predict_multilabel(model, test_dataloader, threshold=0.50)

        report = classification_report(y_test,
                                       y_pred,
                                       target_names=label_names,
                                       zero_division=0,
                                       output_dict=True)

        results = pd.DataFrame(report).transpose()
        results.to_csv(f"./metrics/{report_prefix}_{column.upper()}.csv")

In [None]:
# Fine-tune and evaluate with the default backbone (XTREME_DISTL).
train_transformer()