In [None]:
import os
import sys

# Make the directory three levels above the current working directory importable.
# The path is built with os.path.join so every separator is explicit: gluing a
# "../.." string directly onto os.getcwd() fuses the first ".." with the last
# directory name, which hides how many levels are really climbed.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))

## Preprocessing

In [None]:
from ast import literal_eval
from collections import Counter

import pandas as pd
from tqdm.auto import tqdm

def strip_whitespaces(arr):
    """Return a new list where each item of ``arr`` has its spaces replaced by underscores.

    Only the plain space character (' ') is replaced; other whitespace such as
    tabs is left untouched. The input sequence is not modified.
    """
    underscored = []
    for label in arr:
        underscored.append(label.replace(' ', '_'))
    return underscored

def read_dataset(path):
    """Load a dataset CSV and normalise its two label columns.

    The ``pillars_1d`` and ``subpillars_1d`` columns are stored in the CSV as
    string representations of Python lists. Each one is parsed back into a real
    list with ``literal_eval`` and then passed through ``strip_whitespaces`` so
    that no label contains a space.

    Args:
        path: location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with both label columns holding lists of strings.
    """
    frame = pd.read_csv(path)

    for column in ("pillars_1d", "subpillars_1d"):
        # string -> list, then replace spaces inside every label
        parsed = frame[column].apply(literal_eval)
        frame[column] = parsed.apply(strip_whitespaces)

    return frame

def get_unique(df, field):
    """Collect the distinct labels found in a list-valued column.

    Prints the label frequencies (most common first) as a side effect.

    Args:
        df: mapping/DataFrame whose ``field`` entry iterates over lists of labels.
        field: name of the list-valued column to scan.

    Returns:
        The distinct labels as a sorted list. Sorting makes the order
        reproducible across interpreter restarts; a plain ``list(set(...))``
        of strings can come back in a different order on every run.
    """
    counts = Counter()
    for labels in df[field]:
        counts.update(labels)

    print(counts.most_common())
    return sorted(counts)

def preprocess_dataset(df, pillars, subpillars):
    """Add one 0/1 indicator column per pillar and per subpillar.

    For every label in ``pillars`` a column of that name is set to 1 on rows
    whose ``pillars_1d`` list contains the label and 0 elsewhere; the same is
    done for ``subpillars`` against ``subpillars_1d``.

    Labels that occur in a row but are not part of the supplied vocabularies
    are ignored, so the resulting columns are exactly the requested ones (a
    row-wise ``df.loc[idx, label] = 1`` would instead create a new, mostly-NaN
    column for every unknown label).

    Args:
        df: DataFrame with list-valued ``pillars_1d`` and ``subpillars_1d`` columns.
        pillars: iterable of pillar labels to encode.
        subpillars: iterable of subpillar labels to encode.

    Returns:
        The same ``df`` object, modified in place, with the indicator columns added.
    """
    for source_column, labels in (("pillars_1d", pillars), ("subpillars_1d", subpillars)):
        # Convert each row's label list to a set once so membership tests are O(1).
        row_label_sets = [set(row_labels) for row_labels in df[source_column]]
        for label in labels:
            # Build the whole column at once instead of assigning cell by cell.
            df[label] = pd.Series(
                [int(label in row_labels) for row_labels in row_label_sets],
                index=df.index,
                dtype="int64",
            )
    return df


In [None]:
# Load the training and validation splits.
train_csv = "../../../data/frameworks_data/data_v0.5/data_v0.5_train.csv"
val_csv = "../../../data/frameworks_data/data_v0.5/data_v0.5_val.csv"
df_train, df_val = read_dataset(train_csv), read_dataset(val_csv)

print(df_train.columns)

# The label vocabularies are derived from the training split only and then
# applied to both splits.
pillars, subpillars = (
    get_unique(df_train, field) for field in ("pillars_1d", "subpillars_1d")
)

# Attach the per-label indicator columns to each split.
df_train, df_val = (
    preprocess_dataset(frame, pillars, subpillars) for frame in (df_train, df_val)
)

In [None]:
# Show the first rows of the preprocessed training frame as a quick visual check.
df_train.head()

## Training

In [None]:
import fasttext
import numpy as np
# fasttext.FastText.eprint = lambda x: None # suppress warnings

def prepare_ground_truth(df, subpillars):
    """Extract the input texts and their per-label targets from ``df``.

    Args:
        df: DataFrame with an ``excerpt`` column and one column per entry of
            ``subpillars``.
        subpillars: column names to use as targets, in the desired output order.

    Returns:
        ``(X, y)`` where ``X`` is the list of excerpts and ``y`` is a list with
        one inner list per row, holding that row's value for each requested
        column in the order given by ``subpillars``.
    """
    texts = df['excerpt'].tolist()
    # Stack the selected columns side by side: shape (n_rows, n_labels).
    indicator = np.stack([df[name].to_numpy() for name in subpillars], axis=1)
    targets = indicator.tolist()
    return texts, targets

def prepare_fasttext_input(X, y, subpillars, path):
    """Write a fastText supervised-training file, one example per line.

    Each line has the form ``__label__<name> ... <text>``, with one
    ``__label__`` token for every position of the example's target vector
    that equals 1.

    Args:
        X: sequence of text strings.
        y: sequence of 0/1 target vectors, parallel to ``X``.
        subpillars: label names; ``subpillars[i]`` names position ``i`` of each
            target vector.
        path: destination file; it is overwritten if it already exists.
    """
    # fastText reads its input as UTF-8, so do not rely on the platform's
    # default text encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for text, flags in tqdm(zip(X, y), total=len(X)):
            tags = [f'__label__{subpillars[i]}' for i, flag in enumerate(flags) if flag == 1]
            # The format is strictly one example per line: an embedded line
            # break would cut the example in two and leave the second half
            # without labels, so line breaks are flattened to spaces.
            single_line = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            f.write(' '.join(tags + [single_line]) + '\n')

def train(input, **hparams):
    """Train a supervised fastText model.

    Args:
        input: path of the training file in fastText's ``__label__`` format.
        **hparams: forwarded unchanged to ``fasttext.train_supervised``.

    Returns:
        The model object returned by ``fasttext.train_supervised``.
    """
    # Use the function's own argument rather than a same-purpose global
    # variable, so the call trains on the file the caller actually passed.
    return fasttext.train_supervised(
        input=input,
        **hparams)

In [None]:
models = []
preds_train, preds_val = [], []
gt_train, gt_val = [], []

hparams = {
    'lr': 0.1,
    'epoch': 5,
    'wordNgrams': 5,
    'bucket': 2000000,
    'dim': 256,
    'loss': 'ova'
}


def _predict_in_label_order(model, texts):
    """Predict all labels for ``texts`` with columns in ``model.get_labels()`` order.

    ``model.predict(texts, k=-1)`` returns, for each sample, the labels ranked
    by decreasing probability, so position ``j`` means a different label from
    one sample to the next. The probabilities are scattered back into a fixed
    column layout here so they can be compared column-wise with the ground truth.

    Returns:
        ``(labels, probs)``: ``labels`` holds one list of label names per sample
        (every entry is the model's label list) and ``probs`` is an array of
        shape ``(len(texts), n_labels)``. A label missing from a sample's
        prediction keeps probability 0.
    """
    model_labels = list(model.get_labels())
    column_of = {label: col for col, label in enumerate(model_labels)}
    ranked_labels, ranked_probs = model.predict(texts, k=-1)

    probs = np.zeros((len(texts), len(model_labels)))
    for row, (row_labels, row_probs) in enumerate(zip(ranked_labels, ranked_probs)):
        for label, prob in zip(row_labels, row_probs):
            probs[row, column_of[label]] = prob
    return [model_labels] * len(texts), probs


def _ground_truth_in_label_order(y, sp, labels):
    """Reorder the columns of ``y`` (given in ``sp`` order) to follow ``labels``.

    The result has exactly ``len(labels)`` columns, so it always matches the
    width of the prediction matrix even if the model knows fewer labels than
    ``sp`` lists.
    """
    y = np.asarray(y).reshape(-1, len(sp))
    return y[:, [sp.index(label) for label in labels]]


os.makedirs("ftdata", exist_ok=True)
for pillar in pillars:
    # get associated subpillars
    sp = [ s for s in subpillars if (pillar + '->') in s ]

    print('Pillar: ', pillar)
    print('Subpillars: ', sp)

    # prepare input to training
    print('Preparing ground truth...')
    X, y = prepare_ground_truth(df_train, sp)

    # NOTE: an existing file is reused as-is; remove the ftdata/ directory after
    # changing the dataset or the preprocessing, otherwise stale data is used.
    path = f"ftdata/train_{pillar}.txt"
    if not os.path.exists(path):
        print('Preparing input text file...')
        prepare_fasttext_input(X, y, sp, path)
    else:
        print('Using existing text file...')

    # train model
    print('Training model...')
    model = train(path, **hparams)

    # label names without the '__label__' prefix, in the model's own order
    labels = [ label[len('__label__'):] for label in model.get_labels() ]

    # predictions on train set (columns follow the model's label order)
    print('Predicting on the train set...')
    preds_train.append(_predict_in_label_order(model, X))

    # match ground truth to model label ordering
    gt_train.append(_ground_truth_in_label_order(y, sp, labels))

    # predictions on val set
    print('Predicting on the validation set...')
    X, y = prepare_ground_truth(df_val, sp)
    preds_val.append(_predict_in_label_order(model, X))

    # match ground truth to model label ordering
    gt_val.append(_ground_truth_in_label_order(y, sp, labels))

    # save the model
    models.append(model)

## Evaluation

In [None]:
def evaluate(gt, pred, threshold=0.5):
    """Score thresholded predictions against the ground truth and print the result.

    Args:
        gt: array-like of 0/1 ground-truth indicators.
        pred: array-like of predicted scores with the same layout as ``gt``.
        threshold: a score strictly greater than this value counts as a
            positive prediction. Defaults to 0.5.

    Returns:
        The ``(precision, recall, fscore, support)`` tuple produced by
        ``precision_recall_fscore_support``.
    """
    gt = np.asarray(gt)
    # binarise the scores
    pred = np.asarray(pred) > threshold

    precision, recall, fscore, support = precision_recall_fscore_support(gt, pred)
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('Fscore: ', fscore)
    print('Support: ', support)
    return precision, recall, fscore, support

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Names of the four values returned by evaluate(), in return order.
METRIC_NAMES = ('precision', 'recall', 'fscore', 'support')

classes = []
metrics_train = {name: [] for name in METRIC_NAMES}
metrics_val = {name: [] for name in METRIC_NAMES}

for i, pillar in enumerate(pillars):
    print('Pillar: ', pillar)
    print("Classes:", models[i].get_labels())
    classes.extend(models[i].get_labels())

    # Same procedure for both splits: evaluate, then append the per-class
    # values to that split's running lists.
    for split_name, gt, preds, metrics in (
        ('training', gt_train[i], preds_train[i], metrics_train),
        ('validation', gt_val[i], preds_val[i], metrics_val),
    ):
        print(f'Running evaluation on {split_name} set...')
        # preds is a (labels, probabilities) pair; only the probabilities are scored
        scores = evaluate(gt, preds[1])
        for name, values in zip(METRIC_NAMES, scores):
            metrics[name].extend(values)

In [None]:
def _add_macro_averages(metrics):
    """Store the mean of every per-class metric list under a ``macro_<name>`` key.

    Each computed value is also printed next to its metric name. Keys that
    already start with ``macro_`` are skipped, so calling this again on the
    same dict does not produce ``macro_macro_*`` entries.
    """
    # Snapshot the keys before inserting: adding keys to a dict while iterating
    # over it raises "RuntimeError: dictionary changed size during iteration".
    per_class_names = [name for name in metrics if not name.startswith('macro_')]
    for name in per_class_names:
        metrics['macro_' + name] = np.array(metrics[name]).mean()
        print(name, metrics['macro_' + name])


print('Calculating macro training metrics...')
_add_macro_averages(metrics_train)

print()

print('Calculating macro validation metrics...')
_add_macro_averages(metrics_val)

## Tracking

In [None]:
import mlflow

from deep.constants import MLFLOW_SERVER

# Point MLflow at the tracking server configured in deep.constants and select
# the experiment that receives this notebook's results.
# NOTE(review): no explicit mlflow.start_run() is visible here — presumably the
# implicit run created by the logging calls is relied on; confirm.
mlflow.set_tracking_uri(MLFLOW_SERVER)
mlflow.set_experiment('fasttext_v0.5_1D')
# Record the hyperparameter dict that was passed to training.
mlflow.log_params(hparams)

# Log only the aggregated entries (keys containing 'macro'), prefixed with
# 'train_'; the per-class lists stored under the other keys are skipped.
# NOTE(review): only metrics_train is logged in the visible part of this cell —
# TODO confirm whether metrics_val is logged elsewhere.
for metric in metrics_train:
    if 'macro' in metric:
        mlflow.log_metric(f'train_{metric}', metrics_train[metric])