<a href="https://colab.research.google.com/github/vondersam/sdgs_text_classifier/blob/master/experiments/ulmfit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install iterative-stratification

In [0]:
from fastai.text import *
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, hamming_loss, roc_auc_score
from sklearn.metrics import classification_report
import numpy as np

In [0]:
# Mount Google Drive so the project files stored there are reachable from the notebook.
from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

# Both values are plain strings ending in "/" because later cells append file names with `+`.
# They are relative paths, so they resolve against the current working directory.
base_dir = "gdrive/My Drive/fastai-v3/sdgs/"
dataset_dir = f"{base_dir}lstm/"

## Loading the data

In [0]:
# Read the labelled corpus, then turn each "|"-separated label string into a list of ints.
source_path = Path(dataset_dir + 'cleanup_labelled.csv')
df = pd.read_csv(source_path)
df['labels'] = (
    df['labels']
    .str.split('|')
    .apply(lambda parts: list(map(int, parts)))
)
df.head()

We first build a language-model dataset from the unlabelled data. It is later used to fine-tune the language model pretrained on WikiText-103, so that the model has more information about the SDGs.

In [0]:
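# One-off step, kept for reference: build the language-model DataBunch from the unlabelled CSV (uncomment to rebuild).
#data_lm = TextLMDataBunch.from_csv(dataset_dir, 'cleanup_unlabelled.csv')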

In [0]:
# One-off steps, kept for reference: preview a batch and export the DataBunch to disk.
#data_lm.show_batch()
#data_lm.save('data_lm_export.pkl')
# Reload the previously exported language-model DataBunch instead of rebuilding it.
path = Path(dataset_dir)
data_lm = load_data(path, 'data_lm_export.pkl')

# Training the general language model

We first train only the head of the language model on all the data, starting from a language model pretrained on WikiText-103. The pretrained weights come with the AWD_LSTM architecture.

In [0]:
# Build the language-model learner on `data_lm` with the AWD_LSTM architecture.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)

In [0]:
# Run the learning-rate finder and plot the recorded curve (skip_end=15 trims the end of the plot).
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [0]:
# Train for one one-cycle epoch at learning rate 1e-2, before the model is unfrozen further down.
learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))

In [0]:
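# Checkpoint the run above under the name 'fit_head' (the next cell loads it); uncomment to re-save.
#learn.save('fit_head')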

In [0]:
# Restore the learner weights stored under the name 'fit_head'.
learn.load('fit_head')

In [0]:
# Unfreeze the whole model and train for 10 one-cycle epochs at learning rate 1e-3.
learn.unfreeze()
learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))

In [0]:
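# Save the fine-tuned encoder under the name 'fine_tuned_encoder' (the next cell loads it); uncomment to re-save.
#learn.save_encoder('fine_tuned_encoder')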

In [0]:
# Load the encoder weights stored under the name 'fine_tuned_encoder' into the language model.
learn.load_encoder('fine_tuned_encoder')

In [0]:
# Sanity check: let the language model continue a prompt for 20 words.
learn.predict("African countries like", n_words=20)

# Training the classifier

Then we load the fine-tuned language model encoder into the classifier.

In [0]:
def train_classifier(path_, train, test, lm, kfold, process='train', encoder_name='ft_enc'):
  """Build the multi-label text classifier for one fold, then train it or reload it.

  Args:
    path_: directory passed to `TextClasDataBunch.from_df`.
    train: DataFrame with a 'text' column and a 'labels' column ('|'-separated labels).
    test: DataFrame with the same columns, used as the validation set.
    lm: language-model DataBunch; the vocab of its training set is reused for the classifier.
    kfold: fold identifier, used to name the checkpoint `general_model_{kfold}`.
    process: 'train' runs the training schedule and saves the checkpoint;
      any other value loads the existing checkpoint instead.
    encoder_name: name of the saved encoder to load before training. Defaults to
      'ft_enc' (the value that used to be hard-coded). Note that the language-model
      cells of this notebook use the name 'fine_tuned_encoder'; pass that name
      if it is the encoder you want.

  Returns:
    The classifier learner, for both the 'train' and the load path.
  """
  data_clas = TextClasDataBunch.from_df(path_, train_df=train, valid_df=test, vocab=lm.train_ds.vocab, text_cols='text', label_cols='labels', label_delim='|', bs=32)

  # Metrics thresholded at 0.2.
  acc_02 = partial(accuracy_thresh, thresh=0.2)
  f_score = partial(fbeta, thresh=0.2)

  # MultiLabelFbeta callbacks with macro, micro and weighted averaging.
  F1macro = partial(MultiLabelFbeta, average="macro")
  F1micro = partial(MultiLabelFbeta, average="micro")
  F1weighted = partial(MultiLabelFbeta, average="weighted")

  learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[acc_02, f_score], callback_fns=[F1macro, F1micro, F1weighted])
  learn.load_encoder(encoder_name)
  learn.freeze()
  current_file = f'general_model_{kfold}'

  if process == 'train':
    # First: train with the model frozen.
    learn.fit_one_cycle(1, 3e-2, moms=(0.8,0.7))

    # Second: unfreeze down to the last two layer groups.
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2), moms=(0.8, 0.7))

    # Third: unfreeze down to the last three layer groups.
    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3), moms=(0.8, 0.7))

    # Fourth: train the whole model.
    learn.unfreeze()
    learn.fit_one_cycle(10, slice(1e-3/(2.6**4),1e-3), moms=(0.8, 0.7))

    # Fifth: six more epochs (the model is already fully unfrozen at this point).
    learn.unfreeze()
    learn.fit_one_cycle(6, slice(1e-3/(2.6**4),1e-3), moms=(0.8, 0.7))
    learn.save(current_file)
  else:
    learn.load(current_file)

  # Return the learner on every path; previously the 'train' path returned None.
  return learn

In [0]:
def merge_dataset(x_train, x_test, y_train, y_test):
  """Rebuild ('text', 'labels') DataFrames for the train and test parts of a fold.

  The binarized label rows are mapped back to their original labels with the
  module-level `mlb` and joined with '|' into a single string per sample.

  Returns:
    A `(train, test)` tuple of DataFrames with columns 'text' and 'labels'.
  """
  def _to_frame(texts, binarized):
    # One single-element row per sample so the array stacks next to `texts` column-wise.
    joined = np.array([
      ["|".join(str(label) for label in labels)]
      for labels in mlb.inverse_transform(binarized)
    ])
    frame = pd.DataFrame(np.hstack((texts, joined)))
    frame.columns = ['text', 'labels']
    return frame

  train = _to_frame(x_train, y_train)
  test = _to_frame(x_test, y_test)
  return train, test

In [0]:
# Shared state for the cross-validation run: `mlb` is also read by merge_dataset,
# and `models` collects one learner per fold.
mlb = MultiLabelBinarizer()
mskf = MultilabelStratifiedKFold(n_splits=10, random_state=0)
path = Path(dataset_dir)

x = df[['text']].values  # text, kept 2-D (one column)
y = mlb.fit_transform(df.labels)  # binarized labels
models = []
count = 0

# process='load' reloads the checkpoint saved for each fold number, so the folds
# produced here need to match the ones used when those checkpoints were trained.
for count, (train_index, test_index) in enumerate(mskf.split(x, y), start=1):
  print(f"Fold no. {count}")
  x_train, y_train = x[train_index], y[train_index]
  x_test, y_test = x[test_index], y[test_index]
  train_df, test_df = merge_dataset(x_train, x_test, y_train, y_test)
  model = train_classifier(path, train_df, test_df, data_lm, kfold=count, process='load')
  models.append(model)

In [0]:
def metrics_avg(models, thres=0.3):
  """Average evaluation metrics over a list of trained learners.

  For every learner, predictions from `get_preds()` are binarized with
  `y_pred > thres` and scored with a classification report, the Hamming loss
  and the micro-averaged ROC AUC.

  Args:
    models: non-empty list of learners exposing `get_preds()`.
    thres: threshold applied to the predictions.

  Returns:
    A tuple `(report, hamming, roc)`: the element-wise mean of the
    classification-report DataFrames, the mean Hamming loss and the mean ROC AUC.
  """
  labels_ = list(range(1,18))

  def calc(model):
    # Score one learner on the predictions/targets returned by get_preds().
    y_pred, y_true = model.get_preds()
    y_true = y_true.numpy()
    y_pred = y_pred.numpy()
    y_hat = y_pred > thres
    metrics = classification_report(y_true, y_hat, target_names=labels_, output_dict=True)
    metrics_df = pd.DataFrame.from_dict(metrics)
    # The `labels` argument is omitted: newer scikit-learn releases removed it, and for
    # indicator matrices the loss is normalised by the number of label columns either way.
    h = hamming_loss(y_true, y_hat)
    # NOTE(review): the AUC is computed on thresholded predictions rather than on the
    # raw scores; confirm this is intended.
    roc = roc_auc_score(y_true, y_hat, average='micro')
    return metrics_df, h, roc

  metrics_agg, ham, roc = calc(models[0])
  n = len(models)
  for model in models[1:]:
    # Score the current model (this used to re-score models[0] on every iteration).
    metrics, h, r = calc(model)
    metrics_agg += metrics
    ham += h
    roc += r

  return metrics_agg/n, ham/n, roc/n

In [0]:
# Average the metrics over all fold models: (report DataFrame, Hamming loss, ROC AUC).
averaged_results = metrics_avg(models)

In [0]:
# Third element of the tuple returned by metrics_avg: the averaged micro ROC AUC.
averaged_results[2]