In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from fastai import *
from fastai.text import *
from fastai.text.data import *
from fastai.text.all import *

In [49]:
data_path = '../data/final_data.csv'
data = pd.read_csv(data_path)

In [50]:
learning_data = data.drop(data.iloc[ : , 2 : ].loc[ : , data.iloc[ : , 2 : ].sum() < 100].columns, axis=1)
# learning_data = data.drop(data.iloc[ : , 2 : ].columns[data.iloc[ : , 2 : ].apply(lambda col: col.sum() < 100)], axis=1)

In [51]:
train, val = train_test_split(learning_data, test_size = 0.1, random_state=42)

In [64]:
cols_to_label_train = train.columns[2:]
cols_to_label_val = val.columns[2:]

# Labelling function for training
def get_labels(row):
    indices = np.where(row == 1)[0]
    if len(indices) == 0:
        return "empty"
    return "/".join([cols_to_label_train[i] for i in indices])
    
labels_train = train[cols_to_label_train].apply(lambda row: get_labels(row), axis = 1)

train["labels"] = labels_train

labels_val = val[cols_to_label_val].apply(lambda row: get_labels(row), axis = 1)

val['labels'] = labels_val

In [58]:
dls_blk = DataBlock(blocks = (TextBlock.from_df(text_cols = "abstract", seq_len = 100),
                              MultiCategoryBlock),
                    # Unless res_col_name in from_df is overridden, after tokenizing text will always be in 'text' column regardless of where TextBlock is pointing to
                    get_x = ColReader(cols = "text"),
                    get_y = ColReader(cols = "labels", label_delim = "/"),
                    splitter = TrainTestSplitter(test_size = 0.2, random_state = 42))

In [59]:
dls_clf = dls_blk.dataloaders(train, bs = 32, seed = 42)

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck
Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


In [61]:
learn_clf = text_classifier_learner(dls_clf, 
                                    # Specify a model architecture for the learner
                                    AWD_LSTM, 
                                    # Specify the % in dropout layer for regularization
                                    drop_mult=0.5,
                                    # Specify a metric to evaluate performance while training
                                    metrics = accuracy_multi).to_fp16()

In [62]:
# Training a model with 4 epochs and learning rate of 0.005

learn_clf.fine_tune(3, 5e-3, freeze_epochs=1, lr_mult=100)

epoch,train_loss,valid_loss,accuracy_multi,time
0,0.087356,0.082189,0.973078,05:18


epoch,train_loss,valid_loss,accuracy_multi,time
0,0.079777,0.074573,0.975281,12:26
1,0.073641,0.070716,0.975767,12:42
2,0.072162,0.069366,0.976182,12:49


In [148]:
learn_clf.export('../models/abstract_classifier.pkl')

In [141]:
# Setting threshold to 50% confidence

learn_clf.loss_func.thresh = 0.5

In [None]:
preds = val.apply(lambda x: learn_clf.predict(x['abstract'])[0], axis=1)

In [143]:
preds_formatted = preds.to_frame(name='preds')['preds'].apply(lambda x: '/'.join(x))
results = val[['title', 'abstract', 'labels']]
results = results.join(preds_formatted)

In [144]:
def accuracy(labels, preds):
    labels_list = labels.split('/')
    preds_list = preds.split('/')
    s = set(labels_list).intersection(preds_list)
    return (int(len(s) > 0) * 0.8) + (0.05 * (len(s) - 1))

results['accuracy'] = results.apply(lambda x: accuracy(x['labels'], x['preds']), axis=1)

In [145]:
total_accuracy = results['accuracy'].mean()
total_accuracy

0.7889629270656653

In [146]:
import plotly.express as px

In [147]:
fig = px.histogram(results, x="accuracy", nbins=100)
fig.show()