# Retraining XLinear on MeSH tags used by Wellcome

## 1. How many labels are actually used in training?

In [1]:
import json

# Absolute path to the raw allMeSH 2021 JSON dump; adjust it for your machine.
input_path = "/data/grants_tagger/data/raw/allMeSH_2021.json"


def yield_raw_data(input_path):
    """Lazily yield one parsed article per line of a raw allMeSH JSON dump.

    Assumed layout: the first line opens an outer ``{"articles":[`` envelope
    and every following line holds one JSON object, followed by a ``,``
    separator or (on the last article) the closing ``]}``.

    Args:
        input_path: Path to the raw JSON file (decoded as latin-1).

    Yields:
        The decoded JSON value found at the start of each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line does not start with
            valid JSON.
    """
    decoder = json.JSONDecoder()
    with open(input_path, encoding="latin-1") as f_i:
        f_i.readline()  # skip first line ({"articles":[) which is not valid JSON
        for line in f_i:
            record = line.strip()
            # Tolerate blank lines and an envelope closer sitting on its own line.
            if not record or record == "]}":
                continue
            # raw_decode parses the leading JSON value and ignores whatever
            # follows it, so the trailing "," or "]}" no longer has to be
            # exactly two characters long. The previous line[:-2] slicing
            # failed on a final "...}]}" line that ends with a newline.
            item, _ = decoder.raw_decode(record)
            yield item


# Generator object: nothing is read from disk until it is iterated, and it can
# only be consumed once.
input_data = yield_raw_data(input_path)

### Number of MeSH terms used in the training set

In [2]:
# Collect every "meshMajor" entry of every training article (repeats included;
# a later cell deduplicates them).
# Iterate over a fresh generator rather than the shared one-shot `input_data`,
# so re-running this cell rebuilds the full list instead of silently producing
# an empty one.
mesh_in_training = []

for article in yield_raw_data(input_path):
    mesh_in_training.extend(article["meshMajor"])

In [3]:
# Distinct labels seen in training. Sorted so the list order is reproducible
# across kernel restarts (iteration order of a set of strings is not).
mesh_training_labels = sorted(set(mesh_in_training))

In [4]:
# Number of distinct MeSH labels present in the training data.
len(mesh_training_labels)

29369

## 2. How many labels are used in Wellcome?

In [17]:
# Text file with the MeSH tags used by Wellcome (read line by line below).
wellcome_labels_path = "../data/processed/WT_mesh_tags_used/tags_used.txt"

In [28]:
# One label per line. Strip surrounding whitespace and skip blank lines so that
# stray spaces or a trailing empty line cannot produce labels that never match
# the training vocabulary.
with open(wellcome_labels_path, "r") as fp:
    wellcome_labels = [label.strip() for label in fp if label.strip()]

In [39]:
# Build each set once instead of re-deriving it inside every f-string.
wellcome_label_set = set(wellcome_labels)
training_label_set = set(mesh_training_labels)
shared_labels = wellcome_label_set & training_label_set

print(
    f" there are {len(shared_labels)} labels both in training and used by Wellcome"
)
print(f" {len(wellcome_label_set)} labels are used by Wellcome")

# Unused share of the *training* labels: only labels that are actually part of
# the training vocabulary may count as used, hence the intersection rather
# than the full Wellcome list (the two differ as soon as Wellcome uses a label
# that never occurs in training).
print(
    f" which means {1-len(shared_labels)/len(training_label_set)} aren't used"
)

 there are 25252 labels both in training and used by Wellcome
 25252 labels are used by Wellcome
 which means 0.14018182437263782 aren't used


## 3. Create a CSV with the terms we would like to keep

In [41]:
# No earlier cell visible in this notebook imports pandas, so import it here to
# keep the cell runnable on a fresh kernel.
import pandas as pd

# Single-column CSV of the Wellcome labels. to_csv is left at its default of
# also writing the row index as the first (unnamed) column.
pd.DataFrame(wellcome_labels, columns=["DescriptorName"]).to_csv(
    "../data/processed/wt_tags_used.csv"
)

## 4. Train XLinear model (for fast iteration and experimentation)

This notebook trains XLinear models for fast experimentation. To get the toy data, run dvc from the root of the project folder:

`dvc pull -s dvc.yaml:preprocess_bioasq_mesh_toy`

To train on the full dataset, replace `train_mesh2021_toy` and `test_mesh2021_toy` with the file names without the `_toy` suffix (`{train,test}_mesh2021.jsonl`); see the training times in `results`.

In [2]:
# Hyper-parameters passed to train() as its `parameters` argument further down.
parameters = dict(
    ngram_range=(1, 1),
    beam_size=30,
    only_topk=200,
    min_weight_value=0.1,
    max_features=400000,
)

In [1]:
# Post a "training has started" message to the webhook URL held in SLACK_HOOK.
# SLACK_USER and SLACK_HOOK must already be defined (presumably as environment
# variables; confirm).
!curl -X POST -H 'Content-type: application/json' --data "{'text': 'Hi <$SLACK_USER>, training has started'}" $SLACK_HOOK

ok

In [3]:
# Arguments for the full-data run (currently active).
# For a quick run on the toy data, use these values instead:
#     train_data_path='../data/processed/train_mesh2021_toy.jsonl',
#     label_binarizer_path='../models/label_binarizer-toy.pkl',
#     model_path='../models/xlinear-toy'
train_kwargs = {
    "train_data_path": "../data/processed/train_mesh2021.jsonl",
    "label_binarizer_path": "../models/xlinear/label_binarizer.pkl",
    "parameters": parameters,
    "model_path": "../models/xlinear",
}

model, label_binarizer = train(**train_kwargs)

../models/xlinear/label_binarizer.pkl exists. Loading existing
Loading data...
Fitting model
Saving model


In [4]:
# Paths for the full-data evaluation (currently active).
# For the toy data, use these values instead:
#     train_data_path='../data/processed/train_mesh2021_toy.jsonl',
#     test_data_path='../data/processed/test_mesh2021_toy.jsonl',
#     results_path='../results/results_toy.json',
#     full_report_path='../results/full_report_toy.json'
evaluation_paths = {
    "train_data_path": "../data/processed/train_mesh2021.jsonl",
    "test_data_path": "../data/processed/test_mesh2021.jsonl",
    "results_path": "../results/results_20220916.json",
    "full_report_path": "../results/full_report.json",
}

# The fitted model and its label binarizer are passed positionally, exactly as
# returned by train().
results, full_report = evaluate(model, label_binarizer, **evaluation_paths)

Loading data...
Loading data...
Evaluating model


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Overall metrics returned by evaluate().
results

{'threshold': '0.50', 'precision': '0.74', 'recall': '0.41', 'f1': '0.53'}

In [6]:
# Per-label entry of the full report for the "Coronavirus" tag.
full_report["Coronavirus"]

{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 16}

In [40]:
# Post a "training has finished" message to the webhook URL held in SLACK_HOOK.
# SLACK_USER and SLACK_HOOK must already be defined (presumably as environment
# variables; confirm).
!curl -X POST -H 'Content-type: application/json' --data "{'text': 'Hi <$SLACK_USER>, training has finished'}" $SLACK_HOOK

ok

In [4]:
# NOTE(review): this looks like a leftover joke notification sent to the same
# webhook; consider removing the cell before sharing the notebook.
!curl -X POST -H 'Content-type: application/json' --data "{'text': 'Hi <$SLACK_USER>, I think I am developing consciousness'}" $SLACK_HOOK

ok