**The Specter Project**






**Installing the required libraries**

In [1]:
!pip install transformers datasets rich fastlangid wandb

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
Collecting fastlangid
  Downloading fastlangid-1.0.11-py2.py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m77.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.15.5-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m95.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8

**Mounting google drive**

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


**Changing path to the project repository**

In [3]:
%cd /content/drive/MyDrive/Specter-project/MLP_Classification

/content/drive/MyDrive/Specter-project/MLP_Classification


**Import model and tokenizer**

In [4]:
from commons.utils import *
from transformers import AutoTokenizer, AutoModel

# load SPECTER pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
model = AutoModel.from_pretrained('allenai/specter')

Downloading (…)okenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

**Getting data**

In [5]:
!bash commons/getdata.sh
#!gdown --folder https://drive.google.com/drive/folders/14NAz5m6pOUlWq-kf5nGq7LWWSEhJzqOs -O trainedmodels

Retrieving folder list
Retrieving folder 1AM5sXFHQEFN2Cz3lNoTJq_t_Uvb7mLU4 embeddings
Retrieving folder 19_WhCqV61DTxJG-GTOx2zu7qi4HwLNdg mag
Retrieving folder 1xY5IUMjXbod-SPUnYJyPMxQBpz2UTxQ3 mesh
Retrieving folder 1GMz1szQLgOAl4_HBIvUUtnGtoTKoSUdB mag
Processing file 1CRQug0SxE61X3cAQkTJrTv6NvMjISk6K test.csv
Processing file 1Hv3Vde_tDE9auchWYd05cydRMhN3Pg-T train.csv
Processing file 1R0Vt2f0zXvLOXaM1rxB8kTMgqsaJ7a3t val.csv
Retrieving folder 1s9awdgXW0Jg0dXqyBtG74BemIZWQEtrJ mesh
Processing file 1H7uTFOkoA1D59kov9R1lc6dUpHYzChU4 test.csv
Processing file 1UdSH-FYh31pKZnm92mbQ-QrH1DbKVImE train.csv
Processing file 1MXLZt3eLdXAywHAOrYeDlGTNqcyk8LSg val.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1CRQug0SxE61X3cAQkTJrTv6NvMjISk6K
To: /content/drive/MyDrive/Specter-project/MLP_Classification/data/mag/test.csv
100% 163k/163k [00:00<00:00, 74.7MB/s]
Downloading...
From: https:/

**Loading datasets for experiments**

In [6]:
from commons.data_utils import *
# perform data pre-processing
scidocs = load_metadata()
mag = load_dataset(dataset="mag").join(scidocs, how="inner")
mesh = load_dataset(dataset="mesh").join(scidocs, how="inner")
del scidocs


Retrieving non-english papers: 100%|██████████| 37556/37556 [00:10<00:00, 3691.98it/s]


Total number of papers in SciDocs: 48473
Total number of papers after data removing abstract/title lacking papers: 37556
Total number of papers after data removing non english papers: 37227


**Data tokenization**

In [7]:
# takes ~45 seconds
mesh_hf, mag_hf = [
    tokenize_hf(
        hf = to_hf_dataset(dataset=dataset),
        tokenizer=tokenizer
        )
    for dataset in [mesh, mag]
]

# set torch format for the considered data
mesh_hf.set_format("torch")
mag_hf.set_format("torch")

Casting the dataset:   0%|          | 0/23154 [00:00<?, ? examples/s]

Map:   0%|          | 0/23154 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14083 [00:00<?, ? examples/s]

Map:   0%|          | 0/14083 [00:00<?, ? examples/s]

**Preparing the embeddings**

In [8]:
from commons.model_utils import embed_data

do_embed=True
if do_embed:
    # embedding takes approximately 30dd mins
    mesh_embeddings = embed_data(model=model, data=mesh_hf.remove_columns("labels"))
    mag_embeddings = embed_data(model=model, data=mag_hf.remove_columns("labels"))
else:
    # alternatively, read embeddings from data
    mesh_embeddings = torch.from_numpy(np.loadtxt("data/embeddings/mesh/mesh_embeddings.txt"))
    mag_embeddings = torch.from_numpy(np.loadtxt("data/embeddings/mag/mag_embeddings.txt"))

Obtaining embeddings: 100%|██████████| 724/724 [12:53<00:00,  1.07s/it]
Obtaining embeddings: 100%|██████████| 441/441 [07:49<00:00,  1.07s/it]


**Train-test split**

In [9]:
# four config dictionaries (one per classification head considered)
from commons.utils import mesh_config_1, mesh_config_2, mesh_config_3, mesh_config_2bis
mesh_splits = mesh_hf.train_test_split(test_size=mesh_config_1["test_size"])

**Training the model**

In [10]:
from commons.experiment import Experiment
do_track=False
# instantiate an Experiment, when verbose prints out classification head architecture and number of parameters
ch1 = Experiment(config=mesh_config_1, splits=mesh_splits, dataset=mesh_hf, track=do_track)

Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=11, bias=True)
)
Number of parameters (MESH model): 1.0995e+08


In [11]:
from commons.model_utils import embed_data

In [14]:
!pwd

/content/drive/MyDrive/Specter-project/MLP_Classification


In [15]:
train, test = False, True
if train:
    ch1.perform_training()  # might take some time...
else:
    ch1.load_run()

if test: # tests the given configuration
    ch1.test_model()

Model trainedmodels/MESH_CH1.pth loaded successfully!


100%|██████████| 73/73 [01:13<00:00,  1.01s/it]


Average F1-Score 0.9356





In [16]:
# CH2
ch2 = Experiment(config=mesh_config_2, splits=mesh_splits, dataset=mesh_hf, track=do_track)
if train:
    ch2.perform_training()
else:
    ch2.load_run()
if test:
    ch2.test_model()

# CH3
ch3 = Experiment(config=mesh_config_3, splits=mesh_splits, dataset=mesh_hf, track=do_track)
if train:
    ch3.perform_training()
else:
    ch3.load_run()
if test:
    ch3.test_model()


Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=11, bias=True)
)
Number of parameters (MESH model): 1.0999e+08
Model trainedmodels/MESH_CH2.pth loaded successfully!


100%|██████████| 73/73 [01:15<00:00,  1.03s/it]



Average F1-Score 0.9608
Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=64, bias=True)
  (7): ReLU()
  (8): Linear(in_features=64, out_features=64, bias=True)
  (9): ReLU()
  (10): Linear(in_features=64, out_features=64, bias=True)
  (11): ReLU()
  (12): Linear(in_features=64, out_features=11, bias=True)
)
Number of parameters (MESH model): 1.1001e+08
Model trainedmodels/MESH_CH3.pth loaded successfully!


100%|██████████| 73/73 [01:14<00:00,  1.02s/it]


Average F1-Score 0.0420





In [17]:
from commons.utils import mag_config_1, mag_config_2, mag_config_3, mag_config_2bis
mag_splits = mag_hf.train_test_split(test_size=mag_config_1["test_size"])

In [19]:
train, test, do_track = False, True, False

# CH1
ch1 = Experiment(config=mag_config_1, splits=mag_splits, dataset=mag_hf, track=do_track)
if train:
    ch1.perform_training()
else:
    ch1.load_run()
if test:
    ch1.test_model()

# CH2
ch2 = Experiment(config=mag_config_2, splits=mag_splits, dataset=mag_hf, track=do_track)
if train:
    ch2.perform_training()
else:
    ch2.load_run()
if test:
    ch2.test_model()

# CH3
ch3 = Experiment(config=mag_config_3, splits=mag_splits, dataset=mag_hf, track=do_track)
if train:
    ch3.perform_training()
else:
    ch3.load_run()
if test:
    ch3.test_model()


Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=19, bias=True)
)
Number of parameters (MAG model): 1.0995e+08
Model trainedmodels/MAG_CH1.pth loaded successfully!


100%|██████████| 45/45 [00:47<00:00,  1.06s/it]



Average F1-Score 0.9448
Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=19, bias=True)
)
Number of parameters (MAG model): 1.0999e+08
Model trainedmodels/MAG_CH2.pth loaded successfully!


100%|██████████| 45/45 [00:46<00:00,  1.03s/it]



Average F1-Score 0.9379
Classification head architecture:
Sequential(
  (0): Linear(in_features=768, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=64, bias=True)
  (3): ReLU()
  (4): Linear(in_features=64, out_features=64, bias=True)
  (5): ReLU()
  (6): Linear(in_features=64, out_features=64, bias=True)
  (7): ReLU()
  (8): Linear(in_features=64, out_features=64, bias=True)
  (9): ReLU()
  (10): Linear(in_features=64, out_features=64, bias=True)
  (11): ReLU()
  (12): Linear(in_features=64, out_features=19, bias=True)
)
Number of parameters (MAG model): 1.1001e+08
Model trainedmodels/MAG_CH3.pth loaded successfully!


100%|██████████| 45/45 [00:43<00:00,  1.03it/s]


Average F1-Score 0.0105





In [23]:
!zip -r /content/file.zip /content/drive/MyDrive/Specter-project

updating: content/drive/MyDrive/Specter-project/ (stored 0%)
updating: content/drive/MyDrive/Specter-project/setup.py (deflated 33%)
updating: content/drive/MyDrive/Specter-project/README.md (deflated 60%)
updating: content/drive/MyDrive/Specter-project/Dockerfile.cpu (deflated 43%)
updating: content/drive/MyDrive/Specter-project/.gitignore (deflated 46%)
updating: content/drive/MyDrive/Specter-project/requirements.txt (deflated 25%)
updating: content/drive/MyDrive/Specter-project/Spectroid.pdf (deflated 8%)
updating: content/drive/MyDrive/Specter-project/LICENSE (deflated 65%)
updating: content/drive/MyDrive/Specter-project/data/ (stored 0%)
updating: content/drive/MyDrive/Specter-project/data/sample.ids (deflated 45%)
updating: content/drive/MyDrive/Specter-project/data/sample-metadata.json (deflated 62%)
updating: content/drive/MyDrive/Specter-project/data/scibert_scivocab_uncased/ (stored 0%)
updating: content/drive/MyDrive/Specter-project/data/scibert_scivocab_uncased/scibert.tar.