In [22]:
import numpy as np
import pandas as pd

In [23]:
# ! pip install transformers datasets
# ! pip install adapter-transformers
# ! pip install scipy sklearn
# ! pip install evaluate

In [24]:
import json
import os
os.environ["WANDB_DISABLED"] = "true"

In [25]:
df = pd.read_json(r'data\rct_sample_train.jsonl', lines=True)
df = df.drop('metadata', axis=1)

In [26]:
df.head()

Unnamed: 0,label,text
0,CONCLUSIONS,Use of the mobile application was greater than...
1,RESULTS,Between-group effect sizes were 0.78 ( P < .00...
2,CONCLUSIONS,Future investigations on the efficacy and safe...
3,RESULTS,GLP-1 and BNP were infused in incremental dose...
4,RESULTS,Women compared with men had higher ischemic st...


In [27]:
df = pd.read_json(r'data\citation.jsonl', lines=True)
df = df.drop('metadata', axis=1)
df.head()

Unnamed: 0,text,label
0,"Thus , over the past few years , along with ad...",Background
1,"This was done by MERT optimization ( Och , 200...",Uses
2,"She evaluates 3,000 German verbs with a token ...",Background
3,The following four components have been identi...,Background
4,Briscoe and Carroll ( 1997 ) report on manuall...,CompareOrContrast


In [28]:
labels_df = df.label.unique()
print(labels_df)

['Background' 'Uses' 'CompareOrContrast' 'Extends' 'Motivation' 'Future']


In [29]:
from datasets import load_dataset
dataset = load_dataset("json", data_files={"train": "data\\rct_sample_train.jsonl", "test": "data\\rct_sample_test.jsonl"})
dataset["train"].features

Using custom data configuration default-70cf2407a86e0260
Reusing dataset json (C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/2 [00:00<?, ?it/s]

{'label': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'metadata': Value(dtype='string', id=None)}

In [30]:
dataset["test"]["label"]

['BACKGROUND',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS',
 'BACKGROUND',
 'BACKGROUND',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'BACKGROUND',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'BACKGROUND',
 'BACKGROUND',
 'BACKGROUND',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS',
 'OBJECTIVE',
 'OBJECTIVE',
 'OBJECTIVE',
 'OBJECTIVE',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS',
 'OBJECTIVE',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'RESULTS',
 'CONCLUSIONS',
 'CONCLUSIONS',
 'OBJECTIVE',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'METHODS',
 'RESULTS',
 'RESULTS',
 'R

In [31]:
dataset['train'] = dataset['train'].class_encode_column("label")

Loading cached processed dataset at C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253\cache-bb6bdc14b6da8d8d.arrow
Loading cached processed dataset at C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253\cache-553db04cf17d6fd7.arrow


In [32]:
dataset['test'] = dataset['test'].class_encode_column("label")

Loading cached processed dataset at C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253\cache-f0832fa7ae2981c1.arrow
Loading cached processed dataset at C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253\cache-8008740f613e261a.arrow


In [33]:
dataset['train'].features

{'label': ClassLabel(num_classes=5, names=['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'], id=None),
 'text': Value(dtype='string', id=None),
 'metadata': Value(dtype='string', id=None)}

In [34]:
dataset = dataset.remove_columns(["metadata"])
dataset["train"][0]

{'label': 1,
 'text': 'Use of the mobile application was greater than in a previous trial and was associated with greater sun protection , especially among women .'}

In [35]:
id2labeldict = {}
for i in range(0, labels_df.size):
    id2labeldict[i] = labels_df[i]

# Pretrained Adapter Training

In [36]:
from transformers import AutoTokenizer

In [37]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [38]:
def preprocess_function(batch):
    tokenized = tokenizer(batch["text"], padding = "max_length", truncation=True)
    return tokenized

In [39]:
tokenized_citation = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at C:\Users\Sudeshna_Dash\.cache\huggingface\datasets\json\default-70cf2407a86e0260\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253\cache-ee82c7f3f8c1c9c7.arrow


  0%|          | 0/31 [00:00<?, ?ba/s]

In [40]:
tokenized_citation["train"][0]

{'label': 1,
 'text': 'Use of the mobile application was greater than in a previous trial and was associated with greater sun protection , especially among women .',
 'input_ids': [0,
  34447,
  9,
  5,
  1830,
  2502,
  21,
  2388,
  87,
  11,
  10,
  986,
  1500,
  8,
  21,
  3059,
  19,
  2388,
  3778,
  2591,
  2156,
  941,
  566,
  390,
  479,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,

In [41]:
from transformers import AutoModelWithHeads
model = AutoModelWithHeads.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [42]:
adapter_name = model.load_adapter("AdapterHub/roberta-base-pf-scicite", source="hf")
model.active_adapters = adapter_name

OSError: Windows requires Developer Mode to be activated, or to run Python as an administrator, in order to create symlinks.
In order to activate Developer Mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_accuracy(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {"acc": (preds == p.label_ids).mean()}

In [None]:
from transformers import TrainingArguments, Trainer 
training_args = TrainingArguments(output_dir="./results", learning_rate=2e-5, per_device_train_batch_size=16, per_device_eval_batch_size=16, num_train_epochs=10, weight_decay=0.01,evaluation_strategy="epoch")

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_citation["train"],
    eval_dataset=tokenized_citation["test"],
    compute_metrics=compute_accuracy
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=training_args.device.index)
print(classifier("These results are great for future purpose"))