# PIP INSTALLATION (if needed)

In [None]:
!pip install transformers
!pip install datasets

In [1]:
!nvidia-smi

Fri Feb 18 20:04:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   48C    P8    19W / 200W |      5MiB /  7982MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!export CUDA_VISIBLE_DEVICES=0
!echo $CUDA_VISIBLE_DEVICES

Remember to restart the runtime after pip install to ensure that the packages will correctly import

# Imports

In [26]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import PreTrainedModel
from transformers.pipelines.pt_utils import KeyDataset

from tqdm.auto import tqdm

from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import pandas as pd
import numpy as np
import logging
from glob import glob
from os import path

from IPython.display import HTML, display

import torch

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device)
device

device(type='cuda', index=0)

# Preprocessing Dataset

In [10]:
dataset_path = r"data/feedback_prize/train.csv"
df = pd.read_csv(dataset_path, header=0, encoding= 'unicode_escape')
df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [11]:
df = df.set_index("id")
df = df.rename(columns={"discourse_text": "text"})
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [12]:
df["discourse_type"] = pd.Categorical(df["discourse_type"])
df["label"] = df["discourse_type"].cat.codes
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,4
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,5
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,3
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,3
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,0
...,...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...,3
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...,3
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838,5
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...,3


In [13]:
category_codes = dict(zip(range(df["discourse_type"].cat.categories.size),df["discourse_type"].cat.categories))
category_codes

{0: 'Claim',
 1: 'Concluding Statement',
 2: 'Counterclaim',
 3: 'Evidence',
 4: 'Lead',
 5: 'Position',
 6: 'Rebuttal'}

In [64]:
dataset = Dataset.from_pandas(df[["text", "label"]])

train_test_dataset = dataset.train_test_split(test_size=0.3)
test_validation_dataset = train_test_dataset["test"].train_test_split(test_size=0.333)

train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_validation_dataset['train'],
    'valid': test_validation_dataset['test']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 101005
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 28873
    })
    valid: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 14415
    })
})

# DOWNSTREAM TRAINING FOR SENTENCE CLASSIFICATION

In [14]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 8
num_labels = len(category_codes)

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [67]:
encoded_dataset = train_test_valid_dataset.map(preprocess_function, batched=True)
columns_to_return = ['input_ids', 'label', 'attention_mask']
encoded_dataset.set_format(type='torch', columns=columns_to_return)
encoded_dataset

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 101005
    })
    test: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 28873
    })
    valid: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 14415
    })
})

reference: https://huggingface.co/course/chapter3/3?fw=pt \
Also [kappa score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html).
<img src=https://i.kym-cdn.com/photos/images/newsfeed/000/925/494/218.png_large width=50>

In [58]:
def compute_metrics(eval_preds):
    metric_acc = load_metric("accuracy")
    metric_prec = load_metric("precision")
    metric_recall = load_metric("recall")
    metric_f1 = load_metric("f1")
    
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    
    acc = metric_acc.compute(predictions=predictions, references=labels)
    prec = metric_prec.compute(predictions=predictions, references=labels, average="weighted")
    recall = metric_recall.compute(predictions=predictions, references=labels, average="weighted")
    f1 = metric_f1.compute(predictions=predictions, references=labels, average="weighted")
    kappa = cohen_kappa_score(predictions, labels)

    return {"accuracy": acc['accuracy'], "precision": prec['precision'],
            "recall": recall['recall'], "f1": f1['f1'], "kappa": kappa}

In [59]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model = model.to(device)
model

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/weipyn/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [68]:
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"models_gitignored/{model_name}-finetuned-sentence-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


We can manually calculate initial unretrained metrics by calling the `compute_metrics` function here and passing in the trainer's predicted values.

In [69]:
predictions = trainer.predict(encoded_dataset["valid"])
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Prediction *****
  Num examples = 14415
  Batch size = 8


(14415, 7) (14415,)


  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
compute_metrics([predictions.predictions, predictions.label_ids])

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.1593479014915019,
 'precision': 0.18363621978611755,
 'recall': 0.1593479014915019,
 'f1': 0.12390552827433275,
 'kappa': 0.04815549055498458}

We now begin model retraining.

In [71]:
torch.cuda.empty_cache()

In [72]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running training *****
  Num examples = 101005
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 25252


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Kappa
1,0.6413,0.647028,0.782046,0.782754,0.782046,0.779864,0.708201
2,0.519,0.66382,0.78634,0.785192,0.78634,0.784895,0.714995


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****
  Num examples = 28873
  Batch size = 8
Saving model checkpoint to models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626
Configuration saved in models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626/config.json
Model weights saved in models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626/pytorch_model.bin
tokenizer config file saved in models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626/tokenizer_config.json
Special tokens file saved in models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilB

TrainOutput(global_step=25252, training_loss=0.6392160841955278, metrics={'train_runtime': 4264.5967, 'train_samples_per_second': 47.369, 'train_steps_per_second': 5.921, 'total_flos': 2.658741421348003e+16, 'train_loss': 0.6392160841955278, 'epoch': 2.0})

Nice, the model is trained now. How does it perform on our validation dataset?

In [75]:
predictions = trainer.predict(encoded_dataset["valid"])
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Prediction *****
  Num examples = 14415
  Batch size = 8


KeyboardInterrupt: 

In [None]:
compute_metrics([predictions.predictions, predictions.label_ids])

# Loading a Saved Model
Ok let's select a model to load in from our saved checkpoints.

In [17]:
model_path = r"models_gitignored/distilbert-base-uncased-finetuned-sentence-classification/checkpoint-12626"
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path, id2label=category_codes)
loaded_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

Remember to run the code in [Data Retrieval and Generation](./Data%20Retrieval%20and%20Generation.ipynb) if the dataset is not already there.

In [18]:
articles_dataset = load_dataset("text", data_files={"valid":[r"datagen/nytimes/covid-depression-anxiety.txt"]})['valid']

# filter out empty strings
articles_dataset = articles_dataset.filter(lambda example: example['text'])

articles_dataset[1]

Using custom data configuration default-b1b589714371b9df
Reusing dataset text (C:\Users\weipy\.cache\huggingface\datasets\text\default-b1b589714371b9df\0.0.0\d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\weipy\.cache\huggingface\datasets\text\default-b1b589714371b9df\0.0.0\d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4\cache-3d529bd4e71ccc1f.arrow


{'text': 'But can having Covid itself increase the risk of developing mental health problems? A large new study suggests it can.'}

Define the pipeline using the loaded model.

In [22]:
pipe = pipeline("text-classification", model=loaded_model, tokenizer=tokenizer)
pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x1a8ab1417c0>

In [23]:
i = 0
for out in tqdm(pipe(KeyDataset(articles_dataset, "text"))):
    print(articles_dataset[i]['text'], out)
    i+=1

  0%|          | 0/27 [00:00<?, ?it/s]

Social isolation, economic stress, loss of loved ones and other struggles during the pandemic have contributed to rising mental health issues like anxiety and depression. {'label': 'Claim', 'score': 0.5566884279251099}
But can having Covid itself increase the risk of developing mental health problems? A large new study suggests it can. {'label': 'Rebuttal', 'score': 0.8751872181892395}
The study, published Wednesday in the journal The BMJ, analyzed records of nearly 154,000 Covid patients in the Veterans Health Administration system and compared their experience in the year after they recovered from their initial infection with that of a similar group of people who did not contract the virus. {'label': 'Evidence', 'score': 0.7757354974746704}
The study included only patients who had no mental health diagnoses or treatment for at least two years before becoming infected with the coronavirus, allowing researchers to focus on psychiatric diagnoses and treatment that occurred after coronav

In [27]:
lst = []
for i, out in enumerate(tqdm(pipe(KeyDataset(articles_dataset, "text")))):
    #print(articles_dataset[i]['text'], out)
    lst.append((articles_dataset[i]['text'], out))

def showLegend():
    s = "<h2>Legend</h2><br>"
    for label, color in zip(labels, colors):
        s += "<font style=\"background-color: %s;\">%s</font><br>" % (color, label)
    display(HTML(s+"<br/><br/><br/>"))

showLegend()
    
df = pd.DataFrame([{"text":i, **j} for i, j in lst])

labels = ['Lead', 'Position', 'Evidence', 'Claim', 'Concluding Statement', 'Counterclaim', 'Rebuttal']
colors = ["#ACDDDE", "#CAF1DE", "#E1F8DC", "#FEF8DD", "#FFE7C7", "#F7D8BA", "#D6CDEA"]

s = ""
for txt,lbl in zip(df.text.tolist(),df.label.tolist()):
    
    s += "<font style=\"background-color: %s;\">%s </font>" % (colors[labels.index(lbl)], txt)
display(HTML(s+"<br/><br/><br/>"))

  0%|          | 0/27 [00:00<?, ?it/s]