# PIP INSTALLATION (if needed)

In [None]:
!pip install transformers
!pip install datasets

In [None]:
!nvidia-smi

In [None]:
!export CUDA_VISIBLE_DEVICES=0
!echo $CUDA_VISIBLE_DEVICES

Remember to restart the runtime after pip install to ensure that the packages will correctly import

# Imports

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split

import pandas as pd
import logging
from glob import glob
from os import path

import torch

In [19]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device)
torch.cuda.is_available()

True

# Preprocessing Dataset

In [2]:
dataset_path = r"../dataset/train.csv"
df = pd.read_csv(dataset_path, header=0, encoding= 'unicode_escape')
df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [3]:
df = df.set_index("id")
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [4]:
df = df.rename(columns={"discourse_text": "text"})
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [5]:
train_path = r"drive/MyDrive/Colab Notebooks/datasets/feedback-prize-2021/train/"
def get_essay(id):
  filepath = path.join(train_path, id + ".txt")
  f = open(filepath, 'r')
  content = f.read()
  f.close()
  return content

# get_essay('423A1CA112E2')

In [6]:
df["discourse_type"] = pd.Categorical(df["discourse_type"])
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [7]:
df["label"] = df["discourse_type"].cat.codes
df

Unnamed: 0_level_0,discourse_id,discourse_start,discourse_end,text,discourse_type,discourse_type_num,predictionstring,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,4
423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59,5
423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75,3
423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...,3
423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...,0
...,...,...,...,...,...,...,...,...
4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...,3
4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...,3
4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seekÂ multiple opinions instea...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838,5
4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to helpÂ you make ...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...,3


In [8]:
category_codes = dict(zip(range(df["discourse_type"].cat.categories.size),df["discourse_type"].cat.categories))
category_codes

{0: 'Claim',
 1: 'Concluding Statement',
 2: 'Counterclaim',
 3: 'Evidence',
 4: 'Lead',
 5: 'Position',
 6: 'Rebuttal'}

Load features and labels into Dataset object, and perform a 3 way train test validation split, with respective 0.7, 0.2, 0.1 split size.

In [9]:
dataset = Dataset.from_pandas(df[["text", "label"]])

In [10]:
train_test_dataset = dataset.train_test_split(test_size=0.3)
test_validation_dataset = train_test_dataset["test"].train_test_split(test_size=0.333)

train_test_valid_dataset = DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_validation_dataset['train'],
    'valid': test_validation_dataset['test']})

train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 101005
    })
    test: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 28873
    })
    valid: Dataset({
        features: ['text', 'label', 'id'],
        num_rows: 14415
    })
})

In [11]:
train_test_valid_dataset["train"][7]

{'text': 'A friend will be able to talk to you and help you get through a class. Students today have a problem being able to express themselfs, when going through something in your life it could help you get through things, students dont always know how to get through the rough things that are going on in life. kids will release built up emotions if someone knows what they are going through because they will know thatyou need a friend to talk to. Sometimes a kids dosent always know how they are feeling so if a computer is helpful to tell them how they feel they will be able to communicate with other people. to keep kids from bring upset or depressed it will help because an older adult will be able to talk to the kid and get them through all the things that they are going through.',
 'label': 3,
 'id': '218B73B47223'}

# DOWNSTREAM TRAINING FOR SENTENCE CLASSIFICATION

In [12]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 4
num_labels = len(category_codes)

Technically this should be data preprocessing, but considering its tokenisation and tokeneisation is part of the training process, why not

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

In [14]:
encoded_dataset = train_test_valid_dataset.map(preprocess_function, batched=True)
columns_to_return = ['input_ids', 'label', 'attention_mask']
encoded_dataset.set_format(type='torch', columns=columns_to_return)
encoded_dataset

  0%|          | 0/102 [00:00<?, ?ba/s]

  0%|          | 0/29 [00:00<?, ?ba/s]

  0%|          | 0/15 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 101005
    })
    test: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 28873
    })
    valid: Dataset({
        features: ['text', 'label', 'id', 'input_ids', 'attention_mask'],
        num_rows: 14415
    })
})

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model = model.to(device)
model

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/weipyn/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [21]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-sentence-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
torch.cuda.empty_cache()

In [23]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running training *****
  Num examples = 101005
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 126260


Epoch,Training Loss,Validation Loss
1,0.7484,0.733832


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: id, text.
***** Running Evaluation *****
  Num examples = 28873
  Batch size = 4
Saving model checkpoint to distilbert-base-uncased-finetuned-sentence-classification/checkpoint-25252
Configuration saved in distilbert-base-uncased-finetuned-sentence-classification/checkpoint-25252/config.json
Model weights saved in distilbert-base-uncased-finetuned-sentence-classification/checkpoint-25252/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-sentence-classification/checkpoint-25252/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-sentence-classification/checkpoint-25252/special_tokens_map.json


KeyError: 'eval_accuracy'

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Flatten, Dense, Dropout, Input

import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
X = np.random.rand(1584, 4)
X

In [None]:
y = np.random.choice(4, size=1584)
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

In [None]:
X_train = tf.convert_to_tensor(X_train)
X_test = tf.convert_to_tensor(X_test)
y_train = tf.convert_to_tensor(y_train)
y_test = tf.convert_to_tensor(y_test)

In [None]:
X_train.shape

In [None]:
model = tf.keras.Sequential([
        Flatten(input_shape=(4,)),
        Dense(128, activation="relu"),
        Dropout(.2),
        Dense(4, activation="softmax")
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=["accuracy"])

In [None]:
model.input_shape

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=3)