In [1]:
import numpy as np 
import pandas as pd
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

In [2]:
torch.__version__
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
dataset = pd.read_csv('DepressionDataset/depression_dataset_reddit_cleaned.csv')
dataset.rename(columns = {'is_depression':'label'}, inplace = True)

In [4]:
train_roberta = dataset.sample(frac=0.75)
validation_roberta = dataset.drop(train_roberta.index)
train_roberta['label'].value_counts()

0    2925
1    2873
Name: label, dtype: int64

In [5]:
validation_roberta['label'].value_counts()

0    975
1    958
Name: label, dtype: int64

In [6]:
print(f'ROBERTA Train Size: {train_roberta.shape}')
print(f'ROBERTA Validation Size: {validation_roberta.shape}')

ROBERTA Train Size: (5798, 2)
ROBERTA Validation Size: (1933, 2)


In [7]:
train = Dataset.from_pandas(train_roberta, preserve_index=False)
validation = Dataset.from_pandas(validation_roberta, preserve_index=False)
print(train)
print(validation)

Dataset({
    features: ['clean_text', 'label'],
    num_rows: 5798
})
Dataset({
    features: ['clean_text', 'label'],
    num_rows: 1933
})


In [8]:
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

def tokenize_function(example):
    return tokenizer(example["clean_text"], padding="max_length", truncation=True)

tokenized_train_dataset = train.map(tokenize_function, batched=True)
tokenized_test_dataset = validation.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
tokenized_train_dataset


Dataset({
    features: ['clean_text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 5798
})

In [10]:
tokenized_test_dataset

Dataset({
    features: ['clean_text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1933
})

In [11]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")   # default arguments for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)  # overwriting MLM roberta-base for sequence binary classification

def compute_metrics(eval_preds):   # compute accuracy and f1-score
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(   # specifying trainer class
    model,
    training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()  # starts fine-tuning

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2011,0.119897,0.973616,0.972742
2,0.1202,0.101254,0.978272,0.977707
3,0.0311,0.085106,0.984997,0.984761


Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json
Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: clean_text. If clean_text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1933
  Batch size = 8
  metric = load_metric("glue", "mrpc")
Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json
Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config

TrainOutput(global_step=2175, training_loss=0.09843295766019274, metrics={'train_runtime': 1931.8047, 'train_samples_per_second': 9.004, 'train_steps_per_second': 1.126, 'total_flos': 4576553696931840.0, 'train_loss': 0.09843295766019274, 'epoch': 3.0})