In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric,load_dataset
import evaluate

In [4]:
id2label = {0: "suicide", 1:"non-suicide"}
label2id = {"suicide":0, "non-suicide":1}

In [5]:
tokeniser = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2,label2id=label2id,id2label=id2label)

Downloading config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [6]:
dataset = load_dataset("vibhorag101/phr_suicide_prediction_dataset")

In [7]:
def tokeniseDataset(dataset):
    return(tokeniser(dataset["text"],padding="max_length",truncation=True))

def convertLabel2ID(dataset):
    dataset['label'] = label2id[dataset['label']]
    return dataset
    
dataset = dataset.map(convertLabel2ID) 
tokenisedDataset = dataset.map(tokeniseDataset,batched=True)

trainTokeniseDataset = tokenisedDataset["train"]
testTokenisedDataset= tokenisedDataset["test"]

In [8]:
def compute_metrics(eval_pred):
    metric_acc = evaluate.load("accuracy")
    metric_rec = evaluate.load("recall")
    metric_pre = evaluate.load("precision")
    metric_f1 = evaluate.load("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = metric_acc.compute(predictions=predictions, references=labels)
    recall = metric_rec.compute(predictions=predictions, references=labels)
    precision = metric_pre.compute(predictions=predictions, references=labels)
    f1 = metric_f1.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}


In [None]:
wandb.login()

In [9]:
training_args = TrainingArguments(
    output_dir="PHR_Suicide_Prediction_Roberta",
    report_to = 'wandb',
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
)

In [10]:
trainer = Trainer(
    model= model,
    args=training_args,
    train_dataset=trainTokeniseDataset,
    eval_dataset=testTokenisedDataset,
    compute_metrics=compute_metrics,
)

In [11]:
# testDataset = testTokenisedDataset.select(range(1000))
# print(testDataset)
predictions = trainer.predict(testTokenisedDataset)
metrics = predictions.metrics
print(metrics)

{'test_loss': 0.16120843589305878, 'test_accuracy': {'accuracy': 0.9659377356458042}, 'test_recall': {'recall': 0.9631481401663722}, 'test_precision': {'precision': 0.9685332871012483}, 'test_f1': {'f1': 0.9658332072698983}, 'test_runtime': 475.1877, 'test_samples_per_second': 97.677, 'test_steps_per_second': 6.105}


In [None]:
trainer.train(resume_from_checkpoint=True)

In [None]:
trainer.save_model("PHR_Suicide_Prediction_Roberta")

In [None]:
trainer.push_to_hub("PHR_Suicide_Prediction_Roberta")
tokeniser.push_to_hub("PHR_Suicide_Prediction_Roberta")

In [None]:
# if above is not working use the moodel push
model.push_to_hub("PHR_Suicide_Prediction_Roberta")
tokeniser.push_to_hub("PHR_Suicide_Prediction_Roberta")