In [19]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding, EarlyStoppingCallback
from pathlib import Path
import numpy as np
import evaluate
import torch
from pynvml import *
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report
import random
import argparse
import sys
import shutil
import pandas as pd

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()


def tokenize_function(examples):
    return tokenizer(examples['sentence'], truncation=True, max_length=128)


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    scmetrics.add_batch(predictions=predictions, references=labels)
    return scmetrics.compute()


def create_labels(sentiment):
    labels = []
    for s in sentiment:
        if s == 'neutral':
            labels += [0]
        elif s == 'negative':
            labels += [1]
        else:
            labels += [2]
    return labels

In [20]:
# parser = argparse.ArgumentParser(description='Sentence classification task')
# parser.add_argument('--model', help='Path to pt model and tokenizer')
# config = parser.parse_args(sys.argv[1:])
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}" #minerva: download model from hugging face and put in folder, update to path

# set seed
random.seed(42)
np.random.seed(42)

# Create task Dataset from annotated samples
sentences = pd.read_csv('sentiment_language.csv', header=0)
sentences = sentences[['Language', "Alissa's label"]]
dataset = Dataset.from_pandas(sentences).rename_columns({'Language': 'sentence', "Alissa's label": 'sentiment'})
dataset = dataset.add_column('label', create_labels(dataset['sentiment']))
label_dt = dataset.train_test_split(0.2)

print(label_dt)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tkn_dt = label_dt.map(tokenize_function, batched=True, num_proc=4) # batched tokenizing activated
# tkn_dt = tkn_dt.remove_columns(['']) # at some point we might need to delete sentiment column or else get an error

# data loader = allows us to use a chunk of the data at a time while training (or else computer crashes)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # read the sentences and pad them to create equal length vectors

DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'label'],
        num_rows: 88
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'label'],
        num_rows: 22
    })
})


Map (num_proc=4):   0%|          | 0/88 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/22 [00:00<?, ? examples/s]

In [7]:
# set seed
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=3)
if torch.cuda.is_available():
    model.to('cuda') # put the model on the gpu once, and then add a batch everytime when doing a training or evaluation loop
    print_gpu_utilization()

In [12]:
# Hyperparameters (for best configuration selection)
# 1st evaluate the hyper parameters once at a time, and select each hyper parameter that gives you the best score on results aka F1 vs recall
params = {
    'batch_size': [2], # [2, 4, 8],
    'epochs': [2], # [1, 2, 5],
    'learning_rate': [2e-5], # [5e-6, 1e-5, 2e-5, 5e-5, 1e-4],
    'weight_decay': [0.01, 0.1], # [0, 0.01, 0.1], # how much the weight change is shrinking
    'warmup_ratio': [0, 0.1], # [0, 0.01, 0.1] # ratio of examples it takes to get ready for the learning rate
}

metrics_file = f'classification_metrics_1run.csv'
if os.path.isfile(metrics_file):
    f = open(metrics_file, 'a')
else:
    f = open(metrics_file, 'w')
    f.write('batch_size,epochs,learning_rate,weight_decay,warmup_ratio,loss,f1,precision,recall\n')

best_model = []
# best_precision = 0.0
best_f1 = 0.0
tmp_trainer, tmp_comb = None, None
for comb in list(ParameterGrid(params)):
    print(f"Parameters: {comb}")
    training_args = TrainingArguments(
        output_dir=f'runs',
        evaluation_strategy='epoch', # every epoch the model is evaluated and checkpoint is made saving the weights
        eval_steps=1, # check, probably each step is by epoch
        logging_strategy='epoch',
        weight_decay=comb['weight_decay'],
        warmup_ratio=comb['warmup_ratio'],
        num_train_epochs=comb['epochs'],
        learning_rate=comb['learning_rate'],
        per_device_train_batch_size=comb['batch_size'],
        per_device_eval_batch_size=comb['batch_size'],
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_f1',
        seed=42)
    scmetrics = evaluate.load("scmetrics")

    trainer = Trainer(model=model,
                      args=training_args,
                      callbacks=[EarlyStoppingCallback(early_stopping_patience=10)], # if loss isnt decreasing for 2 epochs then it stops training
                      train_dataset=tkn_dt['train'],
                      eval_dataset=tkn_dt['test'],
                      compute_metrics=compute_metrics,
                      data_collator=data_collator)
    results = trainer.train()
    results_eval = trainer.evaluate()

    v = [comb['batch_size'], comb['epochs'], comb['learning_rate'], comb['weight_decay'], comb['warmup_ratio'],
    results.metrics['train_loss'], results_eval['eval_f1'], results_eval['eval_precision'], results_eval['eval_recall']]
    f.write(','.join([str(el) for el in v]) + '\n')

    if results_eval['eval_f1'] > best_f1:
        best_f1 = results_eval['eval_f1']
        tmp_trainer = trainer
        tmp_comb = comb
    print('-' * 100)
    print('\n\n')

# Error analysis step
labels_to_sen = {0: 'neutral', 1: 'negative', 2: 'positive'}
if tmp_trainer is not None:
    best_trainer = tmp_trainer
    best_comb = tmp_comb
    print(f'Best parameters configuration: {best_comb}')
    dev_pred = best_trainer.predict(tkn_dt['test'])
    pred = np.argmax(dev_pred.predictions, axis=-1)
    pred_score = np.max(torch.nn.functional.softmax(torch.tensor(dev_pred.predictions), dim=-1).numpy(), axis=-1)
    i = 0
    errors = {'FP': [], 'FN': []}
    for pred_lab, true_lab in zip(pred, dev_pred.label_ids):
        if pred_lab != true_lab:
            if pred_lab > 1:
                errors['FP'].append((
                    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])),
                    pred_score[i], labels_to_sen[pred_lab], labels_to_sen[true_lab]))
            else:
                errors['FN'].append((tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])), pred_score[i],
                                     labels_to_sen[pred_lab], labels_to_sen[true_lab]))
        i += 1
    with open(f'error_analysis_v2.tsv',
              'w') as f:
        f.write('sentence\tpredicted_label\ttrue_label\tprobability\n')
        for k, vect in errors.items():
            if k == 'FP':
                for sen in vect:
                    f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
                        sen[1]) + '\n')
                f.write('\n')
            else:
                for sen in vect:
                    f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
                        sen[1]) + '\n')
    test_pred = best_trainer.predict(tkn_dt['test'])
    print(test_pred.metrics)

    model_dir = f'runs'
    for d in os.listdir(model_dir):
        # This removes the checkpoints (comment it if you want to keep them)
        if 'checkpoint' in d:
            shutil.rmtree(os.path.join(model_dir, d))
    best_trainer.save_model(
        output_dir=f'best_model')
else:
    print("Precision is 0.0 change something in your model's configuration and retry.")
f.close()

#calculate F1 score for each group of labeled sentences i.e. 0 vs 1 vs 2

Parameters: {'batch_size': 2, 'epochs': 2, 'learning_rate': 2e-05, 'warmup_ratio': 0, 'weight_decay': 0.01}


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.5662,0.701039,0.849084,0.816774,0.884615
2,0.6205,0.601459,0.849084,0.816774,0.884615


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.89      0.94      0.91        17
           2       0.88      1.00      0.93         7

    accuracy                           0.88        26
   macro avg       0.59      0.65      0.62        26
weighted avg       0.82      0.88      0.85        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.89      0.94      0.91        17
           2       0.88      1.00      0.93         7

    accuracy                           0.88        26
   macro avg       0.59      0.65      0.62        26
weighted avg       0.82      0.88      0.85        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.89      0.94      0.91        17
           2       0.88      1.00      0.93         7

    accuracy                           0.88        26
   macro avg       0.59      0.65      0.62        26
weighted avg       0.82      0.88      0.85        26

----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 2, 'epochs': 2, 'learning_rate': 2e-05, 'warmup_ratio': 0, 'weight_decay': 0.1}


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3283,0.8088,0.850962,0.824786,0.884615
2,0.2193,0.739572,0.829983,0.822382,0.846154


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      0.94      0.94        17
           2       0.78      1.00      0.88         7

    accuracy                           0.88        26
   macro avg       0.57      0.65      0.61        26
weighted avg       0.82      0.88      0.85        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      0.88      0.91        17
           2       0.78      1.00      0.88         7

    accuracy                           0.85        26
   macro avg       0.57      0.63      0.59        26
weighted avg       0.82      0.85      0.83        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      0.94      0.94        17
           2       0.78      1.00      0.88         7

    accuracy                           0.88        26
   macro avg       0.57      0.65      0.61        26
weighted avg       0.82      0.88      0.85        26

----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 2, 'epochs': 2, 'learning_rate': 2e-05, 'warmup_ratio': 0.1, 'weight_decay': 0.01}


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.1654,1.032144,0.828205,0.8125,0.846154
2,0.0426,0.948628,0.829983,0.822382,0.846154


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.88      0.88        17
           2       0.88      1.00      0.93         7

    accuracy                           0.85        26
   macro avg       0.59      0.63      0.61        26
weighted avg       0.81      0.85      0.83        26

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      0.88      0.91        17
           2       0.78      1.00      0.88         7

    accuracy                           0.85        26
   macro avg       0.57      0.63      0.59        26
weighted avg       0.82      0.85      0.83        26



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      0.88      0.91        17
           2       0.78      1.00      0.88         7

    accuracy                           0.85        26
   macro avg       0.57      0.63      0.59        26
weighted avg       0.82      0.85      0.83        26

----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 2, 'epochs': 2, 'learning_rate': 2e-05, 'warmup_ratio': 0.1, 'weight_decay': 0.1}




Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0731,1.378657,0.828205,0.8125,0.846154
2,0.117,1.119966,0.824009,0.841346,0.807692


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.88      0.88        17
           2       0.88      1.00      0.93         7

    accuracy                           0.85        26
   macro avg       0.59      0.63      0.61        26
weighted avg       0.81      0.85      0.83        26

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.82      0.85        17
           2       1.00      1.00      1.00         7

    accuracy                           0.81        26
   macro avg       0.62      0.61      0.62        26
weighted avg       0.84      0.81      0.82        26



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.88      0.88        17
           2       0.88      1.00      0.93         7

    accuracy                           0.85        26
   macro avg       0.59      0.63      0.61        26
weighted avg       0.81      0.85      0.83        26

----------------------------------------------------------------------------------------------------



Best parameters configuration: {'batch_size': 2, 'epochs': 2, 'learning_rate': 2e-05, 'warmup_ratio': 0, 'weight_decay': 0.1}
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.88      0.88        17
           2       0.88      1.00      0.93         7

    accuracy                           0.85        26
   macro avg       0.59      0.63      0.61        26
weighted avg       0.81      0.85      0.83        26



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.88      0.88      0.88        17
           2       0.88      1.00      0.93         7

    accuracy                           0.85        26
   macro avg       0.59      0.63      0.61        26
weighted avg       0.81      0.85      0.83        26

{'test_loss': 1.3786574602127075, 'test_f1': 0.8282051282051281, 'test_precision': 0.8125, 'test_recall': 0.8461538461538461, 'test_runtime': 2.1223, 'test_samples_per_second': 12.251, 'test_steps_per_second': 6.125}


In [13]:
from transformers import pipeline
sentiment_task = pipeline("sentiment-analysis",
                          model = AutoModelForSequenceClassification.from_pretrained("best_model", num_labels=3),
                          tokenizer = AutoTokenizer.from_pretrained(MODEL))

df = pd.read_csv('sentiment_language.csv')
data = list(df['Language'].astype(str))
sentiment_results = sentiment_task(data)
df_results = pd.DataFrame(sentiment_results)
final = pd.concat([df, df_results], axis = 1)
print(final)

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


           Words                                           Language  \
0        Adamant  When told that her urine was positive for coca...   
1    Cooperative  Pt was calm and cooperated with nursing care o...   
2      Compliant  She says that the patient has been compliant w...   
3       Agitated  states he was always quite sweet, not agitated...   
4       Agitated  However, information from previous shift is th...   
..           ...                                                ...   
123          NaN  She has not been taking iron because it makes ...   
124          NaN  She is a song writer and also sings. She has a...   
125          NaN  She enjoys walking with her fiance and her dog...   
126          NaN  He does not want to add a medication so I will...   
127          NaN  She stated that even if it was positive, she w...   

        Note ID        MRN Alissa's label    label     score  
0    67625917.0  1796281.0       negative  LABEL_1  0.904331  
1    57757871.0  6268

In [14]:
final.to_csv('sentiment_results_park.csv', encoding = 'utf-8-sig') 

In [17]:
note1 = "ED Psych Progress Note  Received sign out from day ED team. Pt with h/o bipolar disorder was BIB GM after pt repeatedly tried to run away from home, has not been compliant with treatment and has decompensated psychiatrically..  Has appeared oddly related with pressured speech and delusional thought content.  Tonight pt repeatedly asked to be seen, said she really thinks that she is pregnant and believes that she is ovulating, asking for pre-natal vitamin.  She continues to have pressured speech and at times talked about suing multiple people, and said she has proved to the supreme court that she is not psychiatrically ill.  She is fidgety but does not have any psychomotor agitation currently.    Pt is in need of admission but no adolescent bed available tonight.  Will continue to observe pt in ED and admit vs transfer in AM pending bed availability."
note2 = "ED Psych Progress Note  Pt slept overnight. Per Dr. Han's note pt was not expressing any SI or HI and admitted she wanted simply to sleep. Pt abusing crack cocaine and said she had not slept in 2 days. Pt awoken this AM, given food which she threw on the floor. Pt refusing to get dressed, stating she would leave ""on (her own time."" Shouting and cursing at resident MD and security staff. Pt requiring security to re-direct her to get dressed. Pt finally agreeing to get dressed and is escorted out of the ER with security.    Impression: Cocaine intoxication and dependence. No acute suicidal or homicidal ideas and would not benefit from inpatient psychiatric admission and is not seeking admission.    Plan: Discharge. Will provide with list of referral for walk-in clinics, shelters and substance treatment programs."
note1_split = note1.split('.')
note1_split = [x for x in note1_split if x != '']
note1_sentiment = sentiment_task(note1_split)
note1_results = pd.DataFrame(note1_sentiment)

note2_split = note2.split('.')
note2_split = [x for x in note2_split if x != '']
note2_sentiment = sentiment_task(note2_split)
note2_results = pd.DataFrame(note2_sentiment)

In [18]:
print(note1_split, '\n', 
      note1_results, '\n',
      note2_split, '\n', 
      note2_results)

['ED Psych Progress Note  Received sign out from day ED team', ' Pt with h/o bipolar disorder was BIB GM after pt repeatedly tried to run away from home, has not been compliant with treatment and has decompensated psychiatrically', '  Has appeared oddly related with pressured speech and delusional thought content', '  Tonight pt repeatedly asked to be seen, said she really thinks that she is pregnant and believes that she is ovulating, asking for pre-natal vitamin', '  She continues to have pressured speech and at times talked about suing multiple people, and said she has proved to the supreme court that she is not psychiatrically ill', '  She is fidgety but does not have any psychomotor agitation currently', '    Pt is in need of admission but no adolescent bed available tonight', '  Will continue to observe pt in ED and admit vs transfer in AM pending bed availability'] 
      label     score
0  LABEL_1  0.999967
1  LABEL_1  0.999970
2  LABEL_1  0.999962
3  LABEL_1  0.999970
4  LABEL