In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Medical Transcription/mtsamples.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [20]:
df['medical_specialty'].value_counts()

 Surgery                          1103
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        372
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  230
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Obstetrics / Gynecology           160
 Urology                           158
 Discharge Summary                 108
 ENT - Otolaryngology               98
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    62
 Psychiatry / Psychology            53
 Office Notes                       51
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentistry               

In [23]:
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import torch

cuda_available = torch.cuda.is_available()

# get accuracy and prediction matrices
def eval_metrics(actual, pred):
    bal_acc = balanced_accuracy_score(actual,pred)
    f1_sc = f1_score(actual,pred,average="micro")
    return bal_acc, f1_sc

# data preprocess and splitting
def data_loader(filename):
    df = pd.read_csv(filename)
    df.drop(['Unnamed: 0'],axis=1,inplace=True)
    counts = df['medical_specialty'].value_counts()
    # to reduce classes, we only take those classes with more than 20 counts. This number can be varied and it is observed that more the value, better the accuracy
    others = [k for k,v in counts.items() if v<40]
    for each_spec in others:
        df.loc[df['medical_specialty']==each_spec,'medical_specialty']=' others' 
    counts = df['medical_specialty'].value_counts()
    num_classes = len(df['medical_specialty'].unique())
    class_dict = dict(zip(df['medical_specialty'].unique(),list(range(num_classes))))
    df['medical_specialty'] = df['medical_specialty'].apply(lambda x:class_dict[x])
    df.dropna(inplace=True)
    df['transcription'] = df['keywords']+df['transcription'] 
    X = df[['transcription']]
    y = df[["medical_specialty"]]
    train_x,test_x,train_y,test_y = train_test_split(X,y, stratify=y,test_size=0.25)
    class_weights = [1]*num_classes
    return train_x, train_y, test_x, test_y, num_classes, class_weights, class_dict
    
def model_train(num_of_epochs, learning_rate):
    train_x,train_y,test_x,test_y, num_classes, class_weights, class_dict = data_loader("/content/drive/My Drive/Medical Transcription/mtsamples.csv")
    train_df = pd.DataFrame(columns=['transcription','medical_specialty'])
    train_df['transcription'] = train_x['transcription']
    train_df['medical_specialty'] = train_y['medical_specialty']
    test_df = pd.DataFrame(columns=['transcription','medical_specialty'])
    test_df['transcription'] = test_x['transcription']
    test_df['medical_specialty'] = test_y['medical_specialty']
    # learning_rate = 1e-5
    model_args = ClassificationArgs(num_train_epochs=num_of_epochs,learning_rate = learning_rate,  reprocess_input_data= True,save_model_every_epoch=False, overwrite_output_dir= True)
    model = ClassificationModel(
        "roberta",
        "roberta-base",
        num_labels=num_classes,
        weight=class_weights,
        use_cuda=cuda_available,
        args=model_args
        )
    model.train_model(train_df)
    model.save_model()
    result, model_outputs, wrong_predictions = model.eval_model(test_df)
    result,output = model.predict(test_df['transcription'].values.tolist())
    acc, f1 = eval_metrics(test_df['medical_specialty'],result)
    print("Accuracy", acc)  
    print("F1 Score", f1)

In [24]:
model_train(5, 1e-5)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

  0%|          | 0/2923 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/366 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/366 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/366 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/366 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/366 [00:00<?, ?it/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/975 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/975 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

Accuracy 0.9661667817468625
F1 Score 0.9794871794871794
