In [1]:
# Author: Guangya Wan
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support,accuracy_score

In [2]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="pt")

In [3]:
task='sentiment'
MODEL = "cffl/bert-base-styleclassification-subjective-neutral"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
OUTPUT_DIR = 'model/'
model = AutoModelForSequenceClassification.from_pretrained(OUTPUT_DIR, num_labels=2)

In [4]:
test_args = TrainingArguments(
    output_dir = OUTPUT_DIR,
    do_train = False,
    do_predict = True,
    dataloader_drop_last = False    
)
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
# init trainer
trainer = Trainer(
              model = model, 
              args = test_args, 
              compute_metrics = compute_metrics)

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
torch.version.cuda

'11.7'

# Test Data Set

In [6]:

test_dataset = Dataset.from_pandas(test)
tokenized_test = test_dataset.map(tokenize_function, batched=True)
# small_eval_dataset = tokenized_test.shuffle(seed=42)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
outputs = trainer.predict(tokenized_test,)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, text. If index, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 54
  Batch size = 8


In [8]:
outputs

PredictionOutput(predictions=array([[-0.256801  ,  0.85258865],
       [-2.1723344 ,  2.3142848 ],
       [-1.701721  ,  2.4667428 ],
       [-2.128658  ,  2.2271502 ],
       [ 2.7089434 , -2.6564345 ],
       [-0.17499343,  0.79968244],
       [ 2.3784063 , -2.934221  ],
       [ 2.2547934 , -2.0893302 ],
       [-1.1163455 ,  1.6364365 ],
       [ 1.3339628 , -1.4635692 ],
       [-2.0258543 ,  2.3486369 ],
       [-0.28248513,  0.28417462],
       [ 0.7916322 , -0.7153313 ],
       [ 1.4161899 , -0.9917636 ],
       [-0.31521732,  1.3464507 ],
       [ 0.8659477 , -0.90648746],
       [ 1.7215713 , -1.4797647 ],
       [-1.5413129 ,  1.6693183 ],
       [ 2.740786  , -2.8851473 ],
       [-2.1785998 ,  2.6935947 ],
       [ 2.075655  , -1.8945365 ],
       [-2.0238085 ,  2.4038649 ],
       [-2.1874907 ,  2.2107763 ],
       [ 1.8919607 , -1.7716042 ],
       [ 1.1168371 , -0.9992223 ],
       [-2.151534  ,  2.1534872 ],
       [ 1.2912964 , -1.2051965 ],
       [-2.002926  ,  2.35

In [9]:
from scipy.special import softmax

probabilities = softmax(outputs.predictions, axis=1)

In [10]:
test['probabilities'] = probabilities[:,1]

In [11]:
test.to_csv('test_predictions.csv',index = False)

In [12]:
test['pred'] = (test.probabilities > 0.5).astype(int)

In [13]:
accuracy_score(test['labels'],test['pred'])

1.0

In [14]:
probabilities[test['labels'] == 1]

array([[0.24798466, 0.7520152 ],
       [0.01113329, 0.98886675],
       [0.01524016, 0.9847598 ],
       [0.01266949, 0.98733056],
       [0.2739495 , 0.7260505 ],
       [0.05992973, 0.94007033],
       [0.0124379 , 0.98756194],
       [0.3620079 , 0.637992  ],
       [0.1595382 , 0.8404618 ],
       [0.03876761, 0.9612324 ],
       [0.00759837, 0.99240154],
       [0.01180131, 0.98819864],
       [0.01214922, 0.98785096],
       [0.01332076, 0.98667926],
       [0.01268208, 0.98731774],
       [0.00826896, 0.991731  ],
       [0.01696438, 0.9830357 ],
       [0.05021793, 0.949782  ],
       [0.02517183, 0.9748281 ],
       [0.03723641, 0.96276355],
       [0.01495438, 0.98504555],
       [0.02474443, 0.9752556 ],
       [0.04368299, 0.956317  ],
       [0.01775393, 0.9822459 ],
       [0.07600822, 0.92399186]], dtype=float32)

In [11]:
probabilities[test['labels'] == 0]

array([[0.99095595, 0.0090441 ],
       [0.9935286 , 0.00647133],
       [0.9622714 , 0.03772854],
       [0.8420813 , 0.15791874],
       [0.25179487, 0.74820507],
       [0.64627016, 0.35373   ],
       [0.42599162, 0.57400835],
       [0.8294193 , 0.1705807 ],
       [0.9943116 , 0.00568833],
       [0.91429234, 0.08570758],
       [0.84626204, 0.153738  ],
       [0.7084335 , 0.29156643],
       [0.5827314 , 0.41726863],
       [0.949606  , 0.05039398],
       [0.9932672 , 0.00673287],
       [0.19329505, 0.80670494],
       [0.9829319 , 0.01706796],
       [0.46155322, 0.5384468 ],
       [0.8949669 , 0.10503314],
       [0.87242573, 0.12757427],
       [0.7009775 , 0.29902247],
       [0.43779004, 0.5622099 ],
       [0.6472107 , 0.35278934],
       [0.987899  , 0.01210117],
       [0.8006129 , 0.19938701],
       [0.58786684, 0.4121332 ],
       [0.9413115 , 0.0586886 ],
       [0.957323  , 0.04267703],
       [0.91697055, 0.08302935]], dtype=float32)

In [12]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_score= probabilities[:,1],y_true= test['labels'])

0.9144827586206896

# Unlabelled Data Set

In [5]:
import numpy as np
eval = pd.read_csv('processed_patients.csv')
# eval['id'] = eval['id'].astype(str)

  eval = pd.read_csv('processed_patients.csv')


In [23]:
import re
def cleanup_text(texts):
    text = texts
    if (len(texts) < 20):
        return np.nan
    # Lower case
    text = text.lower()
    # remove newline
    text = re.sub(r'\n', ' ', text)    # remove multiple spaces
    text = re.sub(r' +', ' ', text)
    return text
eval = eval.dropna() # Drop na first
eval = eval.rename(columns={"TEXT": "text"}) # Whatever the text column is convert name to text
eval['text'] = eval['text'].apply(cleanup_text)

In [24]:
eval = eval.dropna() # Drop na which means drop sentence less than 20 char

In [25]:
eval_dataset = Dataset.from_pandas(eval['text'].to_frame().sample(100))

In [17]:
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
tokenized_eval

Dataset({
    features: ['text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [18]:
outputs = trainer.predict(tokenized_eval,)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 8


In [19]:
outputs

PredictionOutput(predictions=array([[ 2.955892  , -3.521742  ],
       [ 3.3117762 , -3.0167885 ],
       [ 2.7260418 , -2.5441046 ],
       [ 0.65205806, -1.1281118 ],
       [-0.7974271 ,  1.4403678 ],
       [ 1.7743489 , -1.5260421 ],
       [-0.51125383,  0.9007026 ],
       [-1.8720144 ,  1.8484101 ],
       [-1.4168795 ,  1.2892256 ],
       [ 2.6220958 , -3.0318375 ],
       [ 3.0215437 , -3.44546   ],
       [ 2.6685863 , -3.1301293 ],
       [ 2.868391  , -3.481456  ],
       [ 1.3689501 , -0.6396107 ],
       [ 2.482686  , -2.1361067 ],
       [ 2.687508  , -2.6987927 ],
       [ 2.7971416 , -3.1717882 ],
       [ 2.718708  , -2.8110213 ],
       [ 0.7240331 , -0.5909944 ],
       [ 1.6643157 , -1.7611349 ],
       [ 2.9263136 , -3.6171637 ],
       [ 1.7664194 , -2.007171  ],
       [ 2.3827415 , -2.6525214 ],
       [ 3.1204548 , -2.8637335 ],
       [ 3.3703945 , -3.096777  ],
       [ 2.2599864 , -2.1405942 ],
       [ 2.5588884 , -2.4083033 ],
       [ 3.243876  , -3.36

In [20]:
from scipy.special import softmax

probabilities = softmax(outputs.predictions, axis=1)[:,1]

In [22]:
probabilities

array([0.00153508, 0.00178141, 0.00511654, 0.14428216, 0.9035925 ,
       0.03555778, 0.80407435, 0.9763492 , 0.937386  , 0.00349148,
       0.00155146, 0.00302228, 0.00174397, 0.11830702, 0.00976834,
       0.00455801, 0.00255046, 0.00395139, 0.21164675, 0.03150947,
       0.00143741, 0.0224537 , 0.00646245, 0.00251193, 0.0015512 ,
       0.01212148, 0.00691453, 0.00133988, 0.9035925 , 0.84288746,
       0.0396218 , 0.00636354, 0.11830702, 0.00654696, 0.9151365 ,
       0.009054  , 0.01772281, 0.00170369, 0.00193408, 0.00712054,
       0.00244738, 0.73542535, 0.00172854, 0.08501375, 0.00799193,
       0.40275657, 0.49516192, 0.9653341 , 0.9035925 , 0.0151027 ,
       0.002262  , 0.01056018, 0.00566363, 0.04711702, 0.9257666 ,
       0.18240276, 0.02214801, 0.21963249, 0.00142398, 0.0010926 ,
       0.00264703, 0.00699447, 0.5761477 , 0.00541086, 0.09284773,
       0.10019439, 0.09715731, 0.08360453, 0.97096723, 0.04136482,
       0.00427534, 0.02461707, 0.00202606, 0.02033904, 0.02097

In [21]:
eval['subjective_prob'] = probabilities

ValueError: Length of values (100) does not match length of index (2428629)

In [16]:
eval.to_csv('result.csv',index = False)

In [17]:
eval

Unnamed: 0,id,text,subjective_prob
284032,24509.0,the mitral valve leaflets are structurally nor...,0.001372
246211,21298,this flattens the dorsal thecal sac and produc...,0.004339
13506,1095,impression: 1) decreased size of the third ven...,0.930260
69192,5933,is npo and housestaff discussed plan to tube f...,0.993198
77868,6659,takes mdis poorly as well. bronch done this af...,0.696344
...,...,...,...
432991,41446.0,there is no pericardial effusion microbiology:...,0.254323
274753,23726.0,there is mild regional systolic dysfunction wi...,0.004399
148239,12708,ns 250cc bolus ordered. and awaiting response....,0.347187
174471,15013,coronary artery disease: the patient had clean...,0.027851


: 