In [11]:
import ast

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Relative locations (as strings with a trailing slash) of the data folders
# used by this notebook; later cells build file names by string concatenation.
Paths = {
    'Data': './../../Data/',
    'Chia_w_scope': './../../Data/chia_with_scope/',
    'Chia_wo_scope': './../../Data/chia_without_scope/',
}

In [3]:
def eval_dataframe(df):
    """Parse the stringified columns of ``df`` back into Python objects.

    The columns ``Text``, ``Group_Entities``, ``Relations``, ``Tokens`` and
    ``Entities`` are expected to hold Python literals serialised as text
    (as produced by writing lists to CSV). Each string cell is parsed with
    ``ast.literal_eval``, which only accepts literals and therefore cannot
    execute code embedded in the file, unlike the built-in ``eval``.

    Cells that are not strings are left untouched, so calling this function
    a second time on an already-parsed frame is a no-op instead of an error.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the five columns parsed.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    literal_columns = ['Text', 'Group_Entities', 'Relations', 'Tokens', 'Entities']

    def _parse_cell(cell):
        # Only text needs parsing; anything else is assumed to be parsed already.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        return cell

    for column in literal_columns:
        df[column] = df[column].apply(_parse_cell)

    return df

def get_tags_and_tokens(df):
    """Flatten the per-row sentences of ``df`` into parallel token/tag lists.

    For every row, the first element of the ``Entities`` cell is iterated as
    a sequence of per-sentence tag lists, and the ``Tokens`` cell is iterated
    as a sequence of per-sentence token lists. Tags that are neither ``'O'``
    nor already prefixed with ``'B-'``/``'I-'`` receive a ``'B-'`` prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with parsed ``Entities`` and ``Tokens`` columns.

    Returns
    -------
    tuple[list, list]
        ``(globalTokens, globalTags)``: one inner list per sentence in each.

    Raises
    ------
    AssertionError
        If, after a row is processed, the most recently added sentence and
        tag list differ in length, or if the total number of sentences and
        tag lists differ at the end.

    NOTE(review): the per-row check only compares the *last* sentence added
    for that row; earlier sentences of the same row are not length-checked.
    """
    bio_prefixes = ('B-', 'I-')
    globalTags, globalTokens = [], []

    for _, row in df.iterrows():
        for sentence_tags in row['Entities'][0]:
            globalTags.append([
                tag if (tag.startswith(bio_prefixes) or tag == 'O') else 'B-' + tag
                for tag in sentence_tags
            ])
        globalTokens.extend(row['Tokens'])
        assert len(globalTokens[-1]) == len(globalTags[-1])

    assert len(globalTokens) == len(globalTags)
    return globalTokens, globalTags


def create_dataframe(globalTokens, globalTags):
    """Build a two-column frame pairing each sentence with its tag list.

    Parameters
    ----------
    globalTokens : list
        One token list per sentence; stored in the ``Sentence`` column.
    globalTags : list
        One tag list per sentence; stored in the ``Tags`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``Tags`` and ``Sentence`` (in that order).
    """
    frame = pd.DataFrame()
    # Columns are assigned one after another, 'Tags' first, so the frame's
    # index is taken from the tag series.
    for column, values in (('Tags', globalTags), ('Sentence', globalTokens)):
        frame[column] = pd.Series(values)
    return frame

In [4]:
# Load the Chia (with scope) export, discard its first column, parse the
# stringified list columns, then flatten everything to sentence level.
df = pd.read_csv(Paths['Data'] + 'Chia_w_scope_data.csv')
df = eval_dataframe(df.drop(columns=df.columns[0]))

globalTokens, globalTags = get_tags_and_tokens(df)
ner_df = create_dataframe(globalTokens, globalTags)

In [5]:
# Display the sentence-level frame: one row per sentence, with its tag list
# ('Tags') and its token list ('Sentence').
ner_df

Unnamed: 0,Tags,Sentence
0,"[B-Person, O, B-Value, I-Value, I-Value, I-Value]","[ages, of, 7, and, 75, years]"
1,"[O, B-Condition, O, O, B-Qualifier, B-Qualifie...","[marked, disability, owing, to, primary, gener..."
2,"[B-Measurement, I-Measurement, O, B-Value, I-V...","[disease, duration, of, at, least, 5, years]"
3,"[B-Temporal, B-Procedure, I-Procedure]","[previous, brain, surgery]"
4,"[B-Condition, I-Condition, B-Value, I-Value, I...","[cognitive, impairment, <, 120, points, on, th..."
...,...,...
12551,"[O, O, O, B-Procedure, I-Procedure, I-Procedur...","[Medically, fit, for, definitive, surgical, ma..."
12552,"[B-Observation, I-Observation, B-Value, I-Valu...","[Life, expectancy, greater, than, one, year]"
12553,"[B-Condition, B-Negation, B-Temporal, I-Tempor...","[Stone, free, after, definitive, surgical, the..."
12554,"[O, O, B-Condition, I-Condition, O, O, O, B-Pr...","[Patients, with, medical, comorbidities, preve..."


In [6]:
# Model identifiers passed to AutoTokenizer.from_pretrained; the vocabulary
# of each one is compared against the dataset tokens in the next cell.
checkpoints = [
    'dmis-lab/biobert-v1.1',
    'bert-base-cased',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
    'fidukm34/biobert_v1.1_pubmed-finetuned-ner-finetuned-ner',
    'sciarrilli/biobert-base-cased-v1.2-finetuned-ner',
    'emilyalsentzer/Bio_ClinicalBERT',
    'bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16',
    'algoprog/mimics-tagging-roberta-base',
    'monologg/biobert_v1.1_pubmed'
]

In [9]:
def _count_oov_by_label(vocab, sentences, tag_sequences):
    """Count, per entity label, the tokens that are not keys of ``vocab``.

    Parameters
    ----------
    vocab : Mapping
        Tokenizer vocabulary; only membership (``token in vocab``) is used.
    sentences : sequence of list[str]
        One token list per sentence.
    tag_sequences : sequence of list[str]
        Tag lists positionally aligned with ``sentences``.

    Returns
    -------
    dict[str, int]
        Entity label (tag with any leading ``'B-'``/``'I-'`` removed) mapped
        to the number of out-of-vocabulary tokens carrying that label.
        Tokens tagged ``'O'`` are not counted. Keys appear in order of first
        occurrence.

    Raises
    ------
    IndexError
        If an out-of-vocabulary token has no tag at the same position.
    """
    counts = {}
    for sentence, tags in tqdm(zip(sentences, tag_sequences), total=len(sentences)):
        for position, token in enumerate(sentence):
            if token in vocab:
                continue
            tag = tags[position]
            if tag == 'O':
                continue
            label = tag[2:] if tag.startswith(('B-', 'I-')) else tag
            counts[label] = counts.get(label, 0) + 1
    return counts


# For every checkpoint, record the vocabulary size and how many entity tokens
# of each label are missing from the tokenizer vocabulary.
# NOTE(review): tokens are looked up verbatim (no lowercasing or sub-word
# splitting is applied here), so this measures whole-token vocabulary
# membership -- confirm that is the intended definition of "out of vocabulary".
Models = {}
for cp in checkpoints:
    print(cp)
    tokenizer = AutoTokenizer.from_pretrained(cp, do_lower_case = True)
    # Fetch the vocabulary once per checkpoint: it is loop-invariant, and some
    # tokenizer classes may rebuild the mapping each time the property is read.
    vocab = tokenizer.vocab
    Models[cp] = {'vocab_size': len(vocab)}
    Models[cp].update(_count_oov_by_label(vocab, ner_df['Sentence'], ner_df['Tags']))

dmis-lab/biobert-v1.1


100%|█████████████████████████████████████| 12556/12556 [20:40<00:00, 10.12it/s]


bert-base-cased


100%|█████████████████████████████████████| 12556/12556 [20:39<00:00, 10.13it/s]


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext


100%|█████████████████████████████████████| 12556/12556 [22:14<00:00,  9.41it/s]


fidukm34/biobert_v1.1_pubmed-finetuned-ner-finetuned-ner


100%|█████████████████████████████████████| 12556/12556 [20:48<00:00, 10.06it/s]


sciarrilli/biobert-base-cased-v1.2-finetuned-ner


100%|█████████████████████████████████████| 12556/12556 [20:48<00:00, 10.05it/s]


emilyalsentzer/Bio_ClinicalBERT


100%|█████████████████████████████████████| 12556/12556 [20:46<00:00, 10.07it/s]


bionlp/bluebert_pubmed_mimic_uncased_L-24_H-1024_A-16


100%|█████████████████████████████████████| 12556/12556 [22:02<00:00,  9.49it/s]


algoprog/mimics-tagging-roberta-base


100%|█████████████████████████████████████| 12556/12556 [53:51<00:00,  3.89it/s]


monologg/biobert_v1.1_pubmed


100%|█████████████████████████████████████| 12556/12556 [21:01<00:00,  9.96it/s]


In [10]:
# Show, per checkpoint, the vocabulary size and the per-label counts of
# entity tokens that were not found in the tokenizer vocabulary.
Models

{'dmis-lab/biobert-v1.1': {'vocab_size': 28996,
  'Qualifier': 1960,
  'Condition': 10373,
  'Procedure': 2803,
  'Measurement': 3324,
  'Observation': 589,
  'Drug': 3560,
  'Value': 2519,
  'Device': 381,
  'Temporal': 679,
  'Negation': 51,
  'Person': 123},
 'bert-base-cased': {'vocab_size': 28996,
  'Qualifier': 1960,
  'Condition': 10373,
  'Procedure': 2803,
  'Measurement': 3324,
  'Observation': 589,
  'Drug': 3560,
  'Value': 2519,
  'Device': 381,
  'Temporal': 679,
  'Negation': 51,
  'Person': 123},
 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext': {'vocab_size': 30522,
  'Measurement': 3420,
  'Condition': 4948,
  'Procedure': 1340,
  'Person': 636,
  'Qualifier': 1471,
  'Drug': 1941,
  'Value': 2584,
  'Observation': 740,
  'Device': 156,
  'Temporal': 692,
  'Negation': 169},
 'fidukm34/biobert_v1.1_pubmed-finetuned-ner-finetuned-ner': {'vocab_size': 28996,
  'Qualifier': 1960,
  'Condition': 10373,
  'Procedure': 2803,
  'Measurement': 3324,
  'Observa

In [12]:
# Same whole-token vocabulary check as for the checkpoints list, run for one
# additional checkpoint and stored in its own dictionary.
Models_Extra = {}
cp = 'dmis-lab/biobert-large-cased-v1.1'
tokenizer = AutoTokenizer.from_pretrained(cp, do_lower_case = True)
# Fetch the vocabulary once: it does not change while counting, and some
# tokenizer classes may rebuild the mapping each time the property is read.
vocab = tokenizer.vocab
Models_Extra[cp] = {'vocab_size': len(vocab)}
oov_counts = Models_Extra[cp]
for s_example, t_example in tqdm(zip(ner_df['Sentence'], ner_df['Tags']), total=len(ner_df)):
    for i, ex in enumerate(s_example):
        # Explicit membership test instead of a bare try/except, so unrelated
        # errors (and keyboard interrupts) are no longer swallowed.
        if ex in vocab:
            continue
        tag = t_example[i]
        if tag == 'O':
            continue
        # Count under the bare entity label, without the B-/I- prefix.
        label = tag[2:] if tag.startswith(('B-', 'I-')) else tag
        oov_counts[label] = oov_counts.get(label, 0) + 1

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

100%|█████████████████████████████████████| 12556/12556 [54:25<00:00,  3.85it/s]


In [13]:
# Show the vocabulary size and per-label out-of-vocabulary counts for the
# additional checkpoint.
Models_Extra

{'dmis-lab/biobert-large-cased-v1.1': {'vocab_size': 58996,
  'Condition': 8589,
  'Procedure': 2260,
  'Measurement': 2657,
  'Observation': 558,
  'Qualifier': 1812,
  'Drug': 3220,
  'Value': 1820,
  'Device': 320,
  'Temporal': 631,
  'Negation': 51,
  'Person': 121}}

In [None]:
{'dmis-lab/biobert-v1.1': {'vocab_size': 28996,
  'Qualifier': 1960,
  'Condition': 10373,
  'Procedure': 2803,
  'Measurement': 3324,
  'Observation': 589,
  'Drug': 3560,
  'Value': 2519,
  'Device': 381,
  'Temporal': 679,
  'Negation': 51,
  'Person': 123},