In [7]:
import os
import time
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate


id2label = {
    0: "O",
    1: "B-name",
    2: "I-name",
    3: "B-redshift",
    4: "I-redshift",
    5: "B-RA",
    6: "I-RA",
    7: "B-DEC",
    8: "I-DEC",
    9: "B-Type",
    10: "I-Type",
}
label2id = {"O": 0,
          "B-name": 1,
          "I-name": 2,
          "B-redshift": 3,
          "I-redshift": 4,
          "B-RA": 5,
          "I-RA": 6,
          "B-DEC": 7,
          "I-DEC": 8,
          "B-Type": 9,
          "I-Type": 10,
         }


def get_html_text(f,plength=100):
    with open(f) as file:
        soup = BeautifulSoup(file, 'html.parser')
        
    #to inspect html and identify the class label
    #print(soup.prettify()) 
    sections = soup.find_all('div', class_="article-text")

    # Extracting all paragraphs in the section
    paragraphs = soup.find_all('p')
    text = ''
    for i, para in enumerate(paragraphs):
        p = para.get_text()
        if (len(p)>plength) and (p[0].isalpha()):
            text+=p
            #print(f"Paragraph {i+1}:", p)
            #print('--------------')
    #text = re.sub(r'[^a-zA-Z0-9 .,]', '', text)#.lower()
    return text
    

def format_pred_for_print(pred, paragraph, conf = 0.99):
    '''
    returns a pretty string with the predictions in paragraph highlighted.
    pred: prediction output from a pipeline
    paragraph: the original text the predictions were made on
    '''
    
    RED_START = '\x1b[31m'
    RED_END = '\x1b[0m'
    
    formatted_string=''
    end=0
    
    for entry in pred:
        if entry['score']>conf:
            start = entry['start']
            # add what's in between
            formatted_string += paragraph[end:start]
            # add the entry
            end = entry['end']
            label = entry['entity']
            score = ' {:.2f}'.format(entry['score'])
            formatted_string+= RED_START+'['+paragraph[start:end]+' ('+label+score+')]'+RED_END
        
    formatted_string+= paragraph[end:]
    return(formatted_string)

def extract_galaxy_names(sentence, predictions, confidence_level):
    galaxy_names = []
    current_name = ""
    current_score = 0.0

    for prediction in predictions:
        entity = prediction['entity']
        word = prediction['word']
        score = prediction['score']

        if entity == 'B-name':
            # Check if the current name meets the confidence level and add it to the list
            if current_name and current_score >= confidence_level:
                galaxy_names.append(current_name)

            # Start a new galaxy name and reset current score
            current_name = word
            current_score = score

        elif entity == 'I-name' and current_name:
            # Continue building the current galaxy name
            current_name += word

    # Add the last found name if it meets the confidence level
    if current_name and current_score >= confidence_level:
        galaxy_names.append(current_name)

    return galaxy_names

def names_in_paper(htmlfilepath):
    texts = get_html_text(htmlfilepath)
    sentences = sent_tokenize(texts)
    galaxy_names = []
    for s in sentences:
        pred = nlpeft(s)
        ex = extract_galaxy_names(s, pred,confidence_level=0.99)
        for e in ex:
            galname = re.sub(r'[^a-zA-Z0-9]', '', e)
            if len(galname)>1:
                galaxy_names.append(galname)
    return list(set(galaxy_names))


import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning)

peft_model_id = 'NER-BERT-lora-token-classification/nov22//'
config = PeftConfig.from_pretrained(peft_model_id)
tokenizerpeft = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
inference_model = AutoModelForTokenClassification.from_pretrained(config.base_model_name_or_path, num_labels=11, id2label=id2label, label2id=label2id,ignore_mismatched_sizes=True)
modelpeft = PeftModel.from_pretrained(inference_model, peft_model_id)
nlpeft = pipeline("ner", model=modelpeft, tokenizer=tokenizerpeft)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([11]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([11, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForTokenClassification' is not supported for ner. Supported models are ['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BioGptForTokenClassification', 'BloomForTokenClassification', 'CamembertForTokenClassification', 'CanineForTokenClassification', 'ConvBertForTokenClassification', 'Data2VecTextForTokenClassification', 'DebertaForTokenClassification', 'DebertaV2ForTokenClassification', 'DistilBertForTokenClas

In [8]:
names_in_paper('data/htmls/2022ApJ...937....5S.html')

['CrabNebula',
 'FRB201211',
 'FRB20180916B',
 'NGC13131',
 'FRB20190520B',
 'FRB20200120',
 'AXIS',
 'SS433',
 'NGC7793',
 'NGC779',
 'FRB20121102',
 'NGC5408X',
 'M51']

In [6]:
names_in_paper('data/htmls/2022ApJ...937....5S.html')

['FRB201211', 'FRB20190520', 'UX1', 'NGC1313X', 'NGC779', 'NGC5408X', 'M51']

In [None]:
outdir = 'data/lable_locations2/'

count = 0
with open('listofnames.txt', 'w') as file:

    for prepfilename in tqdm(os.listdir(outdir)):
        if count>200:
            break
        prepfilepath = os.path.join(outdir, prepfilename)
        if os.path.isfile(prepfilepath):
            # Read in html sections and tables
            start_time = time.time()  # Time at the start of the iteration

            s = prepfilename.split('.')
            htmldir = 'data/'+s[0][0:4]+'-'+s[0][4:]+'-Vol'+s[3][0:3]+'/HTML/'            
            htmlfilepath = os.path.join(htmldir, prepfilename[0:19]+'.html')
            unique_galnames = names_in_paper(htmlfilepath)
    
            end_time = time.time()  # Time at the end of the iteration
            print(end_time-start_time,'seconds',unique_galnames)
            for wor in unique_galnames:
                file.write(wor + '\n')
    

In [None]:
prepfilename = '2022ApJ...936...10M_A10M.flt'
s = prepfilename.split('.')
htmldir = 'data/'+s[0][0:4]+'-'+s[0][4:]+'-Vol'+s[3][0:3]+'/HTML/'
htmlfilepath = os.path.join(htmldir, prepfilename[0:19]+'.html')
texts = get_html_text(htmlfilepath)
sentences = sent_tokenize(texts)
for s in sentences[30:350]:
    pred = nlpeft(s)
    print(format_pred_for_print(pred,s,conf=0.99))


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/shemmati/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True