In [None]:
import zipfile
import spacy
import pandas as pd
import json

In [2]:
input_text = []
annotated = []
words_queue = []
files = set()


def _read_archive_sentences(archive_path):
    """Collect the non-empty lines of every ``.txt`` member of a zip archive.

    Only members whose name ends in ``.txt`` are opened, so directory entries
    and other members (whatever their extension) are never turned into a
    lookup for a file that does not exist in the archive.

    Args:
        archive_path: Path to the zip archive to read.

    Returns:
        A ``(sentences, stems)`` tuple: ``sentences`` is the list of stripped,
        non-empty UTF-8 lines in member order, and ``stems`` is the set of
        member names with the ``.txt`` suffix removed.
    """
    sentences = []
    stems = set()
    with zipfile.ZipFile(archive_path, 'r') as z:
        # Sorted so the sentence order is reproducible across runs
        # (iterating a set of names is not).
        for member in sorted(z.namelist()):
            if not member.endswith(".txt"):
                continue
            stems.add(member[:-len(".txt")])
            with z.open(member) as file:
                for line in file:
                    sentence = line.decode('utf-8').strip()
                    if sentence:
                        sentences.append(sentence)
    return sentences, stems


# Each archive is read on its own: names found in one archive are never
# looked up in the other, which previously raised KeyError whenever the two
# archives did not contain exactly the same file names.
for archive_path in ('MACCROBAT2018.zip', 'MACCROBAT2020.zip'):
    archive_sentences, archive_stems = _read_archive_sentences(archive_path)
    input_text.extend(archive_sentences)
    # `files` keeps the union of text-file stems seen across both archives.
    files.update(archive_stems)

In [3]:
import spacy

# Load spaCy's "en_core_web_sm" English model once at module level;
# generate_detailed_tags reads this `nlp` object as a global.
nlp = spacy.load("en_core_web_sm")

def generate_detailed_tags(sentences, pipeline=None):
    """Tag sentences with tokens, POS tags and BIO-encoded entity tags.

    The sentences are streamed through the pipeline's batched ``pipe`` method
    rather than being processed one call at a time. A progress counter is
    printed for every 1000th sentence (starting with 0).

    Args:
        sentences: Iterable of sentence strings.
        pipeline: Optional spaCy-style language pipeline exposing
            ``pipe(texts)``. When omitted, the module-level ``nlp`` is used.

    Returns:
        A list with one dict per input sentence, in input order, with keys
        ``"sentence"`` (the input string), ``"tokens"`` (token texts),
        ``"pos_tags"`` (one ``pos_`` value per token) and ``"ner_tags"``
        (one BIO tag per token: ``B-<label>`` on the first token of an entity,
        ``I-<label>`` on the remaining tokens, ``"O"`` elsewhere).
    """
    if pipeline is None:
        pipeline = nlp

    # Materialise once so the texts can be paired with their docs even when
    # `sentences` is a one-shot iterator.
    texts = list(sentences)
    detailed_tags = []

    for index, (sentence, doc) in enumerate(zip(texts, pipeline.pipe(texts))):
        if index % 1000 == 0:
            print(index)

        # Every token starts as "O"; entity spans overwrite their positions.
        bio_tags = ["O"] * len(doc)
        for ent in doc.ents:
            # ent.end is exclusive: the first token gets B-, the rest get I-.
            bio_tags[ent.start] = f"B-{ent.label_}"
            for i in range(ent.start + 1, ent.end):
                bio_tags[i] = f"I-{ent.label_}"

        detailed_tags.append({
            "sentence": sentence,
            "tokens": [token.text for token in doc],
            "pos_tags": [token.pos_ for token in doc],
            "ner_tags": bio_tags
        })

    return detailed_tags


In [4]:
# Tag every sentence collected in `input_text` (tokens, POS tags, BIO entity
# tags); a progress counter is printed every 1000 sentences.
detailed_tags_output = generate_detailed_tags(input_text)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [5]:
# Preview the first five tagged sentences, one labelled field per line,
# with a blank line after each entry.
for entry in detailed_tags_output[:5]:
    for label, key in (
        ("Sentence", "sentence"),
        ("Tokens", "tokens"),
        ("POS Tags", "pos_tags"),
        ("NER Tags", "ner_tags"),
    ):
        print(f"{label}: {entry[key]}")
    print()

Sentence: A 46-year-old Caucasian woman with type 2 diabetes mellitus and bipolar disorder presented to our emergency department with vague abdominal symptoms and vomiting.
Tokens: ['A', '46', '-', 'year', '-', 'old', 'Caucasian', 'woman', 'with', 'type', '2', 'diabetes', 'mellitus', 'and', 'bipolar', 'disorder', 'presented', 'to', 'our', 'emergency', 'department', 'with', 'vague', 'abdominal', 'symptoms', 'and', 'vomiting', '.']
POS Tags: ['DET', 'NUM', 'PUNCT', 'NOUN', 'PUNCT', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NUM', 'NOUN', 'ADJ', 'CCONJ', 'ADJ', 'NOUN', 'VERB', 'ADP', 'PRON', 'NOUN', 'NOUN', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'CCONJ', 'NOUN', 'PUNCT']
NER Tags: ['O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'B-NORP', 'O', 'O', 'O', 'B-CARDINAL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Sentence: Her pertinent history includes left below knee amputation and right toes amputation for complications secondary to diabetic neuropathy.
T

In [None]:
# Write the tagged medical sentences to disk as JSON indented by 4 spaces.
output_path = "tagged_medical_sentences.json"
with open(output_path, "w") as f:
    f.write(json.dumps(detailed_tags_output, indent=4))

print(f"Output saved to '{output_path}'")

Output saved to 'tagged_medical_sentences.json'


In [8]:
def load_data(data_path):
    """Read a CSV file, drop incomplete rows, and report the resulting size.

    Args:
        data_path: Path (or other source accepted by ``pd.read_csv``) of the
            CSV file to load.

    Returns:
        The DataFrame with every row containing a missing value removed.
        The row and column counts of that frame are printed as a side effect.
    """
    frame = pd.read_csv(data_path).dropna()
    n_rows, n_cols = frame.shape
    print("Number of rows : ", n_rows, " and the number of columns : ", n_cols)
    return frame

Unzip `general.zip` before running the next cell, which reads `ner.csv`.

In [10]:
# Load the general-domain CSV; load_data drops rows with missing values and
# prints the resulting row/column counts.
data = load_data("ner.csv")

Number of rows :  47959  and the number of columns :  4


In [15]:
# Tag every entry of the "Sentence" column of `data`. This rebinds
# `detailed_tags_output`, replacing the result of the earlier tagging run.
detailed_tags_output = generate_detailed_tags(data["Sentence"].tolist())

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000


In [16]:
# Preview the first five tagged sentences, one labelled field per line,
# with a blank line after each entry.
for entry in detailed_tags_output[:5]:
    for label, key in (
        ("Sentence", "sentence"),
        ("Tokens", "tokens"),
        ("POS Tags", "pos_tags"),
        ("NER Tags", "ner_tags"),
    ):
        print(f"{label}: {entry[key]}")
    print()

Sentence: Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
Tokens: ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
POS Tags: ['NOUN', 'ADP', 'NOUN', 'AUX', 'VERB', 'ADP', 'PROPN', 'PART', 'VERB', 'DET', 'NOUN', 'ADP', 'PROPN', 'CCONJ', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'PUNCT']
NER Tags: ['B-CARDINAL', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'B-NORP', 'O', 'O', 'O', 'O', 'O']

Sentence: Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "
Tokens: ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 

In [17]:
# Write the tagged general-domain sentences to disk as JSON indented by 4 spaces.
output_path = "tagged_general_sentences.json"
with open(output_path, "w") as f:
    f.write(json.dumps(detailed_tags_output, indent=4))

print(f"Output saved to '{output_path}'")

Output saved to 'tagged_general_sentences.json'
