# 1. Generate NER annotations from the 2nd annotation campaign


In [None]:
import spacy
import pandas as pd
import openpyxl
import os
from PyPDF2 import PdfReader
import glob

In [None]:
# Load the annotation workbook of the 2nd campaign and split the usable rows
# according to where the covariate was extracted from (full text, table, figure).
file_path = './data/quantitative_value_with_context_CC.xlsx'


def _matches_any(frame, column, accepted):
    """Return a boolean mask of the rows whose `column` equals one of `accepted`.

    The comparison is case-insensitive on BOTH sides: the cell values and the
    accepted values are lowercased before being compared. Cells that are not
    strings (e.g. empty cells) never match.
    """
    wanted = {value.lower() for value in accepted}
    return frame[column].str.lower().isin(wanted)


# Read the "missing value" worksheet
missing_value_df = pd.read_excel(file_path, sheet_name='missing value')

# Read the 'quantitative_value_with context' worksheet
quantitative_value_df = pd.read_excel(file_path, sheet_name='quantitative_value_with context')
# quantitative_value_df = quantitative_value_df.rename(columns={'Relevant context for MOOD data extraction': 'context'})

# Remember which worksheet each row comes from
missing_value_df['source'] = 'missing_value'
quantitative_value_df['source'] = 'quantitative_value_with_context'

# Drop the 3 rows (index 53 to 55) that hold one of Claudia's comments instead of data:
#   "Missing values not found by the script. Articles: MB7, MB8, CC6-15-17-23-32"
missing_value_df = missing_value_df.drop(index=range(53, 56))

# Only the quantitative worksheet is used below; the concatenation with the
# "missing value" worksheet is kept here as a disabled alternative.
# df = pd.concat([missing_value_df, quantitative_value_df], ignore_index=True)
df = quantitative_value_df

relevance_column = "Relevant context for MOOD data extraction"
origin_column = "Mood extraction from Table/Figure"

usable_covariates = df[_matches_any(df, relevance_column, ["yes"])]
# usable_covariates = pd.concat([missing_value_df, usable_covariates])

# The accepted values are lowercased by _matches_any, so a value such as
# "Figure caption" is matched as well (comparing a lowercased column against
# the capitalised spelling could never succeed).
full_text_annotation = usable_covariates[_matches_any(usable_covariates, origin_column, ["no"])]
table_annotations = usable_covariates[_matches_any(usable_covariates, origin_column, ["table", "table and caption"])]
figure_annotations = usable_covariates[_matches_any(usable_covariates, origin_column, ["figure", "figure caption"])]

In [None]:
# Display the distinct values found in the "Mood extraction from Table/Figure"
# column of the usable rows.
usable_covariates["Mood extraction from Table/Figure"].unique()

In [None]:
# The training annotations are the full-text subset (full_text_annotation);
# table_annotations and figure_annotations are not used here.
annotation_for_training = full_text_annotation

# annotation_for_training

In [None]:
from transformers import AutoTokenizer
import re

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def _merge_wordpieces(pieces):
    """Rebuild whole words from WordPiece output.

    A piece starting with "##" is a continuation and is appended (without the
    "##" prefix) to the previous word; any other piece starts a new word.
    """
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words


def _covariate_words(raw_labels):
    """Return the set of words that make up the annotated covariates.

    `raw_labels` is the comma-separated content of the covariate cell. Each
    label goes through the same tokenizer as the sentence, so both sides share
    the same lowercasing and punctuation splitting. Pieces without any letter
    or digit (commas, brackets, hyphens...) are ignored, otherwise every such
    punctuation mark of the sentence would be tagged. A cell that is not a
    string (e.g. an empty Excel cell read as NaN) yields an empty set.
    """
    if not isinstance(raw_labels, str):
        return set()
    words = set()
    for label in raw_labels.split(","):
        for word in _merge_wordpieces(tokenizer.tokenize(label.strip(' '))):
            if any(char.isalnum() for char in word):
                words.add(word)
    return words


token_list = []
label_list = []
ner_tags_list = []
for _, row in annotation_for_training.iterrows():
    # Sentence splitting is disabled: the whole context is one training sample.
    # sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", texts)
    sentence = row["context"]
    sentence_words = _merge_wordpieces(tokenizer.tokenize(sentence))
    target_words = _covariate_words(row["non-standardized covariate in the context"])

    # A word is tagged only when it is exactly one of the covariate words.
    # A substring test would also tag short words such as "a" or "in" that
    # merely occur inside a covariate name.
    ner_tags = [1 if word in target_words else 0 for word in sentence_words]
    labels = ["covariate" if tag else "O" for tag in ner_tags]

    token_list.append(sentence_words)
    label_list.append(labels)
    ner_tags_list.append(ner_tags)

print(token_list)
print(label_list)
print(ner_tags_list)

In [None]:
from datasets import Dataset

# Build a Dataset with one row per annotated context and three columns:
# "tokens" (words), "ner_tags" (0/1 integers) and "labels" ("O"/"covariate").
dataset = Dataset.from_dict({"tokens": token_list, "ner_tags": ner_tags_list, "labels": label_list})
dataset

In [None]:
# Spot-check one sample: print the tokens of row 10 and, under the heading
# "Label", its integer ner_tags.
i = 10
print(f'Token: {dataset[i]["tokens"]} \nLabel: {dataset[i]["ner_tags"]}')

In [None]:
# Split into train/test with 40% of the rows in the test split; shuffle=False
# keeps the rows in their original order instead of sampling them randomly.
splited_size = 0.4
dataset_dict = dataset.train_test_split(test_size=splited_size, shuffle=False)

# Persist the split on disk, then display it.
dataset_dict.save_to_disk("./data/annotation_generated_from_xlsx/annotation.dataset")
dataset_dict

# 2. Generate new annotations using ChatGPT

In [None]:
import spacy
import pandas as pd
import openpyxl
import os
from PyPDF2 import PdfReader
import glob
from datasets import Dataset

In [None]:
import openai
from getpass import getpass

# Read the OpenAI API key from a hidden interactive prompt (getpass) instead of
# hard-coding it in the notebook.
openai.api_key = getpass("OpenAI API Key: ")

In [None]:
prompt = f"""
For a Name Entity Recognition task, I want to generate other training data based on those 2 examples bellow. 
The main objectif is to find, in scientific articles, risk factors (that we call 'covariate') that impact the spreading of disease

Example 1:
sentence: [{annotation_for_training["context"].iloc[0]}]
Covariate: [{annotation_for_training["non-standardized covariate in the context"].iloc[0]}]

Example 2:
sentence: [{annotation_for_training["context"].iloc[3]}]
Covariate: [{annotation_for_training["non-standardized covariate in the context"].iloc[3]}]

I need you to generate completly 20 news sentences in a python list format and another list which contains the exact names of covariates.
Don't give explaination, only return python3 code containing two line: 1rst sentence_list then 2nd covariate_list, and nothing else. Don't give name to list. Don't indent
"""

response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo-0301",  # Specify your chat model name
  messages=[{"role": "system", "content": "You are chatBot that provide 2 lists"},
            {"role": "user", "content": prompt}],
  # max_tokens=200,
  n=10  # Number of samples to generate
)

In [None]:
import ast

# Raw text of every completion returned by the chat API.
generated_texts = [message["message"]["content"] for message in response["choices"]]


def _merge_generated_wordpieces(pieces):
    """Rebuild whole words from WordPiece output ("##xyz" continues the previous word)."""
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words


def _generated_covariate_words(covariate):
    """Return the set of words of the covariate(s) attached to one sentence.

    `covariate` may be a single string (possibly comma-separated) or a
    list/tuple of strings. Each name goes through the same tokenizer as the
    sentences, so both sides share the same lowercasing and punctuation
    splitting. Pieces without any letter or digit are ignored. Anything else
    yields an empty set.
    """
    if isinstance(covariate, str):
        names = [covariate]
    elif isinstance(covariate, (list, tuple)):
        names = [name for name in covariate if isinstance(name, str)]
    else:
        names = []
    words = set()
    for name in names:
        for word in _merge_generated_wordpieces(tokenizer.tokenize(name)):
            if any(char.isalnum() for char in word):
                words.add(word)
    return words


list_sentences = []
list_covariates = []

for text in generated_texts:
    parts = text.split("\n\n")
    try:
        # The completion is untrusted model output: ast.literal_eval only
        # accepts Python literals, so it cannot execute code the way eval can.
        sentences = ast.literal_eval(parts[0].strip())
        covariates = ast.literal_eval(parts[1].strip())
    except (IndexError, ValueError, SyntaxError, TypeError) as err:
        print(f"Skipping unparsable completion ({type(err).__name__}): {text[:80]!r}")
        continue

    well_formed = (
        isinstance(sentences, (list, tuple))
        and isinstance(covariates, (list, tuple))
        and all(isinstance(sentence, str) for sentence in sentences)
    )
    # Sentence j is tagged with covariate j, so a completion whose two lists
    # have different lengths would shift the pairing of every later sentence.
    if not well_formed or len(sentences) != len(covariates):
        print(f"Skipping completion with mismatched lists: {text[:80]!r}")
        continue

    list_sentences.extend(sentences)
    list_covariates.extend(covariates)

list_ner_tags = []
list_tokens = []
list_labels = []
for sentence, covariate in zip(list_sentences, list_covariates):
    sentence_words = _merge_generated_wordpieces(tokenizer.tokenize(sentence))
    target_words = _generated_covariate_words(covariate)

    # Whole-word, case-insensitive match (a substring test would also tag
    # short words that merely occur inside a covariate name).
    ner_tags = [1 if word in target_words else 0 for word in sentence_words]
    labels = ["covariate" if tag else "O" for tag in ner_tags]

    list_tokens.append(sentence_words)
    list_ner_tags.append(ner_tags)
    list_labels.append(labels)

In [None]:
# For each list, print its length followed by its content, one list per line.
# print(f"{len(list_covariates)}: {list_covariates}")
print(
    *(f"{len(column)}: {column}" for column in (list_tokens, list_ner_tags, list_labels)),
    sep="\n",
)

In [None]:
# Wrap the generated samples in a Dataset with the columns "tokens",
# "ner_tags" and "labels", then display it.
dataset_chatgpt = Dataset.from_dict({"tokens": list_tokens, "ner_tags": list_ner_tags, "labels": list_labels})
dataset_chatgpt

In [None]:
from datasets import concatenate_datasets

# Append the generated samples to the train split only; the test split is not
# modified. NOTE: dataset_dict["train"] is overwritten in place, so re-running
# this cell appends the generated samples one more time.
dataset_dict["train"] = concatenate_datasets([dataset_dict["train"], (dataset_chatgpt)])
# Save the augmented split under its own directory.
dataset_dict.save_to_disk("./data/annotation_generated_from_xlsx/annotation_chatgpt_augmented.dataset")

dataset_dict

In [None]:
# Display the raw completions returned by the chat API, for manual inspection.
generated_texts