## Step 0: Load Libraries

In [None]:
!pip install openai datasets transformers

Collecting openai
  Downloading openai-1.3.9-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.4/221.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.25.2-py3-none-any.whl (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0

In [None]:
# !pip install openai
!pip install datasets



In [None]:
import json
import os
from getpass import getpass

import openai
from datasets import load_dataset
from transformers import BertModel, BertConfig, BertTokenizer

## Step 1: Setup APIs and Dataset


In [None]:
# Take the key from the environment when it is already set; otherwise prompt
# for it without echoing. The previous `%env OPENAI_API_KEY = {}#include key here`
# line stored the placeholder *and* the trailing comment text as the key, and
# encouraged pasting a secret into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

openai.api_key = os.environ["OPENAI_API_KEY"]

env: OPENAI_API_KEY={}#include key here


In [None]:
# Load the two NER datasets through `datasets.load_dataset`.
conll = load_dataset("conll2003")
wnut = load_dataset("wnut_17")

# Integer class id -> tag string. The position of a tag in each list is its id.
# NOTE(review): assumed to match the label order used by the datasets' own
# `ner_tags` feature -- confirm against the loaded dataset.
conll_idx_ner_tags = dict(enumerate([
  'O',
  'B-PER', 'I-PER',
  'B-ORG', 'I-ORG',
  'B-LOC', 'I-LOC',
  'B-MISC', 'I-MISC',
]))
wnut_idx_ner_tags = dict(enumerate([
  'O',
  'B-corporation', 'I-corporation',
  'B-creative-work', 'I-creative-work',
  'B-group', 'I-group',
  'B-location', 'I-location',
  'B-person', 'I-person',
  'B-product', 'I-product',
]))

In [None]:
def get_input_output(data, idx_ner_tags):
  """Render one example as two space-joined strings: its words and its tags.

  Args:
    data: Mapping with a "tokens" sequence of strings and a parallel
      "ner_tags" sequence of tag ids.
    idx_ner_tags: Mapping from tag id to tag string.

  Returns:
    A `(sentence, tag_sentence)` tuple. `tag_sentence` holds one decoded tag
    per token, in token order.

  Raises:
    IndexError: If "ner_tags" is shorter than "tokens".
    KeyError: If a tag id is missing from `idx_ner_tags`.
  """
  words = data["tokens"]
  tag_ids = data["ner_tags"]

  # Walk token positions (not the tag list) so exactly len(tokens) tags are
  # decoded, whatever the length of "ner_tags".
  labels = []
  for pos in range(len(words)):
    labels.append(idx_ner_tags[tag_ids[pos]])

  return " ".join(words), " ".join(labels)

In [None]:
def generate_json(data, filename, tag_roots, idx_ner_tags):
  """Write a token-tagged dataset to `filename` as one JSON document.

  The document has two keys: "tags" (the `tag_roots` passed in) and
  "sentences", a list with one record per example holding
    - "id": the example's index in `data`,
    - "sentence": its tokens joined by single spaces,
    - "ner_sentence": its decoded tags joined by single spaces,
    - "entities": for every root in `tag_roots`, the list of entity strings
      found for that root, in order of appearance.

  An entity starts at a "B-<root>" tag and extends over the run of
  "I-<root>" tags that immediately follows it. "I-" tags that do not follow
  a matching "B-" tag never start an entity.

  Args:
    data: Indexable, sized collection of examples; each example is a mapping
      with a "tokens" sequence of strings and a parallel "ner_tags" sequence.
    filename: Path of the JSON file to create or overwrite.
    tag_roots: Entity type names without the "B-"/"I-" prefix.
    idx_ner_tags: Mapping from tag id to tag string.
  """
  records = []

  for sent_id in range(len(data)):
    row = data[sent_id]
    words = row["tokens"]
    tag_ids = row["ner_tags"]
    n_words = len(words)

    # Decode each tag id once; the decoded list feeds both the label string
    # and the entity-span search below.
    labels = [idx_ner_tags[tag_ids[pos]] for pos in range(n_words)]

    # One (initially empty) list of entity strings per entity type.
    found = {root: [] for root in tag_roots}

    for start in range(n_words):
      for root in tag_roots:
        if labels[start] != "B-" + root:
          continue

        # Grow the span across the I- tags that directly follow the B- tag.
        inside = "I-" + root
        stop = start + 1
        while stop < n_words and labels[stop] == inside:
          stop += 1

        found[root].append(" ".join(words[start:stop]))

    records.append({
      "id": sent_id,
      "sentence": " ".join(words),
      "ner_sentence": " ".join(labels),
      "entities": found,
    })

  payload = {"tags": tag_roots, "sentences": records}

  with open(filename, "w") as outfile:
    json.dump(payload, outfile)

In [None]:
conll_tag_roots = ["PER", "ORG", "LOC", "MISC"]

# Export every CoNLL split to its own JSON file (same order as before:
# train, test, validation).
for split, out_path in (
  ("train", "conll_train.json"),
  ("test", "conll_test.json"),
  ("validation", "conll_val.json"),
):
  generate_json(conll[split], out_path, conll_tag_roots, conll_idx_ner_tags)

In [None]:
wnut_tag_roots = ["corporation", "creative-work", "group", "location", "person", "product"]

# Export every WNUT split to its own JSON file (same order as before:
# train, test, validation).
for split, out_path in (
  ("train", "wnut_train.json"),
  ("test", "wnut_test.json"),
  ("validation", "wnut_val.json"),
):
  generate_json(wnut[split], out_path, wnut_tag_roots, wnut_idx_ner_tags)

## Step 2: Test Prompting

range(0, 0)

## Step 3: Prompt Tuning / Few-Shot Learning with Prompts

## Step 4 (Optional): Fine-Tuning a Network

## Step 5: Collect Results