First, we need to see what format the prepared datasets are in. Each dataset comes as four files:
- devel.tsv
- test.tsv
- train.tsv
- train_dev.tsv
Training will only use the NCBI Disease, 2010 i2b2/VA and BC5CDR disease sets, plus my own records and training sets.

In [1]:
import csv
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
#See what the .tsv file looks like, can remove later
#Other prepared files that can be inspected the same way:
#   ./datasets/NER/NCBI-disease/train_dev.tsv
#   ./datasets/NER/BC5CDR-disease/train.tsv
#   ./datasets/NER/BC5CDR-disease/train_dev.tsv
#   ./datasets/NER/BC5CDR-disease/devel.tsv
#Comparing the row counts of these files was used to find out if they used
#an 80:20 split for training/validation - they did not
test = "./datasets/NER/BC5CDR-disease/test.tsv"

#The file is read as two tab-separated columns without a header row.
# - quoting=csv.QUOTE_NONE: a bare '"' token is data, not the start of a quoted field
# - dtype=str, keep_default_na=False: tokens such as "NA" or "null" stay text instead of NaN,
#   and numeric-looking tokens are not converted to numbers
test_df = pd.read_csv(
    test,
    sep='\t',
    names=["Word", "Label"],
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)

In [3]:
#Display the parsed frame; pandas abbreviates the middle rows of a long frame in the rendered output
test_df

Unnamed: 0,Word,Label
0,Torsade,B
1,de,I
2,pointes,I
3,ventricular,B
4,tachycardia,I
...,...,...
124671,monitored,O
124672,for,O
124673,gingival,B
124674,hyperplasia,I


In [4]:
from datasets import load_dataset
#Load the "conllpp" dataset; the bare name on the last line displays the returned object,
#which lists the available splits with their features and row counts
coNLL = load_dataset("conllpp")
coNLL

Reusing dataset conllpp (/work/wzkariampuzha/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [5]:
#Look at the first training record to see which fields each sentence carries
coNLL["train"][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
#NER_tag '5' is B-LOC, '6' is I-LOC; every other tag is written out as 'O'
LOC_TAG_TO_LABEL = {5: 'B', 6: 'I'}

loc_data = []
for sentence in coNLL["train"]:
    ner_tags = sentence['ner_tags']
    #Skip sentences without any location tag (i.e. keep only meaningfully annotated sentences)
    if not any(tag in LOC_TAG_TO_LABEL for tag in ner_tags):
        continue
    tokens = sentence['tokens']
    #One (word, label) tuple per token, in sentence order
    loc_data.extend(
        (tokens[position], LOC_TAG_TO_LABEL.get(tag, 'O'))
        for position, tag in enumerate(ner_tags)
    )
loc_data[:25]

[('BRUSSELS', 'B'),
 ('1996-08-22', 'O'),
 ('Germany', 'B'),
 ("'s", 'O'),
 ('representative', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('European', 'O'),
 ('Union', 'O'),
 ("'s", 'O'),
 ('veterinary', 'O'),
 ('committee', 'O'),
 ('Werner', 'O'),
 ('Zwingmann', 'O'),
 ('said', 'O'),
 ('on', 'O'),
 ('Wednesday', 'O'),
 ('consumers', 'O'),
 ('should', 'O'),
 ('buy', 'O'),
 ('sheepmeat', 'O'),
 ('from', 'O'),
 ('countries', 'O'),
 ('other', 'O'),
 ('than', 'O'),
 ('Britain', 'B'),
 ('until', 'O'),
 ('the', 'O'),
 ('scientific', 'O'),
 ('advice', 'O'),
 ('was', 'O'),
 ('clearer', 'O'),
 ('.', 'O'),
 ('Fischler', 'O'),
 ('proposed', 'O'),
 ('EU-wide', 'O'),
 ('measures', 'O'),
 ('after', 'O'),
 ('reports', 'O'),
 ('from', 'O'),
 ('Britain', 'B'),
 ('and', 'O'),
 ('France', 'B'),
 ('that', 'O'),
 ('under', 'O'),
 ('laboratory', 'O'),
 ('conditions', 'O'),
 ('sheep', 'O'),
 ('could', 'O'),
 ('contract', 'O'),
 ('Bovine', 'O'),
 ('Spongiform', 'O'),
 ('Encephalopathy', 'O'),
 ('(', 'O'),
 ('BSE', '

In [7]:
import csv

def save_labels(loc_data, label_file):
    """Write (word, label) pairs to ``label_file`` as tab-separated text.

    Args:
        loc_data: iterable of 2-item sequences ``(word, label)``, e.g.
            ``[('BRUSSELS', 'B'), ('1996-08-22', 'O')]``.
        label_file: path of the file to create or overwrite.

    The file is UTF-8 encoded, has no header row, and every row ends with
    a single newline character. Fields are quoted by ``csv.writer``'s default rules.
    """
    with open(label_file, 'w', encoding='utf8', newline='') as handle:
        writer = csv.writer(handle, delimiter='\t', lineterminator='\n')
        #Unpack each item so anything that is not a 2-item pair fails loudly
        writer.writerows([word, label] for word, label in loc_data)
#Adapted from https://stackoverflow.com/questions/29895602/how-to-save-output-from-python-like-tsv

In [8]:
#Write the (word, label) pairs collected in loc_data to a single .tsv file
label_file = "./datasets/NER/labels.tsv"
save_labels(loc_data, label_file)

In [9]:
#Read the saved (word, label) rows back in.
#dtype=str and keep_default_na=False keep every token as literal text: without them pandas
#turns tokens such as "NA", "null" or "nan" into NaN (later written out as an empty word)
#and may convert numeric-looking tokens to numbers.
df = pd.read_csv(
    label_file,
    sep='\t',
    names=["Word", "Label"],
    dtype=str,
    keep_default_na=False,
)
df.tail()

Unnamed: 0,Word,Label
84361,74,O
84362,77,O
84363,.,O
84364,LONDON,B
84365,1996-08-30,O


In [10]:
#Fraction of each half that goes into the training-side file
train_test_split = 0.8
half = int(len(df.index)/2)

#Rows stay in their original order; the frame is only cut, never shuffled
df_1 = df.iloc[:half]
df_2 = df.iloc[half:]

#The cut point has to be computed per half. Using 80% of the *whole* frame as the
#cut point lands past the end of either half, which leaves test_df and devel_df empty.
butterknife_1 = int(train_test_split*len(df_1.index))
butterknife_2 = int(train_test_split*len(df_2.index))

train_df = df_1.iloc[:butterknife_1]
test_df = df_1.iloc[butterknife_1:]
train_dev_df = df_2.iloc[:butterknife_2]
devel_df = df_2.iloc[butterknife_2:]

#Every row must end up in exactly one of the four pieces
assert len(train_df) + len(test_df) + len(train_dev_df) + len(devel_df) == len(df)

In [11]:
#Save files
train = "./datasets/NER/Location/train.tsv" 
test = "./datasets/NER/Location/test.tsv"
train_dev = "./datasets/NER/Location/train_dev.tsv"
devel = "./datasets/NER/Location/devel.tsv"

for path, frame in (
    (train, train_df),
    (test, test_df),
    (train_dev, train_dev_df),
    (devel, devel_df),
):
    #to_csv does not create missing folders, so make sure the output folder exists
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    #No header and no index: one "word<TAB>label" row per line.
    #quoting=csv.QUOTE_NONE writes every token verbatim; with the default quoting a
    #'"' token would be written as '""""'.
    frame.to_csv(path, sep='\t', header=False, index=False, quoting=csv.QUOTE_NONE)

The cells below use scikit-learn's `train_test_split` function. It was not used in the end because BERT trains on words in context, so randomizing the order of the words would most likely make the results worse.

In [12]:
#from sklearn.model_selection import train_test_split
#x, y = train_test_split(df, train_size = 0.8)

In [13]:
#x

In [14]:
#y

In [15]:
#df.groupby('Label').size().reset_index(name='Counts')

In [16]:
#X = df.drop('Label', axis=1)
#print(X.to_dict('records')[:10])

In [17]:
#Disabled experiment: the code below is wrapped in a string literal, so it is not executed;
#the cell only echoes the string back as its output.
'''
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

X = df.drop('Label', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Label.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=0)
X_train.shape, y_train.shape
'''

"\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction import DictVectorizer\n\nX = df.drop('Label', axis=1)\nv = DictVectorizer(sparse=False)\nX = v.fit_transform(X.to_dict('records'))\ny = df.Label.values\nclasses = np.unique(y)\nclasses = classes.tolist()\nX_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state=0)\nX_train.shape, y_train.shape\n"

In [18]:
#classes

In [19]:
#from sklearn.linear_model import Perceptron
#per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
#per.partial_fit(X_train, y_train, classes)

In [20]:
#new_classes = classes.copy()
#new_classes.pop()
#new_classes

In [21]:
#from sklearn.metrics import classification_report
#print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))