<a target="_blank" href="https://colab.research.google.com/github/wbfrench1/barker_DATA606/blob/main/src/Create_spaCy_NER_model_dataset.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

Followed this tutorial
https://colab.research.google.com/github/practical-nlp/practical-nlp/blob/master/Ch5/04_NER_using_spaCy%20-%20CoNLL.ipynb#scrollTo=X4wBa1MGwTwy

In [1]:
import os
from google.colab import drive
import pandas as pd
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 150)
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/wbfrench1/barker_DATA606/main/data/movie_data_word_pos_iob_w_space_sep.iob', sep=' ')
print(df.shape)
df.head()

(322035, 4)


Unnamed: 0,Sentence #,Word,POS,Tag
0,1,what,WP,o
1,1,movies,NNS,o
2,1,star,VBP,o
3,1,bruce,NN,b-actor
4,1,willis,NN,i-actor


#### Add periods to the end of each sentence

<p>In the test data at https://github.com/explosion/spaCy/blob/master/extra/example_data/ner_example_data/ner-token-per-line-with-pos.iob   it appears that each sentence should have a period at the end of it.</p>

In [3]:
# select indices where the label in Sentence # is != to the
## previous Sentence
# https://stackoverflow.com/questions/72726217/get-index-of-row-where-column-value-changes-from-previous-row
l_indices = df.index[df['Sentence #'].diff().eq(1)]

In [4]:
# https://stackoverflow.com/questions/15888648/is-it-possible-to-insert-a-row-at-an-arbitrary-position-in-a-dataframe-using-pan
# get an index location between each sentence 
l_new_indicies = l_indices - .5
l_new_indicies

Float64Index([     4.5,     13.5,     22.5,     35.5,     46.5,     53.5,     62.5,     71.5,     81.5,     89.5,
              ...
              321827.5, 321842.5, 321859.5, 321880.5, 321904.5, 321923.5, 321968.5, 321974.5, 321995.5, 322016.5],
             dtype='float64', length=21986)

In [None]:
for sent_num, f_index in enumerate(l_new_indicies):
    df.loc[f_index] = sent_num + 1, '.', '.', 'o'
    if sent_num % 1000 == 0:
        print(sent_num)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000


In [5]:
# sort the indices and reset the index
df1 = df.loc[df.index.sort_values()].copy()
df1 = df1.reset_index().drop(columns='index').copy()

In [6]:
df1.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,1,what,WP,o
1,1,movies,NNS,o
2,1,star,VBP,o
3,1,bruce,NN,b-actor
4,1,willis,NN,i-actor
5,2,show,VB,o
6,2,me,PRP,o
7,2,films,NNS,o
8,2,with,IN,o
9,2,drew,NNS,b-actor


#  Does the data need to be randomized?
<p>I have movie trivia questions and iob labels.  The sentence structure and context gives the words and their labels meaning.  So I believe that it should not be randomized.</p>

<p>Question:  Beyond ensuring that the component words of each sentence stay together, do I need to ensure that the model only looks to complete sentences since one question is unrelated to the next? </p>

In [7]:
# since groups of words in a sentence should stay together, I must divide the 
## training/test/validation data into sets according at the sentence level

# Get the max number of sentences
df1['Sentence #'].max()

21987

In [9]:
df1['Tag'] = df1['Tag'].str.upper()

In [10]:
#https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test

# get an array of 1 to n questions
a_q_num = np.arange(1,21988)

In [16]:
#divvy the questions up into groups.  Since questions should stay together
## randomization is not an issue.
l_a_indices_of_train_test_val_splits = np.split(a_q_num, [int(.6 * len(a_q_num)), int(.8 * len(a_q_num))])
l_a_indices_of_train_test_val_splits

[array([    1,     2,     3, ..., 13190, 13191, 13192]),
 array([13193, 13194, 13195, ..., 17587, 17588, 17589]),
 array([17590, 17591, 17592, ..., 21985, 21986, 21987])]

In [19]:
df_train = df1.loc[df1['Sentence #'].isin(l_a_indices_of_train_test_val_splits[0])].copy()
df_test = df1.loc[df1['Sentence #'].isin(l_a_indices_of_train_test_val_splits[1])].copy()
df_val = df1.loc[df1['Sentence #'].isin(l_a_indices_of_train_test_val_splits[2])].copy()

print('df1_train.shape:', df_train.shape)
print('df1_test.shape:', df_test.shape)
print('df1_val.shape:', df_val.shape)

df1_train.shape: (144042, 4)
df1_test.shape: (89317, 4)
df1_val.shape: (88676, 4)


In [28]:
#https://stackoverflow.com/questions/31247198/python-pandas-write-content-of-dataframe-into-text-file
l_movie_iob_csv_datasets= ['train_movie_iob', 'test_movie_iob', 'val_movie_iob']

for str_data_set in l_movie_iob_csv_datasets:
    # save dataset as a csv
    df_train[ ['Word', 'POS', 'Tag']].to_csv(str_data_set,sep="\t", index=False, header=False)
    # removes the column headers from the file
    with open(str_data_set, 'w') as f:
        dfAsString = df_train[ ['Word', 'POS', 'Tag']].to_string(header=False, index=False)
        f.write(dfAsString)

In [31]:
!head 'val_movie_iob' -n 11 

                   what   WP              O
                 movies  NNS              O
                   star  VBP              O
                  bruce   NN        B-ACTOR
                 willis   NN        I-ACTOR
                   show   VB              O
                     me  PRP              O
                  films  NNS              O
                   with   IN              O
                   drew  NNS        B-ACTOR
              barrymore   RB        I-ACTOR


In [25]:
#Read the CONLL data from conll2003 folder, and store the formatted data into a folder spacyNER_data

# !mkdir spacyNER_data
if 'spacyNER_data' not in os.listdir():
    os.mkdir('spacyNER_data')

#the above lines create folder if it doesn't exist. If it does, the output shows a message that it
#already exists and cannot be created again
#!python -m spacy convert "/content/content/MyDrive/Colab Notebooks/0. Data 606 - Capstone/Project Data/train_movie_iob.txt" spacyNER_data -c ner
!python -m spacy convert  "train_movie_iob" spacyNER_data -c ner  -s -n 10 -b en_core_web_sm 

2023-03-04 15:22:36.140838: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 15:22:37.215360: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-04 15:22:37.215472: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-04 15:22:38.759047: E tensorfl

In [None]:
#Read the CONLL data from conll2003 folder, and store the formatted data into a folder spacyNER_data

# !mkdir spacyNER_data
os.mkdir('spacyNER_data')
        
#the above lines create folder if it doesn't exist. If it does, the output shows a message that it
#already exists and cannot be created again
try:
    import google.colab 
    !python -m spacy convert "train.txt" spacyNER_data -c ner
    !python -m spacy convert "test.txt" spacyNER_data -c ner
    !python -m spacy convert "valid.txt" spacyNER_data -c ner
except ModuleNotFoundError:
    !python -m spacy convert "Data/conll2003/en/train.txt" spacyNER_data -c ner
    !python -m spacy convert "Data/conll2003/en/test.txt" spacyNER_data -c ner
    !python -m spacy convert "Data/conll2003/en/valid.txt" spacyNER_data -c ner

2023-02-26 12:43:29.608149: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-26 12:43:31.085203: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-26 12:43:31.085373: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-26 12:43:33.055181: E tensorfl