In [1]:
import pandas as pd
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English

# Named Entity Recognition Task


## Data Transformation and EDA

In [2]:
# Load only the identifier, title and sentence columns from the tab-separated file.
data = pd.read_csv(
    'data/preprocessed_sentences.tsv',
    sep='\t',
    usecols=['gddid', 'sentid', 'title', 'sentence'],
)

In [3]:
# Preview the first three rows to check the loaded columns.
data.head(3)

Unnamed: 0,gddid,sentid,title,sentence
0,54b43266e138239d8684efed,1,Development of the mixed conifer forest in nor...,Available online at www.sciencedirect.com Quat...
1,54b43266e138239d8684efed,2,Development of the mixed conifer forest in nor...,The Chihuahueños Bog record extends to over 15...
2,54b43266e138239d8684efed,3,Development of the mixed conifer forest in nor...,"An Artemisia steppe, then an open Picea woodla..."


In [4]:
# Count the distinct `gddid` values (presumably one per source document — confirm).
data['gddid'].nunique()

204

In [5]:
# Non-null `sentid` entries per `gddid`, flattened back into a two-column frame.
sentids_per_gddid = data.groupby('gddid')['sentid'].count()
sent_count = sentids_per_gddid.reset_index()
# Grand total across all `gddid` groups.
sent_count['sentid'].sum()

106640

In [6]:
# Build one [sentence, context] record per row, where context carries the
# title, gddid and sentid of that row.
# Iterating the columns in parallel avoids the per-row `data.loc[i, ...]`
# label lookups, which only work when the index is exactly 0..len(data)-1
# and are slow on a frame of this size.
data_list = [
    [sentence, {'title': title, 'gddid': gddid, 'sentid': int(sentid)}]
    for sentence, title, gddid, sentid in zip(
        data['sentence'], data['title'], data['gddid'], data['sentid']
    )
]

In [7]:
# Inspect the first [sentence, context] record.
data_list[0]

['Available online at www.sciencedirect.com Quaternary Research 69 (2008) 263275 www.elsevier.com/locate/yqres Development of the mixed conifer forest in northern New Mexico and its relationship to Holocene environmental change R. Scott Anderson a, bRenata B. Jass b  1, Jaime L. Toney b  2, Craig D. Allen c, Luz M. Cisneros-Dozal d, Marcey Hess d, Jeff Heikoop d, Julianna Fessenden d a Center for Environmental Sciences & Education, Box 5694, Northern Arizona University, Flagstaff, AZ 86011, USA b Quaternary Sciences Program & Bilby Research Center, Box 6013, Northern Arizona University, Flagstaff, AZ 86011, USA c U.S. Geological Survey, Jemez Mountains Field Station, Bandelier National Monument, HCR-1, Box 115, Los Alamos, NM 87544, USA d Earth & Environmental Sciences Division, Hydrology, Geochemistry & Geology Group, EES-6, MS-D462, Los Alamos National Laboratory, Los Alamos, NM 87545, USA Received 28 April 2007 Available online 29 January 2008 Abstract Chihuahueños Bog (2925 m) in t

In [8]:
import json
# Serialise the [sentence, context] records to a JSON-formatted string.
json_string = json.dumps(data_list)
# print(json_string)

In [9]:
# Decode the JSON string back into plain Python lists and dicts.
# NOTE(review): this rebinds `data` from the DataFrame loaded earlier to a list
# of [sentence, context] records, so the DataFrame cells above cannot be re-run
# after this one without reloading the TSV.
data = json.loads(json_string)

In [10]:
# First record after the JSON round trip.
data[0]

['Available online at www.sciencedirect.com Quaternary Research 69 (2008) 263275 www.elsevier.com/locate/yqres Development of the mixed conifer forest in northern New Mexico and its relationship to Holocene environmental change R. Scott Anderson a, bRenata B. Jass b  1, Jaime L. Toney b  2, Craig D. Allen c, Luz M. Cisneros-Dozal d, Marcey Hess d, Jeff Heikoop d, Julianna Fessenden d a Center for Environmental Sciences & Education, Box 5694, Northern Arizona University, Flagstaff, AZ 86011, USA b Quaternary Sciences Program & Bilby Research Center, Box 6013, Northern Arizona University, Flagstaff, AZ 86011, USA c U.S. Geological Survey, Jemez Mountains Field Station, Bandelier National Monument, HCR-1, Box 115, Los Alamos, NM 87544, USA d Earth & Environmental Sciences Division, Hydrology, Geochemistry & Geology Group, EES-6, MS-D462, Los Alamos National Laboratory, Los Alamos, NM 87545, USA Received 28 April 2007 Available online 29 January 2008 Abstract Chihuahueños Bog (2925 m) in t

In [11]:
# Keep every record whose sentence field (element 0) is not a float.
new_data_list = [record for record in data if type(record[0]) is not float]

In [12]:
# Record count before the float-sentence filter.
len(data)

106640

In [13]:
# Record count after dropping entries whose sentence is a float.
len(new_data_list)

106494

In [14]:
# Serialise the filtered records to a JSON-formatted string.
json_string = json.dumps(new_data_list)

In [15]:
# Write the filtered records to disk as a JSON array.
# The list itself is dumped rather than the already-serialised `json_string`:
# dumping that string would JSON-encode it a second time, and `json.load` on
# the file would then return one big str instead of the list of records.
with open('sentences_data.json', 'w') as f:
    json.dump(new_data_list, f)

## How to write coordinates?

### REGEX Library

In [17]:
import re
# Two decimal-degree values, each ending in a hemisphere letter (N/E/S/W, either
# case), separated by one or more whitespace, comma or apostrophe characters.
# Inside a character class `|` is a literal pipe rather than alternation, so it is
# left out of the separator class; otherwise "1.0N|2.0E" would be accepted.
pattern = r'(-?\d{1,3}\.\d+,?[NESWnesw][\s,\']+?-?\d{1,3}\.\d+,?[NESWnesw])'
results = re.findall(pattern, "This line of latitude and longitude would be written as, -15 °24'15\"N, 30°10'3\"E, 77.0364S, 38.8951N")

In [18]:
# Show the coordinate pairs found by the regex.
results

['77.0364S, 38.8951N']

### spaCy Library

In [19]:
import json
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English
from spacy.matcher import Matcher

In [20]:
# Load the `en_core_web_md` pipeline and create a token Matcher on its vocabulary.
nlp = spacy.load('en_core_web_md')
matcher = Matcher(nlp.vocab)

In [21]:
# Two sample texts run through the pipeline: sentence1 spells coordinates out in
# words ("24 minutes, and 15 seconds"); sentence2 mixes degree/minute/second
# symbols with decimal values followed by a hemisphere letter.
sentence1 = nlp("For example, say you have a line of latitude at -15°N, 24 minutes, and 15 seconds. You have a line of longitude at 19 E, 30°E, 10 minutes, and 3 seconds.")
sentence2 = nlp("This line of latitude and longitude would be written as, -15 °24' 15\"N, 30° 10' 3\" E, 77.0364 S, 38.8951 N")

In [22]:
# Show how sentence2 was tokenised; the token patterns below match on these units.
print([token.text for token in sentence2])

['This', 'line', 'of', 'latitude', 'and', 'longitude', 'would', 'be', 'written', 'as', ',', '-15', '°', '24', "'", '15"N', ',', '30', '°', '10', "'", '3', '"', 'E', ',', '77.0364', 'S', ',', '38.8951', 'N']


In [23]:
# A separator token is exactly one of: ° o ◦ ′ ' ` " ,
# The regex is anchored with ^...$ so the whole token must be the separator; an
# unanchored class would also accept any token that merely contains one of these
# characters (e.g. "of", "longitude", '15"N'). `|` is not listed because inside a
# character class it is a literal pipe, not alternation.
COORD_SEPARATOR_REGEX = "^[°o◦′'`\",]$"

# One token pattern: three number-like tokens, each optionally followed by a
# separator token (e.g. 30 ° 10 ' 3 ").
pattern = [[
    {'LIKE_NUM': True}, {"TEXT": {"REGEX": COORD_SEPARATOR_REGEX}, "OP": "?"},
    {'LIKE_NUM': True}, {"TEXT": {"REGEX": COORD_SEPARATOR_REGEX}, "OP": "?"},
    {'LIKE_NUM': True}, {"TEXT": {"REGEX": COORD_SEPARATOR_REGEX}, "OP": "?"},
]]

#pattern2 = [{"LOWER": {"REGEX": "([-]?\d{1,3}\.\d{1,}[,]?[NESWnesw])"}}, {"TEXT":","}, {"LOWER": {"REGEX": "([-]?\d{1,3}\.\d{1,}[,]?[NESWnesw])"}}]
#pattern3 = [{'LIKE_NUM': True, "OP": "+"}, {"TEXT": "°", "OP": "?"}, {"LOWER": {"REGEX": "^[nesw]$"}}]
#pattern4 = [{'LIKE_NUM': True}, {"TEXT": {"REGEX" : "[°|o|◦|′|\'|`|\"|,]"}, "OP": "?"}, {'LIKE_NUM': True}, {"TEXT": {"REGEX" : "[°|o|◦|′|\'|`|\"|,]"}, "OP": "?"}, {"LOWER": {"REGEX": "([-]?\d{1,3}[\"NESW])"}}]
#pattern5 = [{'LIKE_NUM': True}, {"TEXT": "°", "OP": "?"}, {"LOWER": {"REGEX": "^[nesw]$"}}, {"TEXT": ","}, {'LIKE_NUM': True}, {"LOWER": "minutes"}, {"TEXT": ","}, {"LOWER":"and"},{'LIKE_NUM': True}, {"LOWER": "seconds"}]

In [25]:
# Register the token patterns with the matcher under the key "coords".
matcher.add("coords", pattern)

In [26]:
# Run the matcher over sentence2; the result is iterated below as
# (match_id, start, end) triples.
matches = matcher(sentence2)

In [27]:
# Print the text of every matched token span in sentence2.
for _match_id, span_start, span_end in matches:
    print(sentence2[span_start:span_end].text)

30° 10' 3
30° 10' 3"
