## Machine Learning Record Mining

Project to create a pipeline that uses GeoDeepDive's output to find unacquired sites for Neotoma.

Using NLP-parsed text and a data science approach, the pipeline identifies whether a paper is suitable for Neotoma and detects features such as 'Site Name', 'Location', 'Age Span' and 'Site Descriptions'.

In [42]:
# Import libraries

import numpy as np
import pandas as pd
import csv
import psycopg2

In [50]:
# Loading the DATA
#
# Load the parsed sentences into `data`. The PostgreSQL `sentences` table is
# preferred; when the server/database cannot be reached, the local
# tab-separated file is used instead, so the cell no longer aborts just
# because no SQL db is available.

# Column names applied to the tab-separated file. Passing `names=` without
# `header=` makes pandas treat the first line of the file as data.
header_list = ["_gddid", "sentence", "wordIndex", "word", "partofspeech", "specialclass",
               "wordsAgain", "wordtype", "wordmodified"]

# The file is always read, as before, so `data2` remains defined either way.
data2 = pd.read_csv("Do_not_commit_data/sentences_nlp352", sep='\t', names=header_list)

# Try 1st
# Connect to PostgreSQL server
try:
    conn = psycopg2.connect("dbname=gdd_database user=seiryu8808 password=")
except psycopg2.OperationalError:
    # If no SQL db, fall back to the data loaded from the file.
    # NOTE(review): the file columns are named by `header_list`, which may not
    # match the column names of the `sentences` table -- confirm before
    # relying on specific column names downstream.
    conn = None
    data = data2
else:
    # `conn` is intentionally left open here (it may be reused further down);
    # call `conn.close()` once it is no longer needed.
    data = pd.read_sql_query('''SELECT * FROM sentences;''', conn)

In [52]:
# Preview the first 100 rows of the loaded DataFrame.
data.head(n=100)

Unnamed: 0,docid,sentid,wordidx,words,poses,ners,lemmas,dep_paths,dep_parents
0,54b43266e138239d8684efed,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Available, online, at, www.sciencedirect.com,...","[JJ, NN, IN, NNP, NNP, NNP, CD, -LRB-, CD, -RR...","[O, O, O, O, O, O, NUMBER, O, DATE, O, NUMBER,...","[available, online, at, www.sciencedirect.com,...","[dep, dep, dep, dep, dep, dep, dep, , dep, , d...","[218, 218, 218, 218, 218, 218, 218, 0, 218, 0,..."
1,54b43266e138239d8684efed,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[The, Chihuahueños, Bog, record, extends, to, ...","[DT, NNP, NN, NN, VBZ, TO, IN, CD, JJ, NN, NN, .]","[O, O, O, O, O, O, O, NUMBER, O, DURATION, O, O]","[the, Chihuahueños, bog, record, extend, to, o...","[det, compound, compound, nsubj, , case, amod,...","[4, 4, 4, 5, 0, 11, 11, 11, 11, 11, 5, 0]"
2,54b43266e138239d8684efed,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[An, Artemisia, steppe, ,, then, an, open, Pic...","[DT, NNP, NN, ,, RB, DT, JJ, NNP, NN, VBD, IN,...","[O, O, O, O, O, O, O, LOCATION, LOCATION, O, O...","[a, Artemisia, steppe, ,, then, a, open, Picea...","[det, compound, nsubj, , advmod, det, amod, co...","[3, 3, 10, 0, 9, 9, 9, 9, 10, 0, 14, 14, 14, 1..."
3,54b43266e138239d8684efed,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[C/N, ratios, ,, δ13C, and, δ15N, values, indi...","[JJ, NNS, ,, NN, CC, NN, NNS, VBP, CC, JJ, CC,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[c/n, ratio, ,, δ13c, and, δ15n, value, indica...","[amod, compound, , conj:and, cc, conj:and, nsu...","[7, 7, 0, 2, 2, 2, 8, 0, 10, 16, 10, 14, 14, 1..."
4,54b43266e138239d8684efed,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Higher, percentages, of, aquatic, algae, and,...","[JJR, NNS, IN, JJ, NN, CC, JJ, NN, NNS, VBP, J...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[higher, percentage, of, aquatic, alga, and, e...","[amod, nsubj, case, amod, nmod:of, cc, amod, c...","[2, 10, 5, 5, 2, 5, 9, 9, 5, 0, 13, 13, 10, 16..."
...,...,...,...,...,...,...,...,...,...
95,54b43266e138239d8684efed,84,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Pollen, slides, were, scanned, at, 400, ×, ,,...","[NN, NNS, VBD, VBN, IN, CD, NN, ,, TO, DT, JJ,...","[O, O, O, O, O, NUMBER, O, O, O, O, O, O, O, O...","[pollen, slide, be, scan, at, 400, ×, ,, to, a...","[compound, nsubjpass, auxpass, , case, nummod,...","[2, 4, 4, 0, 7, 7, 4, 0, 13, 13, 13, 13, 4, 16..."
96,54b43266e138239d8684efed,205,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[Heavy, δ13C, values, could, be, derived, from...","[JJ, NN, NNS, MD, VB, VBN, IN, DT, NN, IN, NNS...","[O, O, O, O, O, O, O, O, O, O, O, O]","[heavy, δ13c, value, could, be, derive, from, ...","[amod, compound, nsubjpass, aux, auxpass, , ca...","[3, 3, 6, 6, 6, 0, 9, 9, 6, 11, 9, 0]"
97,54b43266e138239d8684efed,85,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, pollen, sum, consisted, of, all, terrest...","[DT, NN, NN, VBD, IN, DT, JJ, NN, NNS, ,, CC, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[the, pollen, sum, consist, of, all, terrestri...","[det, compound, nsubj, , case, det, amod, comp...","[3, 3, 4, 0, 9, 9, 9, 9, 4, 0, 4, 4, 16, 13, 1..."
98,54b43266e138239d8684efed,86,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[Pollen, zones, were, defined, by, visual, ins...","[NN, NNS, VBD, VBN, IN, JJ, NN, IN, DT, VBN, N...","[O, O, O, O, O, O, O, O, O, O, O, O]","[pollen, zone, be, define, by, visual, inspect...","[compound, nsubjpass, auxpass, , case, amod, n...","[2, 4, 4, 0, 7, 7, 4, 9, 7, 9, 10, 0]"
