### Tokenize and Lemmatize inputs

Source: lecture notebooks + https://gist.github.com/4OH4/f727af7dfc0e6bb0f26d2ea41d89ee55

In [2]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import pickle

import spacy
import numpy as np

In [3]:
train = pd.read_csv('data/train.csv')
valid = pd.read_csv('data/valid.csv')
test = pd.read_csv('data/valid.csv')

In [94]:
train = train.dropna(subset=['title', 'abstract']).reset_index()
valid = valid.dropna(subset=['title', 'abstract']).reset_index()
test = test.dropna(subset=['title', 'abstract']).reset_index()

In [None]:
with open('map_labels.json', 'r') as f:
    map_labels = json.load(f)

In [101]:
def preprocess(dat=object):
    
    dat = dat.fillna('')
    dat['input'] = dat['title']+' '+dat['abstract']
    dat['key'] = 'PMID:'+dat['pmid'].astype(str)
    
    # generate vector of Boolean values, for labels
    label_vec_idx = list()

    for i in range(len(dat)):
        labels = dat.loc[i, 'label'].split(';')
        label_vec_idx.append(list(map(map_labels.get, labels)))
        
    label_vec = [[0]*9 for i in range(len(label_vec_idx))]

    for i in range(len(label_vec_idx)):
        for j in label_vec_idx[i]:
            label_vec[i][j] = 1
            
    dat['label_vec'] = label_vec
    
    # initialize dictionary
    dat_dict = {dat['key'][i]: {'input': dat['input'][i], 
                                'label': dat['label'][i],
                                'label_vec': dat['label_vec'][i]}
                for i in range(len(dat))}
    
#     load_model = spacy.load('en_core_web_md', disable = ['parser','ner'])
    load_model = spacy.load('en_core_web_md')

    stop_words = set(stopwords.words('english'))
    
    # Lemmatize, NER, add to dictionary
    for i in dat_dict:
        doc = load_model(dat_dict[i]['input'])
        dat_dict[i]['lemmas'] = [token.lemma_ for token in doc 
                                 if token.lemma_ not in stop_words
                                 and len(token.lemma_) > 1 ]
#         dat_dict[i]['ent_iob'] = [token.ent_iob_ for token in doc]
#         dat_dict[i]['ent_type'] = [token.ent_type_ for token in doc]

    
    # Generate word embeddings for each lemma
    nlp = spacy.load('en_core_web_md')
    
    for i in dat_dict:
        embeddings = {j: nlp.vocab[j].vector for j in dat_dict[i]['lemmas']}
        sentenceMatrix = np.array([embeddings[i] for i in embeddings])
        dat_dict[i]['embeddings'] = sentenceMatrix
    
    return dat_dict
    


In [102]:
%%time

train_dict = preprocess(train)
valid_dict = preprocess(valid)
test_dict = preprocess(test)

CPU times: user 31min 49s, sys: 3min 32s, total: 35min 21s
Wall time: 36min 50s


In [105]:
pickle.dump(train_dict, open('data/train.pkl', 'wb'))
pickle.dump(valid_dict, open('data/valid.pkl', 'wb'))
pickle.dump(test_dict, open('data/test.pkl', 'wb'))

In [30]:
# pickle.dump(valid_dict, open('data/valid_ner.pkl', 'wb'))