In [3]:
import pandas as pd
import numpy as np
import re


In [4]:
pd.set_option('display.max_columns', None)

In [54]:
# change it if you use the different path
tweet_file = pd.read_csv('data/task3_training.tsv',delimiter='\t')

In [55]:
tweet_file.shape

(2246, 54)

## Tweet text cleaning


In [56]:
# discrad the unrelevant columns
tweet_file_2 = tweet_file.iloc[:,:9]

# change all the text to lowercase
tweet_file_2['cleaned_tweet'] = tweet_file_2['tweet'].apply(str.lower)

# remove @ mentions (remove the first '@' and following username)
tweet_file_2['cleaned_tweet'] = tweet_file_2['cleaned_tweet'].replace(to_replace=r'@([_A-Za-z]+[A-Za-z0-9-_]+)', value='', regex=True)

# remove all other non-standard characters (emoji, #, @....) ( For this task I thought '@'' and '#' are useless )
tweet_file_2['cleaned_tweet'] = tweet_file_2['cleaned_tweet'].replace(to_replace=r"""([^A-Za-z0-9 .,\-'"\/?!\\()=])+""", value='', regex=True)

# remove urls
tweet_file_2['cleaned_tweet'] = tweet_file_2['cleaned_tweet'].replace(to_replace=r'http\S+', value='', regex=True)

# Expand contractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
tweet_file_2['cleaned_tweet'] = tweet_file_2['cleaned_tweet'].apply(decontracted)


# Let me know if you think more cleaning processes need to be done!


## Tokenization & Lemmetization

In [57]:
# As reported by the papers, different tokenizers may have different performance so I tried different tokenizers,
# their performance should be decided by the classification results

# Trial 1, used the nltk.word_tokenizer
# import nltk
# from nltk.stem import WordNetLemmatizer

# word_lemmatizer = WordNetLemmatizer()
# def tokenize_lemmatize(twitter):
#     tokens = nltk.word_tokenize(twitter)
#     tokens_lemmetized = []
#     for token in tokens:
#         tokens_lemmetized.append(word_lemmatizer.lemmatize(token))
#     lemmed_twitter = ' '.join(tokens_lemmetized)
#     return lemmed_twitter

# tweet_tokenization_1 = tweet_file_2
# tweet_tokenization_1['cleaned_tweet'] = tweet_file_2['cleaned_tweet'].apply(tokenize_lemmatize)

# tweet_tokenization_1['cleaned_tweet']

## Just tokenization, no lemmatization of twitters

There will be some matching issue after lemmatization, so we just used tokenization of twitters

In [58]:
import nltk
tweet_tokenization_2 = tweet_file_2.copy()
tokenize = nltk.word_tokenize
tweet_tokenization_2['tokennized_cleaned_tweet'] = tweet_tokenization_2['cleaned_tweet'].apply(tokenize)
tweet_tokenization_2.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet
0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprof...,,,oral drugs for pyelonephritisciprofloxacin lev...,"[oral, drugs, for, pyelonephritisciprofloxacin..."
1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as a...,,,happy for wellbutrin has similar effects as ad...,"[happy, for, wellbutrin, has, similar, effects..."
2,332448217490944000,,,,,lamotrigine,@stilgarg i'm ok ty have an official diagnosis...,,,i am ok ty have an official diagnosis of bipo...,"[i, am, ok, ty, have, an, official, diagnosis,..."
3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,,i am soo depressed cymbalta could not help me .,"[i, am, soo, depressed, cymbalta, could, not, ..."
4,333674203331051520,,,,,seroquel,time for my daily afternoon relaxation ritual ...,,,time for my daily afternoon relaxation ritual ...,"[time, for, my, daily, afternoon, relaxation, ..."


## Sequence labeling

In [59]:
#Fill the NAN with None 
tweet_tokenization_2.fillna('None',inplace=True)

# tokenize the already extracted label(AEs)
tweet_tokenization_2['extraction_token'] = tweet_tokenization_2['extraction'].apply(tokenize)
labeled_dataset_t = tweet_tokenization_2.copy()

# Create a empty column 'label' for sequence labeling 
labeled_dataset_t['labeled_sequence'] =''
labeled_dataset_t.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet,extraction_token,labeled_sequence
0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprof...,,,oral drugs for pyelonephritisciprofloxacin lev...,"[oral, drugs, for, pyelonephritisciprofloxacin...",[None],
1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as a...,,,happy for wellbutrin has similar effects as ad...,"[happy, for, wellbutrin, has, similar, effects...",[None],
2,332448217490944000,,,,,lamotrigine,@stilgarg i'm ok ty have an official diagnosis...,,,i am ok ty have an official diagnosis of bipo...,"[i, am, ok, ty, have, an, official, diagnosis,...",[None],
3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,,i am soo depressed cymbalta could not help me .,"[i, am, soo, depressed, cymbalta, could, not, ...",[None],
4,333674203331051520,,,,,seroquel,time for my daily afternoon relaxation ritual ...,,,time for my daily afternoon relaxation ritual ...,"[time, for, my, daily, afternoon, relaxation, ...",[None],


In [60]:
# labeling the twitter, the label for begining of adverse effects is "B", other part of ADR is 'I' and the rest of twitter are 'O'
def sequence_check(test):
    for i in range(1,len(test)-1):
        if test[i-1:i+2] == ['O', 'I', 'O']:
            test[i] = 'O'
    return test
def sequence_label(sequence,extraction):
    labels = []
    for i in sequence:
        if i in extraction:
            if i == extraction[0]:
                labels.append("B")
            else:
                labels.append("I")
        else:
            labels.append('O')
    sequence_check(labels)
    return labels


for i in range(0, len(labeled_dataset)):
    labeled_dataset_t['labeled_sequence'][i] = sequence_label(labeled_dataset_t['tokennized_cleaned_tweet'][i],labeled_dataset_t['extraction_token'][i])

labeled_dataset_t.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet,extraction_token,labeled_sequence
0,331187619096588288,,,,,ofloxacin,@seefisch:oral drugs for pyelonephritis:ciprof...,,,oral drugs for pyelonephritisciprofloxacin lev...,"[oral, drugs, for, pyelonephritisciprofloxacin...",[None],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,332227554956161024,,,,,trazodone,happy for wellbutrin; has similar effects as a...,,,happy for wellbutrin has similar effects as ad...,"[happy, for, wellbutrin, has, similar, effects...",[None],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,332448217490944000,,,,,lamotrigine,@stilgarg i'm ok ty have an official diagnosis...,,,i am ok ty have an official diagnosis of bipo...,"[i, am, ok, ty, have, an, official, diagnosis,...",[None],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,332977955754110976,,,,,cymbalta,i'm soo depressed cymbalta couldn't help me .,,,i am soo depressed cymbalta could not help me .,"[i, am, soo, depressed, cymbalta, could, not, ...",[None],"[O, O, O, O, O, O, O, O, O, O]"
4,333674203331051520,,,,,seroquel,time for my daily afternoon relaxation ritual ...,,,time for my daily afternoon relaxation ritual ...,"[time, for, my, daily, afternoon, relaxation, ...",[None],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [13]:
with open("data/Training_data.txt", "a") as f:
    for i in range(0,labeled_dataset_t.shape[0]):
        for k in range(0,len(labeled_dataset_t['tokennized_cleaned_tweet'][i])):
            line = labeled_dataset_t['tokennized_cleaned_tweet'][i][k] + "\t"+ labeled_dataset_t['labeled_sequence'][i][k]
            print(line,file=f)
        print('\n',file=f)
    

In [62]:
print(labeled_dataset_t.shape)
print(sum(labeled_dataset_t['meddra_code']=="None"))

(2246, 13)
782


## Validation file

In [14]:
validation_file = pd.read_csv('data/task3_validation.tsv',delimiter='\t')

In [15]:
validation_file.shape

(560, 9)

In [16]:
validation_file_2 = validation_file.copy()
validation_file_2.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term
0,332317478170546176,28.0,37.0,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013661.0,drug allergy
1,347806215776116737,31.0,46.0,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024668.0,liver damage
2,350336129817509888,48.0,50.0,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003731.0,attention deficit disorder
3,350336129817509888,88.0,93.0,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003738.0,attention impaired
4,332540699692130304,11.0,15.0,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011906.0,death


In [17]:
# change all the text to lowercase
validation_file_2['cleaned_tweet'] = validation_file_2['tweet'].apply(str.lower)

# remove @ mentions (remove the first '@' and following username)
validation_file_2['cleaned_tweet'] = validation_file_2['cleaned_tweet'].replace(to_replace=r'@([_A-Za-z]+[A-Za-z0-9-_]+)', value='', regex=True)

# remove all other non-standard characters (emoji, #, @....) ( For this task I thought '@'' and '#' are useless )
validation_file_2['cleaned_tweet'] = validation_file_2['cleaned_tweet'].replace(to_replace=r"""([^A-Za-z0-9 .,\-'"\/?!\\()=])+""", value='', regex=True)

# remove urls
validation_file_2['cleaned_tweet'] = validation_file_2['cleaned_tweet'].replace(to_replace=r'http\S+', value='', regex=True)

# Expand contractions
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
validation_file_2['cleaned_tweet'] = validation_file_2['cleaned_tweet'].apply(decontracted)

validation_file_2.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet
0,332317478170546176,28.0,37.0,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013661.0,drug allergy,"do you have any medication allergies? ""asthma!..."
1,347806215776116737,31.0,46.0,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024668.0,liver damage,"if avelox has hurt your liver, avoid tylenol ..."
2,350336129817509888,48.0,50.0,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003731.0,attention deficit disorder,"apparently, baclofen greatly exacerbates the ""..."
3,350336129817509888,88.0,93.0,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003738.0,attention impaired,"apparently, baclofen greatly exacerbates the ""..."
4,332540699692130304,11.0,15.0,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011906.0,death,pt of mine died from cipro rt if only more d...


In [18]:
# As reported by the papers, different tokenizers may have different performance so I tried different tokenizers,
# their performance should be decided by the classification results

# Trial 1, used the nltk.word_tokenizer
# import nltk
# from nltk.stem import WordNetLemmatizer

# word_lemmatizer = WordNetLemmatizer()
# def tokenize_lemmatize(twitter):
#     tokens = nltk.word_tokenize(twitter)
#     tokens_lemmetized = []
#     for token in tokens:
#         tokens_lemmetized.append(word_lemmatizer.lemmatize(token))
#     lemmed_twitter = ' '.join(tokens_lemmetized)
#     return lemmed_twitter

# validation_tokenization_1 = validation_file_2
# validation_tokenization_1['cleaned_tweet'] = validation_file_2['cleaned_tweet'].apply(tokenize_lemmatize)

# validation_tokenization_1['cleaned_tweet']

### Code for processing the validation file

In [19]:
import nltk
validation_tokenization_2 = validation_file_2.copy()
tokenize = nltk.word_tokenize
validation_tokenization_2['tokennized_cleaned_tweet'] = validation_tokenization_2['cleaned_tweet'].apply(tokenize)
validation_tokenization_2.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet
0,332317478170546176,28.0,37.0,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013661.0,drug allergy,"do you have any medication allergies? ""asthma!...","[do, you, have, any, medication, allergies, ?,..."
1,347806215776116737,31.0,46.0,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024668.0,liver damage,"if avelox has hurt your liver, avoid tylenol ...","[if, avelox, has, hurt, your, liver, ,, avoid,..."
2,350336129817509888,48.0,50.0,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003731.0,attention deficit disorder,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates..."
3,350336129817509888,88.0,93.0,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003738.0,attention impaired,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates..."
4,332540699692130304,11.0,15.0,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011906.0,death,pt of mine died from cipro rt if only more d...,"[pt, of, mine, died, from, cipro, rt, if, only..."


In [20]:
#Fill the NAN with None 
validation_tokenization_2.fillna('None',inplace=True)

# tokenize the already extracted label(AEs)
validation_tokenization_2['extraction_token'] = validation_tokenization_2['extraction'].apply(tokenize)
labeled_dataset = validation_tokenization_2

# Create a empty column 'label' for sequence labeling 
labeled_dataset['labeled_sequence'] =''
labeled_dataset.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet,extraction_token,labeled_sequence
0,332317478170546176,28,37,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013700.0,drug allergy,"do you have any medication allergies? ""asthma!...","[do, you, have, any, medication, allergies, ?,...",[allergies],
1,347806215776116737,31,46,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024700.0,liver damage,"if avelox has hurt your liver, avoid tylenol ...","[if, avelox, has, hurt, your, liver, ,, avoid,...","[HURT, YOUR, Liver]",
2,350336129817509888,48,50,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention deficit disorder,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[AD],
3,350336129817509888,88,93,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention impaired,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[focus],
4,332540699692130304,11,15,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011900.0,death,pt of mine died from cipro rt if only more d...,"[pt, of, mine, died, from, cipro, rt, if, only...",[died],


In [21]:
# labeling the twitter, the label for beginign of adverse effects is "B", other part of ADR is 'I' and the rest of twitter are 'O'
def sequence_check(test):
    for i in range(1,len(test)-1):
        if test[i-1:i+2] == ['O', 'I', 'O']:
            test[i] = 'O'
    return test
def sequence_label(sequence,extraction):
    labels = []
    for i in sequence:
        if i in extraction:
            if i == extraction[0]:
                labels.append("B")
            else:
                labels.append("I")
        else:
            labels.append('O')
    sequence_check(labels)
    return labels


for i in range(0, len(labeled_dataset)):
    labeled_dataset['labeled_sequence'][i] = sequence_label(labeled_dataset['tokennized_cleaned_tweet'][i],labeled_dataset['extraction_token'][i])

labeled_dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet,extraction_token,labeled_sequence
0,332317478170546176,28,37,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013700.0,drug allergy,"do you have any medication allergies? ""asthma!...","[do, you, have, any, medication, allergies, ?,...",[allergies],"[O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, ..."
1,347806215776116737,31,46,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024700.0,liver damage,"if avelox has hurt your liver, avoid tylenol ...","[if, avelox, has, hurt, your, liver, ,, avoid,...","[HURT, YOUR, Liver]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,350336129817509888,48,50,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention deficit disorder,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[AD],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,350336129817509888,88,93,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention impaired,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[focus],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,332540699692130304,11,15,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011900.0,death,pt of mine died from cipro rt if only more d...,"[pt, of, mine, died, from, cipro, rt, if, only...",[died],"[O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, ..."


In [53]:
print(labeled_dataset.shape)
print(sum(labeled_dataset['meddra_code']!="None"))

(560, 13)
365


In [22]:
with open("data/Validation_data.txt", "a") as f:
    for i in range(0,labeled_dataset.shape[0]):
        for k in range(0,len(labeled_dataset['tokennized_cleaned_tweet'][i])):
            line = labeled_dataset['tokennized_cleaned_tweet'][i][k] + "\t"+ labeled_dataset['labeled_sequence'][i][k]
            print(line,file=f)
        print('\n',file=f)

### Code for generating the the validation data

## NER

In [31]:
import torch
import warnings
warnings.simplefilter("ignore", UserWarning)
from typing import List
import sys
import datetime
import os
import random
import logging
import numpy as np
import copy
import torch
from torch.autograd import Variable
from torch import autograd
import torch.nn.functional as F
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.training_utils import store_embeddings

from sklearn.metrics import classification_report
from model import Tagger

logging.getLogger().setLevel(logging.INFO)

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [27]:
def load_data(train_file, dev_file, downsample_rate = None):
    corpus = None
    if downsample_rate is None:
        corpus = NLPTaskDataFetcher.load_column_corpus(
            './', columns, train_file=train_file, dev_file=dev_file
        )
    else:
        corpus = NLPTaskDataFetcher.load_column_corpus(
           './' , columns, train_file=train_file, dev_file=dev_file
        ).downsample(downsample_rate)
    tag_dict = corpus.make_tag_dictionary(tag_type='tag')
    print(corpus)
    return corpus.train, corpus.dev, tag_dict
            
            
def make_batch_feature(sentences):
    max_len = max([len(sentence) for sentence in sentences])
    lengths = [len(sentence.tokens) for sentence in sentences]
    sents_tensor = torch.zeros(len(sentences), max_len, embedding_dim)
    tag_list = []
    for s_id, s in enumerate(sentences):
        sents_tensor[s_id, :len(s)] = torch.cat([w.get_embedding().unsqueeze(0) for w in s], dim=0)
        tag_idx = [tag_dict.get_idx_for_item(t.get_tag('tag').value) for t in s]

        tag_idx_tensor = torch.LongTensor(tag_idx).to(device)
        tag_list.append(tag_idx_tensor)
        
    sents_tensor = sents_tensor.to(device)
    return sents_tensor, tag_list, lengths

def train(sentences, use_crf=True):
    optimizer.zero_grad()
    word_embedding.embed(sentences)
    feature_batch, tag_batch, lengths = make_batch_feature(sentences)
    
    output  = tagger(feature_batch, lengths)
    loss_batch = 0.0
    if use_crf:
        loss_batch = tagger.crf.neg_log_likelihood(output, tag_batch, lengths)
    else:
        for i in range(len(tag_batch)):
            sent_length = tag_batch[i].shape[0]
            sent_feature = output[i,:sent_length, :]
            tag_tensor = tag_batch[i]
            loss_batch += F.cross_entropy(sent_feature, tag_tensor)

    total_loss_batch = loss_batch
    cur_loss = total_loss_batch.item()
    store_embeddings(sentences, 'cpu')
    total_loss_batch.backward()
    torch.nn.utils.clip_grad_norm_(tagger.parameters(), 5.0)
    optimizer.step()
    return cur_loss
            
def evaluate(sentences, use_crf=True):
    all_confidences = []
    all_pred_tags = []
    all_gold_tags = []
    with torch.no_grad():
        batches = [sentences[x:x + batch_size] for x in range(0, len(sentences), batch_size)]
        for batch_no, sent_batch in enumerate(batches):
            word_embedding.embed(sent_batch)
            feature_batch, tag_batch, lengths = make_batch_feature(sent_batch)
            output = tagger(feature_batch, lengths)
            if not use_crf:
                softmax = F.softmax(output, dim=2)
                for i in range(len(lengths)):
                    sent_length = lengths[i]
                    tag_tensor = tag_batch[i]
                    pred_tag, confidences = [], []
                    for j in range(sent_length):
                        _, idx = torch.max(softmax[i, j, :], 0)
                        pred_tag.append(idx.item())
                        confidences.append(softmax[idx.item()])

                    all_pred_tags.append([tag_dict.get_item_for_index(ix) for ix in pred_tag])
                    all_confidences.append(confidences)
                    all_gold_tags.append([tag_dict.get_item_for_index(int(ix)) for ix in tag_tensor.tolist()])
            else:
                for i in range(len(lengths)):
                    sent_length = lengths[i]
                    sent_feature = output[i,:sent_length, :]
                    tag_tensor = tag_batch[i][:sent_length]
                    confidences, pred_tag = tagger.crf.viterbi_decode(sent_feature)
                    all_pred_tags.append([tag_dict.get_item_for_index(ix) for ix in pred_tag])
                    all_confidences.append(confidences)
                    all_gold_tags.append(
                        [tag_dict.get_item_for_index(ix) for ix in tag_tensor.tolist()]
                    )
            store_embeddings(sentences, 'none')
    return all_confidences, all_pred_tags, all_gold_tags

In [32]:
tagger = Tagger(
    in_dim=embedding_dim, relearned_dim=relearned_dim, hidden_dim=hidden_dim, 
    tag_dict=tag_dict, use_crf=use_crf,use_drop=0.0).to(device)
    
param_size = sum(p.numel() for p in tagger.parameters() if p.requires_grad)
logging.info('model size (trainable parameters): {}'.format(param_size))

INFO:root:model size (trainable parameters): 71542


In [33]:
train_file = 'data/Training_data.txt' # path to train data
dev_file = 'data/Validation_data.txt'  # path to validation data

model_output = 'sample-model.pt'

label_list = ['B' , 'I', 'O']

tag_type = 'tag'
columns = {0: 'text', 1: 'tag'}
train_data, dev_data, tag_dict = load_data(
    train_file=train_file, dev_file=dev_file,
    downsample_rate=None # using a downsample rate can run small sample data size for fast code testing!!!
)
train_data, dev_data = list(train_data), list(dev_data)

print(tag_dict.item2idx)
emb1 = WordEmbeddings('glove')  # create embedings you want use.

emb_types = [emb1] # if you have multiple types of embeddings, just add them here in the list.
word_embedding = StackedEmbeddings(embeddings=emb_types)


batch_size = 32
use_crf = True
embedding_dim = 100  # embedding dim need to match the embedding size you choose to use. 
relearned_dim = embedding_dim
hidden_dim=50
lr = 0.001
optimizer = torch.optim.SGD(tagger.parameters(), lr=lr)
training_epoch =10

2020-04-16 16:52:14,736 Reading data from .
2020-04-16 16:52:14,737 Train: data/Training_data.txt
2020-04-16 16:52:14,738 Dev: data/Validation_data.txt
2020-04-16 16:52:14,738 Test: None


  """
INFO:gensim.utils:loading Word2VecKeyedVectors object from /Users/jonathanmartindale/.flair/embeddings/glove.gensim


Corpus: 2021 train + 1120 dev + 225 test sentences
{b'<unk>': 0, b'O': 1, b'B': 2, b'I': 3, b'<START>': 4, b'<STOP>': 5}


INFO:gensim.utils:loading vectors from /Users/jonathanmartindale/.flair/embeddings/glove.gensim.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loaded /Users/jonathanmartindale/.flair/embeddings/glove.gensim


## Start Training

In [34]:
try:
    for epoch in range(training_epoch):
        random.shuffle(train_data)
        batches = [train_data[x:x + batch_size] for x in range(0, len(train_data), batch_size)]
        tagger.train()
        current_loss, seen_sentences, modulo = 0.0, 0, max(1, int(len(batches) / 10))

        for batch_no, sent_batch in enumerate(batches):
            loss_batch = train(sent_batch, use_crf=use_crf)
            current_loss += (loss_batch)
            seen_sentences += len(sent_batch)
            if batch_no % modulo == 0:
                logging.info(
                    "epoch {0} - iter {1}/{2} - lr {3} - loss {4:.6f}".format(
                        epoch + 1, batch_no, len(batches), lr, current_loss / seen_sentences
                    )
                )
                iteration = epoch * len(batches) + batch_no
        current_loss /= len(train_data)

        tagger.eval()
        logging.info('----------------------- Evaluate DEV set -----------------------')
        scores, all_pred, all_true = evaluate(dev_data, use_crf=use_crf)
        
        all_pred_flat= [x for sent in all_pred for x in sent]
        all_true_flat= [x for sent in all_true for x in sent] 
        print(classification_report(y_pred=all_pred_flat, y_true=all_true_flat, labels=label_list))
        

    with torch.no_grad():
        torch.save(tagger.state_dict(), model_output)
        
except KeyboardInterrupt:
    print('key interepted (early stopping)')
    with torch.no_grad():
        torch.save(tagger.state_dict(), model_output)

INFO:root:epoch 1 - iter 0/64 - lr 0.001 - loss 2.296811
INFO:root:epoch 1 - iter 6/64 - lr 0.001 - loss 2.585885
INFO:root:epoch 1 - iter 12/64 - lr 0.001 - loss 2.612047
INFO:root:epoch 1 - iter 18/64 - lr 0.001 - loss 2.599777
INFO:root:epoch 1 - iter 24/64 - lr 0.001 - loss 2.558846
INFO:root:epoch 1 - iter 30/64 - lr 0.001 - loss 2.514242
INFO:root:epoch 1 - iter 36/64 - lr 0.001 - loss 2.473118
INFO:root:epoch 1 - iter 42/64 - lr 0.001 - loss 2.443766
INFO:root:epoch 1 - iter 48/64 - lr 0.001 - loss 2.413973
INFO:root:epoch 1 - iter 54/64 - lr 0.001 - loss 2.394863
INFO:root:epoch 1 - iter 60/64 - lr 0.001 - loss 2.385889
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 2 - iter 0/64 - lr 0.001 - loss 2.196755


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.47      0.05       700
           O       0.94      0.51      0.66     22866

    accuracy                           0.50     24232
   macro avg       0.33      0.33      0.24     24232
weighted avg       0.89      0.50      0.63     24232



INFO:root:epoch 2 - iter 6/64 - lr 0.001 - loss 2.189447
INFO:root:epoch 2 - iter 12/64 - lr 0.001 - loss 2.194093
INFO:root:epoch 2 - iter 18/64 - lr 0.001 - loss 2.178160
INFO:root:epoch 2 - iter 24/64 - lr 0.001 - loss 2.156801
INFO:root:epoch 2 - iter 30/64 - lr 0.001 - loss 2.152799
INFO:root:epoch 2 - iter 36/64 - lr 0.001 - loss 2.124750
INFO:root:epoch 2 - iter 42/64 - lr 0.001 - loss 2.093147
INFO:root:epoch 2 - iter 48/64 - lr 0.001 - loss 2.081180
INFO:root:epoch 2 - iter 54/64 - lr 0.001 - loss 2.067679
INFO:root:epoch 2 - iter 60/64 - lr 0.001 - loss 2.054255
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 3 - iter 0/64 - lr 0.001 - loss 1.904067


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.47      0.06       700
           O       0.94      0.51      0.66     22866

    accuracy                           0.50     24232
   macro avg       0.33      0.33      0.24     24232
weighted avg       0.89      0.50      0.63     24232



INFO:root:epoch 3 - iter 6/64 - lr 0.001 - loss 1.887170
INFO:root:epoch 3 - iter 12/64 - lr 0.001 - loss 1.810768
INFO:root:epoch 3 - iter 18/64 - lr 0.001 - loss 1.796153
INFO:root:epoch 3 - iter 24/64 - lr 0.001 - loss 1.807050
INFO:root:epoch 3 - iter 30/64 - lr 0.001 - loss 1.793567
INFO:root:epoch 3 - iter 36/64 - lr 0.001 - loss 1.779093
INFO:root:epoch 3 - iter 42/64 - lr 0.001 - loss 1.771151
INFO:root:epoch 3 - iter 48/64 - lr 0.001 - loss 1.761277
INFO:root:epoch 3 - iter 54/64 - lr 0.001 - loss 1.744906
INFO:root:epoch 3 - iter 60/64 - lr 0.001 - loss 1.721840
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 4 - iter 0/64 - lr 0.001 - loss 1.548888


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.47      0.05       700
           O       0.95      0.51      0.66     22866

    accuracy                           0.50     24232
   macro avg       0.33      0.33      0.24     24232
weighted avg       0.89      0.50      0.63     24232



INFO:root:epoch 4 - iter 6/64 - lr 0.001 - loss 1.516894
INFO:root:epoch 4 - iter 12/64 - lr 0.001 - loss 1.481447
INFO:root:epoch 4 - iter 18/64 - lr 0.001 - loss 1.475399
INFO:root:epoch 4 - iter 24/64 - lr 0.001 - loss 1.470049
INFO:root:epoch 4 - iter 30/64 - lr 0.001 - loss 1.447426
INFO:root:epoch 4 - iter 36/64 - lr 0.001 - loss 1.429865
INFO:root:epoch 4 - iter 42/64 - lr 0.001 - loss 1.420563
INFO:root:epoch 4 - iter 48/64 - lr 0.001 - loss 1.408418
INFO:root:epoch 4 - iter 54/64 - lr 0.001 - loss 1.386953
INFO:root:epoch 4 - iter 60/64 - lr 0.001 - loss 1.371205
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 5 - iter 0/64 - lr 0.001 - loss 1.153255


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.47      0.06       700
           O       0.95      0.52      0.67     22866

    accuracy                           0.51     24232
   macro avg       0.33      0.33      0.24     24232
weighted avg       0.90      0.51      0.64     24232



INFO:root:epoch 5 - iter 6/64 - lr 0.001 - loss 1.189467
INFO:root:epoch 5 - iter 12/64 - lr 0.001 - loss 1.142570
INFO:root:epoch 5 - iter 18/64 - lr 0.001 - loss 1.133131
INFO:root:epoch 5 - iter 24/64 - lr 0.001 - loss 1.118973
INFO:root:epoch 5 - iter 30/64 - lr 0.001 - loss 1.105829
INFO:root:epoch 5 - iter 36/64 - lr 0.001 - loss 1.085288
INFO:root:epoch 5 - iter 42/64 - lr 0.001 - loss 1.068281
INFO:root:epoch 5 - iter 48/64 - lr 0.001 - loss 1.050989
INFO:root:epoch 5 - iter 54/64 - lr 0.001 - loss 1.033107
INFO:root:epoch 5 - iter 60/64 - lr 0.001 - loss 1.011347
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 6 - iter 0/64 - lr 0.001 - loss 0.901514


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.47      0.06       700
           O       0.95      0.55      0.69     22866

    accuracy                           0.53     24232
   macro avg       0.33      0.34      0.25     24232
weighted avg       0.90      0.53      0.66     24232



INFO:root:epoch 6 - iter 6/64 - lr 0.001 - loss 0.791912
INFO:root:epoch 6 - iter 12/64 - lr 0.001 - loss 0.784135
INFO:root:epoch 6 - iter 18/64 - lr 0.001 - loss 0.778692
INFO:root:epoch 6 - iter 24/64 - lr 0.001 - loss 0.754296
INFO:root:epoch 6 - iter 30/64 - lr 0.001 - loss 0.733340
INFO:root:epoch 6 - iter 36/64 - lr 0.001 - loss 0.717377
INFO:root:epoch 6 - iter 42/64 - lr 0.001 - loss 0.700777
INFO:root:epoch 6 - iter 48/64 - lr 0.001 - loss 0.689825
INFO:root:epoch 6 - iter 54/64 - lr 0.001 - loss 0.678810
INFO:root:epoch 6 - iter 60/64 - lr 0.001 - loss 0.665486
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 7 - iter 0/64 - lr 0.001 - loss 0.435701


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.03      0.24      0.06       700
           O       0.95      0.79      0.87     22866

    accuracy                           0.76     24232
   macro avg       0.33      0.35      0.31     24232
weighted avg       0.90      0.76      0.82     24232



INFO:root:epoch 7 - iter 6/64 - lr 0.001 - loss 0.516353
INFO:root:epoch 7 - iter 12/64 - lr 0.001 - loss 0.484391
INFO:root:epoch 7 - iter 18/64 - lr 0.001 - loss 0.462261
INFO:root:epoch 7 - iter 24/64 - lr 0.001 - loss 0.454506
INFO:root:epoch 7 - iter 30/64 - lr 0.001 - loss 0.445299
INFO:root:epoch 7 - iter 36/64 - lr 0.001 - loss 0.437990
INFO:root:epoch 7 - iter 42/64 - lr 0.001 - loss 0.428016
INFO:root:epoch 7 - iter 48/64 - lr 0.001 - loss 0.418712
INFO:root:epoch 7 - iter 54/64 - lr 0.001 - loss 0.411626
INFO:root:epoch 7 - iter 60/64 - lr 0.001 - loss 0.405179
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 8 - iter 0/64 - lr 0.001 - loss 0.306551


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.02      0.04      0.02       700
           O       0.94      0.93      0.94     22866

    accuracy                           0.88     24232
   macro avg       0.32      0.32      0.32     24232
weighted avg       0.89      0.88      0.88     24232



INFO:root:epoch 8 - iter 6/64 - lr 0.001 - loss 0.309499
INFO:root:epoch 8 - iter 12/64 - lr 0.001 - loss 0.310398
INFO:root:epoch 8 - iter 18/64 - lr 0.001 - loss 0.310386
INFO:root:epoch 8 - iter 24/64 - lr 0.001 - loss 0.300969
INFO:root:epoch 8 - iter 30/64 - lr 0.001 - loss 0.294472
INFO:root:epoch 8 - iter 36/64 - lr 0.001 - loss 0.300995
INFO:root:epoch 8 - iter 42/64 - lr 0.001 - loss 0.294698
INFO:root:epoch 8 - iter 48/64 - lr 0.001 - loss 0.289950
INFO:root:epoch 8 - iter 54/64 - lr 0.001 - loss 0.282846
INFO:root:epoch 8 - iter 60/64 - lr 0.001 - loss 0.280123
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 9 - iter 0/64 - lr 0.001 - loss 0.273413


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.00      0.00      0.00       700
           O       0.94      0.99      0.96     22866

    accuracy                           0.93     24232
   macro avg       0.31      0.33      0.32     24232
weighted avg       0.89      0.93      0.91     24232



INFO:root:epoch 9 - iter 6/64 - lr 0.001 - loss 0.250730
INFO:root:epoch 9 - iter 12/64 - lr 0.001 - loss 0.241038
INFO:root:epoch 9 - iter 18/64 - lr 0.001 - loss 0.257493
INFO:root:epoch 9 - iter 24/64 - lr 0.001 - loss 0.255746
INFO:root:epoch 9 - iter 30/64 - lr 0.001 - loss 0.257718
INFO:root:epoch 9 - iter 36/64 - lr 0.001 - loss 0.253844
INFO:root:epoch 9 - iter 42/64 - lr 0.001 - loss 0.252046
INFO:root:epoch 9 - iter 48/64 - lr 0.001 - loss 0.249835
INFO:root:epoch 9 - iter 54/64 - lr 0.001 - loss 0.249897
INFO:root:epoch 9 - iter 60/64 - lr 0.001 - loss 0.249378
INFO:root:----------------------- Evaluate DEV set -----------------------
INFO:root:epoch 10 - iter 0/64 - lr 0.001 - loss 0.184081


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.00      0.00      0.00       700
           O       0.94      0.99      0.97     22866

    accuracy                           0.94     24232
   macro avg       0.31      0.33      0.32     24232
weighted avg       0.89      0.94      0.91     24232



INFO:root:epoch 10 - iter 6/64 - lr 0.001 - loss 0.227476
INFO:root:epoch 10 - iter 12/64 - lr 0.001 - loss 0.230748
INFO:root:epoch 10 - iter 18/64 - lr 0.001 - loss 0.241513
INFO:root:epoch 10 - iter 24/64 - lr 0.001 - loss 0.250231
INFO:root:epoch 10 - iter 30/64 - lr 0.001 - loss 0.244762
INFO:root:epoch 10 - iter 36/64 - lr 0.001 - loss 0.241352
INFO:root:epoch 10 - iter 42/64 - lr 0.001 - loss 0.242827
INFO:root:epoch 10 - iter 48/64 - lr 0.001 - loss 0.239149
INFO:root:epoch 10 - iter 54/64 - lr 0.001 - loss 0.239691
INFO:root:epoch 10 - iter 60/64 - lr 0.001 - loss 0.239906
INFO:root:----------------------- Evaluate DEV set -----------------------


              precision    recall  f1-score   support

           B       0.00      0.00      0.00       666
           I       0.00      0.00      0.00       700
           O       0.94      1.00      0.97     22866

    accuracy                           0.94     24232
   macro avg       0.31      0.33      0.32     24232
weighted avg       0.89      0.94      0.91     24232



## Evaluate

In [35]:
tagger.eval()

Tagger(
  (embedding2nn): Linear(in_features=100, out_features=100, bias=True)
  (rnn): LSTM(100, 50, batch_first=True, bidirectional=True)
  (ffn): Linear(in_features=100, out_features=6, bias=True)
  (crf): CRF()
)

In [72]:
example_list_to_eval = dev_data[:560]

In [73]:
scores, all_pred, all_true = evaluate(example_list_to_eval, use_crf=use_crf)

In [69]:
print(dev_data[0])
print(dev_data[560])

Sentence: "do you have any medication allergies ? `` asthma ! ! ! '' me `` ... ... .. '' pt `` no wait . avelox , that is it ! '' `` so no other allergies ? '' `` right ! '' cont" - 43 Tokens
Sentence: "do you have any medication allergies ? `` asthma ! ! ! '' me `` ... ... .. '' pt `` no wait . avelox , that is it ! '' `` so no other allergies ? '' `` right ! '' cont" - 43 Tokens


In [78]:
all_pred_arry = np.asarray(all_pred)


In [79]:
labeled_dataset['predicted_labels'] = all_pred_arry

# for i in range(0, len(labeled_dataset)):
#     labeled_dataset['predicted_labels'][i] = all_pred[i]

labeled_dataset.head()

Unnamed: 0,tweet_id,begin,end,type,extraction,drug,tweet,meddra_code,meddra_term,cleaned_tweet,tokennized_cleaned_tweet,extraction_token,labeled_sequence,predicted_labels
0,332317478170546176,28,37,ADR,allergies,avelox,"do you have any medication allergies? ""asthma!...",10013700.0,drug allergy,"do you have any medication allergies? ""asthma!...","[do, you, have, any, medication, allergies, ?,...",[allergies],"[O, O, O, O, O, B, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,347806215776116737,31,46,ADR,HURT YOUR Liver,avelox,"@ashleylvivian if #avelox has hurt your liver,...",10024700.0,liver damage,"if avelox has hurt your liver, avoid tylenol ...","[if, avelox, has, hurt, your, liver, ,, avoid,...","[HURT, YOUR, Liver]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,350336129817509888,48,50,ADR,AD,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention deficit disorder,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[AD],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,350336129817509888,88,93,ADR,focus,baclofen,"apparently, baclofen greatly exacerbates the ""...",10003700.0,attention impaired,"apparently, baclofen greatly exacerbates the ""...","[apparently, ,, baclofen, greatly, exacerbates...",[focus],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,332540699692130304,11,15,ADR,died,cipro,pt of mine died from cipro rt @ciproispoison: ...,10011900.0,death,pt of mine died from cipro rt if only more d...,"[pt, of, mine, died, from, cipro, rt, if, only...",[died],"[O, O, O, B, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [80]:
labeled_dataset.to_csv('data/validation_file_with_predictions.csv')