In [None]:
#!pip install tensorflow==1.14.0

In [None]:
!pip -qqq install git+https://www.github.com/keras-team/keras-contrib.git
#!pip -qqq install --upgrade keras

  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone


In [None]:
!pip install keras==2.2.4



In [None]:
!pip install sklearn_crfsuite



In [None]:
import pandas as pd
import numpy as np

import os
import re
import string
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from google.colab import files

import warnings
warnings.filterwarnings("ignore")

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import LSTM, Dense, TimeDistributed, Embedding, Bidirectional
from keras.layers import Dropout, Flatten, RepeatVector, Activation, Permute, merge, Lambda
from keras.models import Model, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier #KerasRegressor
from keras.preprocessing.text import text_to_word_sequence
#import keras

from tqdm import tqdm
import scipy.stats
from collections import Counter
from sklearn.utils import class_weight
from sklearn.metrics import make_scorer, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold, cross_val_score

from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score, flat_precision_score, flat_recall_score



#np.random.seed(1337) # for reproducibility
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss

In [None]:
from numpy.random import seed

# Seed NumPy's global random generator. `seed` imported above is the same
# function as `np.random.seed`, so a single call is enough: when both were
# called in sequence only the last value (0) stayed in effect.
# NOTE(review): no TensorFlow/Keras seed is set anywhere in this notebook, so
# weight initialisation is presumably still non-deterministic - confirm whether
# full reproducibility is required.
np.random.seed(0)

In [None]:
def get_approx_metrics(df, test_phrases, model, X_test, index_to_tag=None):
  """Compute approximate-match precision, recall and F1 for I_ADR spans.

  A span counts as correctly identified as soon as at least one token inside
  its [start, end] index range is predicted as ``I_ADR``. Predicted spans are
  runs of adjacent tokens predicted as ``I_ADR``.

  Parameters
  ----------
  df : DataFrame with one row per token. The columns read here are
      'sentence', 'I_ADR', 'I_Drug' and 'new_index_IADR'. The frame is
      modified in place: 'I_ADR' / 'I_Drug' are cast to str and the columns
      'I_ADR_split' / 'I_Drug_split' are (re)created.
  test_phrases : sequence of sentence strings, aligned with ``X_test``.
  model : object exposing ``predict(batch)`` that returns per-token class scores.
  X_test : indexable collection of encoded sentences, aligned with ``test_phrases``.
  index_to_tag : optional mapping from class index to tag name. Defaults to
      the notebook-level ``idx2tag`` mapping.

  Returns
  -------
  (precision, recall, f1) as floats. A ratio whose denominator is zero is
  reported as 0.0 instead of raising ZeroDivisionError.
  """
  if index_to_tag is None:
    index_to_tag = idx2tag

  # Annotations are stored as comma-separated strings; split them into lists.
  df['I_ADR'] = df['I_ADR'].astype(str)
  df['I_Drug'] = df['I_Drug'].astype(str)
  df['I_ADR_split'] = [value.split(',') for value in df['I_ADR']]
  df['I_Drug_split'] = [value.split(',') for value in df['I_Drug']]

  count_correctly_predicted_IADR = 0  # actual spans hit by at least one I_ADR prediction
  count_actual_IADR = 0               # actual ADR spans
  count_predicted_IADR = 0            # predicted ADR spans

  for i, phrase in enumerate(test_phrases):
    # Filter once per sentence instead of once for the test and once for the use.
    sentence_rows = df[df['sentence'] == phrase]
    if len(sentence_rows) == 0:
      # No exact text match in df: report the sentence and leave it out of the counts.
      print('To check special characters : ', phrase)
      continue

    # reset_index keeps the original df row labels in an 'index' column, which
    # is what the span bounds in 'new_index_IADR' are compared against below.
    df_n = sentence_rows.reset_index()

    # Predict tags for this sentence and drop every position predicted as PAD.
    class_ids = np.argmax(model.predict(np.array([X_test[i]])), axis=-1)
    predicted_tags = [index_to_tag[k] for k in class_ids[0]]
    predicted_tags = [tag for tag in predicted_tags if tag != 'PAD']
    df_n['prediction'] = predicted_tags

    # Sentence-level annotations are read from the second row, as before; a
    # one-token sentence only has row 0, so fall back to it instead of raising.
    reference_row = df_n.iloc[min(1, len(df_n) - 1)]
    actual_spans = reference_row['I_ADR_split']
    span_bounds = reference_row['new_index_IADR']
    count_actual_IADR += len(actual_spans)

    # Predicted spans: every gap larger than 1 between I_ADR positions opens a new span.
    adr_positions = [pos for pos, tag in enumerate(predicted_tags) if tag == "I_ADR"]
    if adr_positions:
      span_breaks = sum(1 for left, right in zip(adr_positions, adr_positions[1:])
                        if right - left > 1)
      count_predicted_IADR += span_breaks + 1

    # Actual spans: bounds are stored flat as [start_0, end_0, start_1, end_1, ...].
    for j in range(len(actual_spans)):
      bounds = span_bounds[2*j:2*(j+1)]
      in_span = (df_n['index'] >= bounds[0]) & (df_n['index'] <= bounds[1])
      if (df_n.loc[in_span, 'prediction'] == 'I_ADR').any():
        count_correctly_predicted_IADR += 1

  # Guard every ratio: a model that predicts no I_ADR at all must not crash the evaluation.
  approximate_match_precision = (count_correctly_predicted_IADR / count_predicted_IADR
                                 if count_predicted_IADR else 0.0)
  approximate_match_recall = (count_correctly_predicted_IADR / count_actual_IADR
                              if count_actual_IADR else 0.0)
  precision_plus_recall = approximate_match_precision + approximate_match_recall
  approximate_match_F1score = (2 * approximate_match_precision * approximate_match_recall /
                               precision_plus_recall if precision_plus_recall else 0.0)

  return approximate_match_precision, approximate_match_recall, approximate_match_F1score


In [None]:
# Load the pickled dataset; the rest of the notebook uses it as a DataFrame
# with (at least) 'Word', 'tag' and 'sentence' columns.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file -
# only load this pickle if it comes from a trusted source.
with open('ADE_POS_to_NER_withPOS_TAG.pickle', 'rb') as handle:
    df = pickle.load(handle)

In [None]:
class SentenceGetter(object):
    """Group a token-level table into per-sentence lists of (word, tag) pairs.

    Parameters
    ----------
    data : DataFrame with 'sentence', 'Word' and 'tag' columns.

    Attributes
    ----------
    grouped : result of grouping ``data`` by 'sentence'; one list of
        (word, tag) tuples per sentence key.
    sentences : the same lists as a plain Python list, in ``grouped`` order.
    n_sent : 1-based counter used by ``get_next``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        # Not read anywhere in this class; kept so the attribute stays available.
        self.empty = False

        def collect_pairs(group):
            # Pair every word of one sentence with its tag, preserving row order.
            return list(zip(group['Word'].values.tolist(),
                            group['tag'].values.tolist()))

        self.grouped = self.data.groupby('sentence').apply(collect_pairs)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the pairs stored under the key 'Sentence: <n_sent>' and advance.

        Returns None (without advancing the counter) when no such key exists.
        Only the missing-key case is handled; any other error now propagates
        instead of being silently swallowed by a bare ``except``.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [None]:
# Group the token table into sentences: each sentence is a list of (word, tag) pairs.
# (The joined-text version that used to be built first was overwritten right away,
# so only the tagged form is computed.)
getter = SentenceGetter(df)
sentences = getter.sentences

# Vocabulary size, counting the extra "ENDPAD" entry appended here.
words = list(set(df["Word"].values))
words.append("ENDPAD")
n_words = len(words)
print('Total unique words: ', n_words)

# Longest sentence, in tokens; used as the padding length further down.
max_len = max([len(s) for s in sentences])
print ('Maximum sequence length:', max_len)

tags = list(set(df["tag"].values))
print('Tags are : ', tags)
num_tag = df['tag'].nunique()
print('Number of tags : ', num_tag)

Total unique words:  8671
Maximum sequence length: 94
Tags are :  ['I_ADR', 'O', 'I_Drug']
Number of tags :  3


In [None]:
# Vocabulary and label inventory taken from the data.
words = list(df['Word'].unique())
tags = list(df['tag'].unique())

# word -> integer id. Ids 0 and 1 are reserved for padding and unknown words,
# so real words are numbered from 2.
word_to_index = {word: position for position, word in enumerate(words, start=2)}
word_to_index["UNK"] = 1
word_to_index["PAD"] = 0

# tag -> integer id. Id 0 is reserved for padding, so real tags are numbered from 1.
tag_to_index = {tag: position for position, tag in enumerate(tags, start=1)}
tag_to_index["PAD"] = 0

# Reverse lookups: integer id -> word / tag.
idx2word = dict((index, word) for word, index in word_to_index.items())
idx2tag = dict((index, tag) for tag, index in tag_to_index.items())
idx2tag

{0: 'PAD', 1: 'O', 2: 'I_ADR', 3: 'I_Drug'}

In [None]:
def sentences_to_vectors(sentences, max_len=max_len, num_tag=num_tag):
  """Encode tagged sentences as padded id sequences and one-hot labels.

  Parameters
  ----------
  sentences : list of sentences, each a list of (word, tag) pairs.
  max_len : length every sequence is post-padded to.
  num_tag : number of real tags; class 0 is the extra PAD class.

  Returns
  -------
  X : array of word ids, post-padded with the PAD id.
  y : list with one one-hot array (num_tag + 1 classes) per sentence.
  n_class : list with the number of label positions per class index
      0..num_tag, padding included.
  class_weights : set of ``max(n_class) / n_class[i]`` values.
      NOTE(review): this is a set, not a dict keyed by class index, so the
      class -> weight association is lost and equal weights collapse. Keras
      documents ``class_weight`` as a dict, so these values are presumably
      not applied during training - confirm before changing the type, since
      the fit() calls in this notebook receive this object as is.
  """
  # Words -> integer ids, then pad every sequence to the same length.
  X = [[word_to_index[w[0]] for w in s] for s in sentences]
  X = pad_sequences(maxlen = max_len, sequences = X, padding = "post", value = word_to_index["PAD"])

  # Tags -> integer ids, padded the same way.
  y = [[tag_to_index[w[1]] for w in s] for s in sentences]
  y = pad_sequences(maxlen = max_len, sequences = y, padding = "post", value = tag_to_index["PAD"])

  # Count label occurrences in one pass over the padded matrix instead of
  # rebuilding a Counter for every row once per class.
  n_class = np.bincount(y.ravel(), minlength=num_tag + 1)[:num_tag + 1].tolist()
  class_weights = {max(n_class)/n_class[i] for i in range(len(n_class))}

  # One-hot encode the labels.
  y = [to_categorical(i, num_classes = num_tag + 1) for i in y]

  return X, y, n_class, class_weights

In [None]:
# 70/30 split at sentence level with a fixed random_state.
train_sentences, test_sentences = train_test_split(sentences, test_size=0.3, random_state=1)

# Both calls assign n_class and class_weights; the training call runs last, so
# the values left in the namespace are the ones computed on the training split.
X_test, y_test, n_class, class_weights = sentences_to_vectors(test_sentences)
X_train, y_train, n_class, class_weights = sentences_to_vectors(train_sentences)

In [None]:
# Display the class weights computed on the training split (a set of ratios, see output).
class_weights

{1.0, 4.897698998128372, 25.371164594502112, 51.074397244546496}

In [None]:
# Training hyper-parameters shared by both models below.
# batch_size and epochs are passed to fit(); training may stop earlier through EarlyStopping.
batch_size = 16
epochs = 30
# Default output dimension of the Embedding layer in make_model_BiLSTM.
embedding = 40

In [None]:
#class_weights = {1, 2, 50, 200 }

def make_model_BiLSTM(num_tag, embedding=embedding, hidden_size=50, optimizer='rmsprop'):
    """Build, compile and summarise the BiLSTM token tagger.

    Architecture: Embedding -> bidirectional LSTM -> per-timestep softmax over
    ``num_tag + 1`` classes (the extra class is padding). Input length and
    vocabulary size come from the notebook-level ``max_len`` and ``words``.

    Parameters
    ----------
    num_tag : number of real tags.
    embedding : output dimension of the Embedding layer.
    hidden_size : units of each LSTM direction.
    optimizer : optimizer passed to ``compile``.

    Returns
    -------
    The compiled Keras model (its summary is printed as a side effect).
    """
    token_ids = Input(shape=(max_len,))
    # len(words) + 2 accounts for the reserved PAD and UNK ids.
    embedded = Embedding(input_dim=len(words) + 2, output_dim=embedding, input_length=max_len)(token_ids)
    encoded = Bidirectional(LSTM(units=hidden_size, return_sequences=True, recurrent_dropout=0.1))(embedded)
    tag_probabilities = TimeDistributed(Dense(num_tag + 1, activation="softmax"))(encoded)

    tagger = Model(token_ids, tag_probabilities)
    tagger.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

    tagger.summary()
    return tagger

model_BiLSTM = make_model_BiLSTM(num_tag)

# Stop once the validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1, restore_best_weights=True)
# Save the best weights to disk. The checkpoint follows 'val_loss', the same
# quantity as early stopping: with the Keras version pinned in this notebook
# (2.2.4) the accuracy metric is logged as 'val_acc', so monitoring
# 'val_accuracy' found nothing to compare and never wrote the file.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min',
                     save_weights_only=True, save_best_only=True, verbose=1)

In [None]:


# Train the BiLSTM tagger; 10% of the training data is held out for validation.
# NOTE(review): class_weights is a set (see sentences_to_vectors), not the dict
# Keras documents for class_weight - presumably it has no effect; confirm.
history_1 = model_BiLSTM.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                    validation_split=0.1, callbacks=[es, mc], class_weight=class_weights)

# Token-level evaluation on the test split.
y_pred = model_BiLSTM.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert class indices back to tag names.
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]

# Report on the real tags only; padding positions are excluded from the report.
labels = list(idx2tag.values())
labels.remove('PAD')
print(labels)

report = flat_classification_report(y_pred=y_pred, y_true=y_test_true, labels=labels, digits=4)
print(report)

# Span-level (approximate match) evaluation. The sentence texts are rebuilt
# here so this cell no longer depends on a later cell having been run first.
test_phrases = [" ".join([s[0] for s in sent]) for sent in test_sentences]
approximate_match_precision, approximate_match_recall, approximate_match_F1score = get_approx_metrics(
    df, test_phrases, model_BiLSTM, X_test)

print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 94)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 94, 40)            346880    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 94, 100)           36400     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 94, 4)             404       
Total params: 383,684
Trainable params: 383,684
Non-trainable params: 0
_________________________________________________________________
Train on 2690 samples, validate on 299 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Restoring model weights from the end of the best epoch
Epoch 00010: early stopping
['O', 'I_ADR', 'I_Drug']
  

In [None]:
# Show the BiLSTM prediction next to the true tags for one randomly drawn test sentence.
test_phrases = [" ".join([s[0] for s in sent]) for sent in test_sentences]

i = np.random.randint(0,X_test.shape[0]) # random index into the test split
p = model_BiLSTM.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_test[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_test.shape[0]))

# Visualization
print("{:18}||{:6}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_test[i], true, p[0]):
    if w != 0:  # id 0 is padding
        # Look the word up through idx2word: words[w-2] is only valid for ids >= 2
        # and would show the last vocabulary word for the UNK id (1).
        print("{:18}: {:6} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))

Sample number 910 of 1282 (Test Set)
Word              ||True  ||Pred
This              : O      O
patient           : O      O
rapidly           : O      O
progressed        : O      O
from              : O      O
mild              : I_ADR  I_ADR
neurotoxicity     : I_ADR  I_ADR
to                : O      I_ADR
fatal             : I_ADR  I_ADR
encephalopathy    : I_ADR  I_ADR
after             : O      O
one               : O      O
dose              : O      O
of                : O      O
intrathecal       : O      O
methotrexate      : I_Drug I_Drug
during            : O      O
his               : O      O
third             : O      O
cycle             : O      O
of                : O      O
chemotherapy      : O      O


To check special characters :  We conclude that neurosurgeons and neurologists should be aware of calcium antagonist - - related ileus in patients treated with nimodipine
To check special characters :  In eight patients a mean decrease in serum Na of 8.25 / - 3.2 mEq / L was observed after a single 200 mg intravenous dose of lorcainide
To check special characters :  Vancomycin is widely used against methicillin - resistant Staphylococcus aureus infections but it is associated with many adverse effects such as nephrotoxicity ototoxicity gastrointestinal disturbances blood disorders and two types of hypersensitivity reactions - an anaphylactoid reaction known as red man syndrome and anaphylaxis
To check special characters :  5 - Fluorouracil cardiotoxicity complicating treatment of stage IIB cervical cancer - - case report
To check special characters :  Carboplatin hypersensitivity presenting as coronary vasospasm - a case report
To check special characters :  We describe a patient who d

In [None]:
# Approximate-match (span overlap) evaluation of the BiLSTM tagger on the test
# split, printing the raw span counts alongside precision / recall / F1.
test_phrases = [" ".join([s[0] for s in sent]) for sent in test_sentences]

# Annotations are stored as comma-separated strings; split them into lists.
df['I_ADR'] = df['I_ADR'].astype(str)
df['I_Drug'] = df['I_Drug'].astype(str)
df['I_ADR_split'] = list(map(lambda x: x.split(','), df['I_ADR']))
df['I_Drug_split'] = list(map(lambda x: x.split(','), df['I_Drug']))

count_correctly_predicted_IADR = 0 # actual ADR spans hit by at least one I_ADR prediction
count_actual_IADR = 0 # actual ADR spans
count_predicted_IADR = 0 # predicted ADR spans


for i in range(len(test_phrases)):

  # Sentences whose text has no exact match in df are skipped silently.
  # The filter is evaluated once per sentence and reused.
  sentence_rows = df[df['sentence']==test_phrases[i]]
  if len(sentence_rows) > 0:
    # reset_index keeps the original df row labels in an 'index' column.
    df_n = sentence_rows.reset_index()

    # Predict, convert class indices to tags and drop positions predicted as PAD.
    y_pred = model_BiLSTM.predict(np.array([X_test[i]]))
    y_pred = np.argmax(y_pred, axis=-1)
    y_pred = [[idx2tag[k] for k in row] for row in y_pred]
    y_pred = list(filter(lambda x: x!= 'PAD', y_pred[0]))

    df_n['prediction'] = y_pred

    # Sentence-level annotations are read from the second row; a one-token
    # sentence only has row 0, so fall back to it instead of raising KeyError.
    reference_row = df_n.iloc[min(1, len(df_n) - 1)]

    # Actual I_ADR spans
    n_actual_IADR = len(reference_row['I_ADR_split'])
    count_actual_IADR += n_actual_IADR

    # Predicted I_ADR spans: every gap larger than 1 between I_ADR positions opens a new span.
    indices_row_predited_ADR = [k for k, x in enumerate(df_n['prediction'].to_list()) if x == "I_ADR"]
    if indices_row_predited_ADR:
      check_continuity = ([(indices_row_predited_ADR[k+1] - indices_row_predited_ADR[k])
                          for k in range(len(indices_row_predited_ADR)-1)])
      count_predicted_IADR += len([gap for gap in check_continuity if gap > 1]) + 1

    # Span bounds are stored flat as [start_0, end_0, start_1, end_1, ...].
    for j in range(n_actual_IADR):
      indexes = reference_row['new_index_IADR'][2*j:2*(j+1)]

      df_groupe_words = df_n[(df_n['index']>=indexes[0]) & (df_n['index']<=indexes[1])]
      n_pred = len(df_groupe_words[df_groupe_words['prediction']=='I_ADR'])
      if n_pred > 0:
        count_correctly_predicted_IADR += 1

# Guard every ratio so a model that predicts no I_ADR at all reports 0 instead
# of raising ZeroDivisionError.
approximate_match_precision = (count_correctly_predicted_IADR/count_predicted_IADR
                               if count_predicted_IADR else 0.0)
approximate_match_recall = (count_correctly_predicted_IADR/count_actual_IADR
                            if count_actual_IADR else 0.0)
precision_plus_recall = approximate_match_precision + approximate_match_recall
approximate_match_F1score = (2 * approximate_match_precision * approximate_match_recall /
                             precision_plus_recall if precision_plus_recall else 0.0)

print('Number of ADR actual : ', count_actual_IADR)
print('Number of ADR predicted : ', count_predicted_IADR)
print('Number of ADR correctly identified  : ',count_correctly_predicted_IADR)
print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))


In [None]:
# Display the repr of the trained BiLSTM model object.
model_BiLSTM

<keras.engine.training.Model at 0x7faa29615fd0>

In [None]:
# Approximate-match metrics for the BiLSTM model through the shared helper.
# The helper also prints every test sentence it cannot find in df (see output).
approximate_match_precision, approximate_match_recall, approximate_match_F1score = get_approx_metrics(
    df, test_phrases, model_BiLSTM, X_test)
print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))

To check special characters :  We conclude that neurosurgeons and neurologists should be aware of calcium antagonist - - related ileus in patients treated with nimodipine
To check special characters :  In eight patients a mean decrease in serum Na of 8.25 / - 3.2 mEq / L was observed after a single 200 mg intravenous dose of lorcainide
To check special characters :  Vancomycin is widely used against methicillin - resistant Staphylococcus aureus infections but it is associated with many adverse effects such as nephrotoxicity ototoxicity gastrointestinal disturbances blood disorders and two types of hypersensitivity reactions - an anaphylactoid reaction known as red man syndrome and anaphylaxis
To check special characters :  5 - Fluorouracil cardiotoxicity complicating treatment of stage IIB cervical cancer - - case report
To check special characters :  Carboplatin hypersensitivity presenting as coronary vasospasm - a case report
To check special characters :  We describe a patient who d

#Bi LSTM + CRF

In [None]:
def make_model_BiLSTM_CRF(num_tag, embedding=40, hidden_size=50, optimizer='rmsprop'):
    """Build, compile and summarise the BiLSTM + CRF token tagger.

    Architecture: Embedding -> bidirectional LSTM -> per-timestep Dense (relu)
    -> CRF layer over ``num_tag + 1`` classes (the extra class is padding).
    Input length and vocabulary size come from the notebook-level ``max_len``
    and ``words``. Loss and metric are the ones exposed by the CRF layer.

    Parameters
    ----------
    num_tag : number of real tags.
    embedding : output dimension of the Embedding layer.
    hidden_size : units of each LSTM direction and of the Dense layer.
    optimizer : optimizer passed to ``compile``.

    Returns
    -------
    The compiled Keras model (its summary is printed as a side effect).
    """
    token_ids = Input(shape=(max_len,))
    # len(words) + 2 accounts for the reserved PAD and UNK ids; masking is explicitly off.
    features = Embedding(input_dim=len(words) + 2, output_dim=embedding, input_length=max_len, mask_zero=False)(token_ids)
    features = Bidirectional(LSTM(units=hidden_size, return_sequences=True, recurrent_dropout=0.1))(features)
    features = TimeDistributed(Dense(hidden_size, activation="relu"))(features)
    crf_layer = CRF(num_tag + 1)
    tag_scores = crf_layer(features)

    tagger = Model(token_ids, tag_scores)
    tagger.compile(optimizer=optimizer, loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])

    tagger.summary()
    return tagger

model_BiLSTM_CRF = make_model_BiLSTM_CRF(num_tag)

# Stop once the validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
es = EarlyStopping(monitor='val_loss',mode='min', patience=3, verbose=1, restore_best_weights=True)
# Save the best weights to disk. The model is compiled with the CRF layer's own
# accuracy metric, so no 'val_accuracy' entry is logged and a checkpoint
# monitoring it would never be written; follow 'val_loss' like early stopping.
# NOTE(review): this reuses the file name 'best_model.h5' from the BiLSTM run,
# so that checkpoint gets overwritten - confirm this is intended.
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min',
                     save_weights_only=True, save_best_only=True, verbose=1)

# NOTE(review): class_weights is a set (see sentences_to_vectors), not the dict
# Keras documents for class_weight - presumably it has no effect; confirm.
history_2 = model_BiLSTM_CRF.fit(X_train, np.array(y_train), batch_size=batch_size, epochs=epochs,
                    validation_split=0.1, callbacks=[es, mc], class_weight=class_weights)

# Token-level evaluation on the test split.
y_pred = model_BiLSTM_CRF.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert class indices back to tag names.
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]

# Report on the real tags only; padding positions are excluded from the report.
labels = list(idx2tag.values())
labels.remove('PAD')
print(labels)

report = flat_classification_report(y_pred=y_pred, y_true=y_test_true, labels=labels, digits=4)
print(report)

# Span-level (approximate match) evaluation; the sentence texts are rebuilt here
# so the cell does not rely on an earlier cell having defined them.
test_phrases = [" ".join([s[0] for s in sent]) for sent in test_sentences]
approximate_match_precision, approximate_match_recall, approximate_match_F1score = get_approx_metrics(
    df, test_phrases, model_BiLSTM_CRF,X_test)
print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 94)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 94, 40)            346880    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 94, 100)           36400     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 94, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 94, 4)             228       
Total params: 388,558
Trainable params: 388,558
Non-trainable params: 0
_________________________________________________________________
Train on 2690 samples, validate on 299 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
E

TypeError: ignored

In [None]:
# Show the BiLSTM + CRF prediction next to the true tags for one randomly drawn test sentence.
i = np.random.randint(0,X_test.shape[0]) # random index into the test split
p = model_BiLSTM_CRF.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_test[i], -1)

print("Sample number {} of {} (Test Set)".format(i, X_test.shape[0]))

# Visualization
print("{:18}||{:6}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_test[i], true, p[0]):
    if w != 0:  # id 0 is padding
        # Look the word up through idx2word: words[w-2] is only valid for ids >= 2
        # and would show the last vocabulary word for the UNK id (1).
        print("{:18}: {:6} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))

Sample number 153 of 1282 (Test Set)
Word              ||True  ||Pred
Hydroxyurea       : I_Drug I_Drug
associated        : O      O
with              : O      O
concomitant       : O      O
occurrence        : O      O
of                : O      O
diffuse           : I_ADR  I_ADR
longitudinal      : I_ADR  I_ADR
melanonychia      : I_ADR  I_ADR
and               : O      O
multiple          : I_ADR  I_ADR
squamous          : I_ADR  I_ADR
cell              : I_ADR  I_ADR
carcinomas        : I_ADR  I_ADR
in                : O      O
an                : O      O
elderly           : O      O
subject           : O      O


In [None]:
# Approximate-match (span overlap) evaluation of the BiLSTM + CRF tagger on the
# test split, printing the raw span counts alongside precision / recall / F1.
test_phrases = [" ".join([s[0] for s in sent]) for sent in test_sentences]

# Annotations are stored as comma-separated strings; split them into lists.
df['I_ADR'] = df['I_ADR'].astype(str)
df['I_Drug'] = df['I_Drug'].astype(str)
df['I_ADR_split'] = list(map(lambda x: x.split(','), df['I_ADR']))
df['I_Drug_split'] = list(map(lambda x: x.split(','), df['I_Drug']))

count_correctly_predicted_IADR = 0 # actual ADR spans hit by at least one I_ADR prediction
count_actual_IADR = 0 # actual ADR spans
count_predicted_IADR = 0 # predicted ADR spans


for i in range(len(test_phrases)):

  # Sentences whose text has no exact match in df are skipped silently.
  # The filter is evaluated once per sentence and reused.
  sentence_rows = df[df['sentence']==test_phrases[i]]
  if len(sentence_rows) > 0:
    # reset_index keeps the original df row labels in an 'index' column.
    df_n = sentence_rows.reset_index()

    # Predict, convert class indices to tags and drop positions predicted as PAD.
    y_pred = model_BiLSTM_CRF.predict(np.array([X_test[i]]))
    y_pred = np.argmax(y_pred, axis=-1)
    y_pred = [[idx2tag[k] for k in row] for row in y_pred]
    y_pred = list(filter(lambda x: x!= 'PAD', y_pred[0]))

    df_n['prediction'] = y_pred

    # Sentence-level annotations are read from the second row; a one-token
    # sentence only has row 0, so fall back to it instead of raising KeyError.
    reference_row = df_n.iloc[min(1, len(df_n) - 1)]

    # Actual I_ADR spans
    n_actual_IADR = len(reference_row['I_ADR_split'])
    count_actual_IADR += n_actual_IADR

    # Predicted I_ADR spans: every gap larger than 1 between I_ADR positions opens a new span.
    indices_row_predited_ADR = [k for k, x in enumerate(df_n['prediction'].to_list()) if x == "I_ADR"]
    if indices_row_predited_ADR:
      check_continuity = ([(indices_row_predited_ADR[k+1] - indices_row_predited_ADR[k])
                          for k in range(len(indices_row_predited_ADR)-1)])
      count_predicted_IADR += len([gap for gap in check_continuity if gap > 1]) + 1

    # Span bounds are stored flat as [start_0, end_0, start_1, end_1, ...].
    for j in range(n_actual_IADR):
      indexes = reference_row['new_index_IADR'][2*j:2*(j+1)]

      df_groupe_words = df_n[(df_n['index']>=indexes[0]) & (df_n['index']<=indexes[1])]
      n_pred = len(df_groupe_words[df_groupe_words['prediction']=='I_ADR'])
      if n_pred > 0:
        count_correctly_predicted_IADR += 1

# Guard every ratio so a model that predicts no I_ADR at all reports 0 instead
# of raising ZeroDivisionError.
approximate_match_precision = (count_correctly_predicted_IADR/count_predicted_IADR
                               if count_predicted_IADR else 0.0)
approximate_match_recall = (count_correctly_predicted_IADR/count_actual_IADR
                            if count_actual_IADR else 0.0)
precision_plus_recall = approximate_match_precision + approximate_match_recall
approximate_match_F1score = (2 * approximate_match_precision * approximate_match_recall /
                             precision_plus_recall if precision_plus_recall else 0.0)

print('Number of ADR actual : ', count_actual_IADR)
print('Number of ADR predicted : ', count_predicted_IADR)
print('Number of ADR correctly identified  : ',count_correctly_predicted_IADR)
print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))


Number of ADR actual :  1686
Number of ADR predicted :  1517
Number of ADR correctly identified  :  1415
Approximatif precision : 93.28%
Approximatif recall : 83.93%
Approximatif F1-score : 88.35%


In [None]:
# Approximate-match metrics for the BiLSTM + CRF model through the shared helper.
# The helper also prints every test sentence it cannot find in df (see output).
approximate_match_precision, approximate_match_recall, approximate_match_F1score = get_approx_metrics(
    df, test_phrases, model_BiLSTM_CRF, X_test)
print('Approximatif precision : {:2.2%}'.format(approximate_match_precision))
print('Approximatif recall : {:2.2%}'.format(approximate_match_recall))
print('Approximatif F1-score : {:2.2%}'.format(approximate_match_F1score))

To check special characters :  We conclude that neurosurgeons and neurologists should be aware of calcium antagonist - - related ileus in patients treated with nimodipine
To check special characters :  In eight patients a mean decrease in serum Na of 8.25 / - 3.2 mEq / L was observed after a single 200 mg intravenous dose of lorcainide
To check special characters :  Vancomycin is widely used against methicillin - resistant Staphylococcus aureus infections but it is associated with many adverse effects such as nephrotoxicity ototoxicity gastrointestinal disturbances blood disorders and two types of hypersensitivity reactions - an anaphylactoid reaction known as red man syndrome and anaphylaxis
To check special characters :  5 - Fluorouracil cardiotoxicity complicating treatment of stage IIB cervical cancer - - case report
To check special characters :  Carboplatin hypersensitivity presenting as coronary vasospasm - a case report
To check special characters :  We describe a patient who d

In [None]:
# Persist the trained BiLSTM + CRF model. No notebook-level variable named
# `model` exists (that name is only local to the model-building functions), so
# the trained object is referenced by its actual name.
model_BiLSTM_CRF.save('Model_BiLSTM_CFR_Emb40_Class_weight_87')