In [1]:
import pandas as pd
import numpy as np
import re
import string
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import re
import string
import spacy
import colorlover as cl
import random
import shap

from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from plotly.subplots import make_subplots
from spacy import displacy
from nltk.tokenize import TreebankWordTokenizer as twt
from nltk.tokenize import TweetTokenizer as twt
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

#### Load pre-trained word vectors from a binary file located at the specified path. The file contains word vectors in a format compatible with Word2Vec. Only the first 100,000 word vectors are loaded.

In [2]:
model_path = './biowordvec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True, limit=100000)

In [3]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
dev_data = pd.read_csv('data/dev.csv')

data = pd.concat([train_data, test_data, dev_data], ignore_index=True)

Columns (112,114,121,125,157,168,186,188,206,208,217,229,231,232,233,235,237,241,244,246,247,249,250,251,268,270,271,280,282,286,288,289,291,305,310,312,313,315,316,318,319,323,324,327,328,329,331,333,335,336,338,342,344,348,350,352,353,355,356,358,359,361) have mixed types. Specify dtype option on import or set low_memory=False.


In [4]:
# Selected columns to be retained and rename them
selected_columns = ['id', 'context', 'annotations/0/events/0/event_type', 
                    'annotations/0/events/0/Trigger/text/0/0', 
                    'annotations/0/events/0/Treatment/Drug/text/0/0','annotations/0/events/0/Effect/text/0/0']
new_names = ['id', 'context', 'event_type', 'trigger_text', 'drug', 'drug_effect']
df = data[selected_columns].rename(columns=dict(zip(selected_columns, new_names)))

In [5]:
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}
df['sentiment'] = df['event_type'].map(sentiment_mapping)

#### Used instead of TF-IDF as its a more prevalent technique in sentiment analysis

In [6]:
def average_word_embeddings(df, column, word_embeddings):
    embeddings = []
    for document in df[column]:
        for word in document.split():
            if word in word_embeddings:
                embeddings.append(word_embeddings[word])
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros_like(word_embeddings.vector_size)

In [7]:
df['average_embeddings'] = df.apply(lambda row: average_word_embeddings(row, 'context', model), axis=1)

In [8]:
df.to_csv('phamacovigil_output_data.csv', index=False)

In [9]:
df.sample(10)

Unnamed: 0,id,context,event_type,trigger_text,drug,drug_effect,sentiment,average_embeddings
911,990658_1,A fatal case of pancytopenia due to levomeprom...,Adverse_event,due,levomepromazine,pancytopenia,1,"[0.22692822, 0.15035842, 0.21457776, 0.0808344..."
1871,17655376_1,"Pharmacokinetics of dapsone gel, 5% for the tr...",Potential_therapeutic_event,treatment,dapsone,,0,"[0.227062, 0.16987313, 0.15353839, 0.07344896,..."
3531,20925534_3,Proton-pump inhibitors (PPIs) are believed to ...,Potential_therapeutic_event,believed,Proton-pump inhibitors (PPIs),,0,"[0.25976455, 0.16897154, 0.135644, 0.030140094..."
1894,19104709_2,In contrast to chronic or subacute thyroiditis...,Potential_therapeutic_event,for,IFN-alpha,,0,"[0.24489142, 0.21867579, 0.107363515, 0.015361..."
2601,8641617_1,Although both patients recovered from the coli...,Potential_therapeutic_event,recovered,vancomycin,,0,"[0.28353158, 0.195971, 0.15358187, 0.07332772,..."
2644,19667003_2,PURPOSE: A case of carbamazepine-induced hyper...,Adverse_event,induced,carbamazepine,hyperammonemia,1,"[0.17367765, 0.03918988, 0.17666668, 0.0314042..."
3433,18837734_1,Rosaceiform eruption induced by erlotinib.,Adverse_event,induced,erlotinib,Rosaceiform eruption,1,"[0.26265162, 0.10211046, 0.18507987, 0.0087879..."
3369,16317298_1,Gemcitabine-related radiation recall in a pati...,Adverse_event,related,Gemcitabine,radiation recall,1,"[0.16886453, 0.20107986, 0.051484108, 0.004735..."
155,2327115_4,This article describes a patient with suspecte...,Adverse_event,induced,ciprofloxacin,interstitial nephritis,1,"[0.22166675, 0.14473964, 0.094159015, 0.005891..."
1742,9754850_2,Like other atypical neuroleptics olanzapine is...,Adverse_event,show,olanzapine,reduced prevalence of extrapyramidal side effects,1,"[0.21889381, 0.08909936, 0.14526007, 0.0442822..."


#### Fit a MultinomialNB & Print the Classification Report

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('sentiment', axis=1),
                                                    df['sentiment'], test_size=0.2, random_state=42)

scaler = MinMaxScaler(feature_range=(0, 1))

X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

X_train_scaled = scaler.fit_transform(X_train_embeddings)
X_test_scaled = scaler.transform(X_test_embeddings)

nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train)

y_pred = nb_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.9078674948240165
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        89
           1       0.91      1.00      0.95       877

    accuracy                           0.91       966
   macro avg       0.45      0.50      0.48       966
weighted avg       0.82      0.91      0.86       966



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [11]:
lines = report.split('\n')
data = [line.split() for line in lines[2:-5]]
columns = ['class', 'precision', 'recall', 'f1-score', 'support']
df_report = pd.DataFrame(data, columns=columns)
df_report.to_dict()

{'class': {0: '0', 1: '1'},
 'precision': {0: '0.00', 1: '0.91'},
 'recall': {0: '0.00', 1: '1.00'},
 'f1-score': {0: '0.00', 1: '0.95'},
 'support': {0: '89', 1: '877'}}

#### Summary
- The model achieved a precision of 0.00 for the negative class (PTE), indicating that it did not correctly predict any instances of potential therapeutic events.
- For the positive class (ADE), the model achieved a precision of 0.91, indicating that 91% of the predicted adverse events were correctly classified.
- The model achieved a recall of 1.00 for the positive class, indicating that it correctly identified all instances of adverse events. However, it had a recall of 0.00 for the negative class, indicating that it failed to identify any potential therapeutic events.

### Test with Validation Data

In [12]:
dev_df = pd.read_csv('data/dev.csv')

In [13]:
selected_columns = ['id', 'context', 'annotations/0/events/0/event_type', 
                    'annotations/0/events/0/Trigger/text/0/0', 
                    'annotations/0/events/0/Treatment/Drug/text/0/0','annotations/0/events/0/Effect/text/0/0']
new_names = ['id', 'context', 'event_type', 'trigger_text', 'drug', 'drug_effect']
dev_df = dev_df[selected_columns].rename(columns=dict(zip(selected_columns, new_names)))

In [14]:
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}
dev_df['sentiment'] = dev_df['event_type'].map(sentiment_mapping)

In [15]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'[^a-zA-Z\s-]', '', text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    cleaned_text = ' '.join(tokens)
    return cleaned_text

In [16]:
def predict_sentiment(text):
    # Preprocess the text
    preprocessed_text = preprocess_text(text)
    
    # Extract average word embeddings from the preprocessed text
    features = average_word_embeddings(preprocessed_text, 'context', model)
    scaled_features = scaler.transform(features.reshape(1, -1))
    
    # Make a prediction using the trained model
    prediction = nb_model.predict(scaled_features)
    
    # Map the predicted sentiment to the corresponding label
    sentiment = sentiment_mapping[prediction[0]]
    
    return sentiment

In [17]:
def average_word_embeddings_modified(text, word_embeddings):
    embeddings = []
    for word in text.split():
        if word in word_embeddings:
            word_embedding = word_embeddings[word]
            embeddings.append(word_embedding)
    if embeddings:
        average_embedding = np.mean(embeddings, axis=0)
    else:
        average_embedding = np.zeros_like(word_embeddings.vector_size)
    return average_embedding


In [18]:
def predict_sentiment_updated(text):
    # Preprocess the text
    preprocessed_text = preprocess_text(text)
    
    # Extract average word embeddings from the preprocessed text
    features = average_word_embeddings_modified(preprocessed_text, model)
    scaled_features = scaler.transform(features.reshape(1, -1))
    
    # Make a prediction using the trained model
    prediction = nb_model.predict(scaled_features)
    
    # Map the predicted sentiment to the corresponding label
    sentiment = "0-Potential Therapeutic Event" if prediction[0] == 0 else "1-Adverse Event"
    
    return sentiment

In [19]:
def test_my_model():
    random_number = random.randint(0, len(dev_df) - 1)
    random_row = dev_df.iloc[random_number]
    context = random_row['context']
    predicted_sentiment = predict_sentiment_updated(context)
    actual_sentiment = random_row['sentiment']

    result = pd.DataFrame({'Context': [context],
                           'Actual Sentiment': [actual_sentiment],
                           'Predicted Sentiment': [predicted_sentiment]})

    return result

In [20]:
results = []
for _ in range(10):
    result = test_my_model()
    results.append(result)

output_df = pd.concat(results, ignore_index=True)

output_df

Unnamed: 0,Context,Actual Sentiment,Predicted Sentiment
0,"To our knowledge, however, this is the first c...",1,1-Adverse Event
1,This severe illness was likely caused by minoc...,1,1-Adverse Event
2,"Aseptic meningitis, hemolytic anemia, hepatiti...",1,1-Adverse Event
3,Although this G-CSF-driven leucocytosis was al...,1,1-Adverse Event
4,The successful development and implementation ...,1,1-Adverse Event
5,A case of toxic hepatitis caused by combinatio...,1,1-Adverse Event
6,Artemether-lumefantrine (AL) is first-line tre...,0,1-Adverse Event
7,Azathioprine-induced myelosuppression due to t...,1,1-Adverse Event
8,We report a case of generalized cutaneous scle...,1,1-Adverse Event
9,Lipoid pneumonia: a silent complication of min...,1,1-Adverse Event


### Ensemble - LDA & NB

In [None]:
%%time
X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

X_train_embeddings = np.abs(X_train_embeddings)
X_test_embeddings = np.abs(X_test_embeddings)

nmf = NMF(init='nndsvda')
lda = LatentDirichletAllocation()
nb_model = MultinomialNB()

pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('abs_transform', FunctionTransformer(np.abs)),
    ('nmf', nmf),
    ('lda', lda),
    ('nb_model', nb_model)
])

param_grid = {
    'lda__n_components': [5, 10, 15],
    'nb_model__alpha': [0.1, 1.0, 10.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train_embeddings, y_train)

y_pred = grid_search.predict(X_test_embeddings)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)