In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

#### Load pre-trained word vectors from a binary file located at the specified path. The file contains word vectors in a format compatible with Word2Vec. Only the first 100,000 word vectors are loaded.

In [15]:
model_path = './biowordvec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True, limit=100000)

In [16]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
dev_data = pd.read_csv('data/dev.csv')

data = pd.concat([train_data, test_data, dev_data], ignore_index=True)

  train_data = pd.read_csv('data/train.csv')


In [17]:
# Selected columns to be retained and rename them
selected_columns = ['id', 'context', 'annotations/0/events/0/event_type', 
                    'annotations/0/events/0/Trigger/text/0/0', 
                    'annotations/0/events/0/Treatment/Drug/text/0/0','annotations/0/events/0/Effect/text/0/0']
new_names = ['id', 'context', 'event_type', 'trigger_text', 'drug', 'drug_effect']
df = data[selected_columns].rename(columns=dict(zip(selected_columns, new_names)))

In [18]:
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}
df['sentiment'] = df['event_type'].map(sentiment_mapping)

#### Used instead of TF-IDF as its a more prevalent technique in sentiment analysis

In [19]:
def average_word_embeddings(df, column, word_embeddings):
    embeddings = []
    for document in df[column]:
        for word in document.split():
            if word in word_embeddings:
                embeddings.append(word_embeddings[word])
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros_like(word_embeddings.vector_size)

In [20]:
df['average_embeddings'] = df.apply(lambda row: average_word_embeddings(row, 'context', model), axis=1)

In [21]:
df.to_csv('phamacovigil_output_data.csv', index=False)

In [22]:
df.sample(10)

Unnamed: 0,id,context,event_type,trigger_text,drug,drug_effect,sentiment,average_embeddings
904,3628148_1,Rupture of a cerebral aneurysm associated with...,Adverse_event,associated,nifedipine,Rupture of a cerebral aneurysm,1,"[0.24257979, 0.15962923, 0.12351615, 0.0553195..."
4422,17763133_1,Cyclosporine is a potent inhibitor of simvasta...,Adverse_event,facilitate,Cyclosporine,,1,"[0.26850358, 0.2159426, 0.16674991, -0.0132749..."
4276,18472517_2,We present here a female patient who developed...,Adverse_event,developed,enalaprilat,acute bilateral parotitis,1,"[0.2470092, 0.18956128, 0.12259149, 0.06029636..."
170,2738729_2,Withdrawal emergent syndrome in an infant asso...,Adverse_event,associated,haloperidol,Withdrawal emergent syndrome,1,"[0.20956247, 0.18645857, 0.11164258, 0.0011130..."
742,7900744_1,Case report: mannitol nephrotoxicity syndrome:...,Adverse_event,syndrome,mannitol,nephrotoxicity,1,"[0.23208107, 0.16797915, 0.19282238, 0.0063195..."
3674,16620273_2,Severe acidosis in patients taking metformin--...,Adverse_event,taking,metformin,Severe acidosis,1,"[0.2167551, 0.1612138, 0.13051808, -0.01099174..."
3956,3718111_1,He became hyperkalemic on rechallenge with tim...,Adverse_event,following,timolol,hyperkalemic,1,"[0.23711173, 0.12776645, 0.120562576, -0.01220..."
2130,15482394_1,An objective causality assessment indicated a ...,Adverse_event,relationship,warfarin,clotting abnormality,1,"[0.2654402, 0.18026303, 0.14470221, 0.01915568..."
4316,18160579_1,Pericardial hemorrhage due to acetylsalicylic ...,Adverse_event,due,acetylsalicylic acid,Pericardial hemorrhage,1,"[0.27963316, 0.15577544, 0.108165085, 0.000977..."
3663,2719905_1,Long lasting respiratory depression induced by...,Adverse_event,induced,morphine-6-glucuronide,Long lasting respiratory depression,1,"[0.21859325, 0.080682576, 0.16367914, -0.02550..."


#### Fit a MultinomialNB & Print the Classification Report

In [23]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('sentiment', axis=1),
                                                    df['sentiment'], test_size=0.2, random_state=42)

scaler = MinMaxScaler(feature_range=(0, 1))

X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

X_train_scaled = scaler.fit_transform(X_train_embeddings)
X_test_scaled = scaler.transform(X_test_embeddings)

nb_model = MultinomialNB()
nb_model.fit(X_train_scaled, y_train)

y_pred = nb_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.9078674948240165
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        89
           1       0.91      1.00      0.95       877

    accuracy                           0.91       966
   macro avg       0.45      0.50      0.48       966
weighted avg       0.82      0.91      0.86       966



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
lines = report.split('\n')
data = [line.split() for line in lines[2:-5]]
columns = ['class', 'precision', 'recall', 'f1-score', 'support']
df_report = pd.DataFrame(data, columns=columns)
df_report

Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.0,0.0,0.0,89
1,1,0.91,1.0,0.95,877


### To be updated

In the classification report DataFrame, the class labels are represented as `0` and `1`. According to your statement, Class `0` corresponds to PTE (Potential Therapeutic Event), and Class `1` corresponds to ADE (Adverse Event). 

Let's interpret the results for each class:

- For Class `0` (PTE):
  - Precision: The precision for PTE is `0.00`. This indicates that among the instances predicted as PTE, none of them are true positives.
  - Recall: The recall for PTE is `0.00`. This means that none of the actual PTE instances are correctly predicted as PTE.
  - F1-score: The F1-score for PTE is `0.00`. It is the harmonic mean of precision and recall, which is also low in this case.
  - Support: The support for PTE is `89`. This represents the number of instances in the test data that belong to the PTE class.

- For Class `1` (ADE):
  - Precision: The precision for ADE is `0.91`. This indicates that among the instances predicted as ADE, 91% of them are true positives.
  - Recall: The recall for ADE is `1.00`. This means that all the actual ADE instances are correctly predicted as ADE.
  - F1-score: The F1-score for ADE is `0.95`. It is a measure of the model's accuracy in identifying ADE instances, combining precision and recall into a single metric.
  - Support: The support for ADE is `877`. This represents the number of instances in the test data that belong to the ADE class.

Based on these results, it seems that the model performs well in predicting ADE instances (Class `1`), achieving high precision, recall, and F1-score. However, it fails to correctly identify PTE instances (Class `0`), resulting in low performance metrics.

In [None]:
%%time
X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

X_train_embeddings = np.abs(X_train_embeddings)
X_test_embeddings = np.abs(X_test_embeddings)

nmf = NMF(init='nndsvda', max_iter=1000)
lda = LatentDirichletAllocation()
nb_model = MultinomialNB()

pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('abs_transform', FunctionTransformer(np.abs)),
    ('nmf', nmf),
    ('lda', lda),
    ('nb_model', nb_model)
])

param_grid = {
    'lda__n_components': [5, 10, 15],
    'nb_model__alpha': [0.1, 1.0, 10.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)

grid_search.fit(X_train_embeddings, y_train)

y_pred = grid_search.predict(X_test_embeddings)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

