In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
%run common_functions.ipynb

In [None]:
# Read both labelled splits and pass them together to the shared loader.
# NOTE(review): process_load_data is not defined in this notebook; presumably it
# comes from the `%run common_functions.ipynb` cell — confirm.
dataframes = [
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
]
train_data, test_data = dataframes

df = process_load_data(dataframes)

In [None]:
# Preview five random rows; the fixed seed keeps the displayed sample stable
# across "Restart Kernel -> Run All".
df.sample(5, random_state=42)

#### Load pre-trained word vectors from a binary file at the specified path. The file stores the vectors in a Word2Vec-compatible format. Only the first 100,000 word vectors are loaded (due to memory constraints).

In [None]:
# Binary word2vec-format vector file; `limit` caps how many vectors are read
# from it (100,000 here).
model_path = './biowordvec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = KeyedVectors.load_word2vec_format(
    model_path,
    binary=True,
    limit=100000,
)

In [None]:
def average_word_embeddings(df, column, word_embeddings):
    """Return the mean embedding of all in-vocabulary words found in ``df[column]``.

    Parameters
    ----------
    df : mapping-like (DataFrame, row Series, dict, ...)
        Anything indexable by ``column``. ``df[column]`` may be a single text
        string (the case when a row is passed via ``DataFrame.apply(axis=1)``)
        or an iterable of text strings, in which case the words of all
        documents are pooled into one average.
    column : hashable
        Key used to look up the text in ``df``.
    word_embeddings : object
        Must support ``word in word_embeddings``, ``word_embeddings[word]``
        and expose ``vector_size``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the matched word vectors, or a zero vector of
        shape ``(word_embeddings.vector_size,)`` when no word matched.
    """
    documents = df[column]

    # A row lookup yields one string. Iterating a string walks its characters,
    # so wrap any scalar (string, NaN, ...) in a list to treat it as exactly
    # one document.
    if documents is None or np.isscalar(documents):
        documents = [documents]

    embeddings = []
    for document in documents:
        # Missing values (None / NaN) carry no words; skip them instead of
        # failing on ``.split()``.
        if not isinstance(document, str):
            continue
        embeddings.extend(
            word_embeddings[word]
            for word in document.split()
            if word in word_embeddings
        )

    if embeddings:
        return np.mean(embeddings, axis=0)

    # np.zeros (not zeros_like): we want a vector of length ``vector_size``,
    # not a 0-d array shaped like the integer itself.
    return np.zeros(word_embeddings.vector_size)

In [None]:
def _embed_context(row):
    """Delegate to average_word_embeddings for one row's 'context' field."""
    return average_word_embeddings(row, 'context', model)


# One embedding vector per row, stored as an object column.
df['average_embeddings'] = df.apply(_embed_context, axis=1)

In [None]:
# Preview five random rows (now including the embedding column); seeded so the
# displayed sample is reproducible.
df.sample(5, random_state=42)

## Using MultinomialNB on an unbalanced dataset.

In [None]:
# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(df.drop('sentiment', axis=1),
                                                    df['sentiment'], test_size=0.2, random_state=42)

# Stack the per-row embedding vectors into 2-D feature matrices.
X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

# Stand-alone scaled copies, kept for backward compatibility with code that
# reads `scaler` / `X_train_scaled` / `X_test_scaled`. They are deliberately
# NOT passed to the pipeline below: the pipeline has its own scaler, so feeding
# it pre-scaled data would scale the features twice.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train_embeddings)
X_test_scaled = scaler.transform(X_test_embeddings)

# MultinomialNB cannot be fitted on negative features, so the embeddings are
# min-max scaled into [0, 1] inside the pipeline (re-fitted per CV fold).
pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('classifier', MultinomialNB())
])

param_grid = {
    'classifier__alpha': [0.1, 1.0, 10.0],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train_embeddings, y_train)

best_unbalanced_model = grid_search.best_estimator_

# Cross-validate and predict on the same representation the pipeline was
# fitted on: the raw embeddings.
cv_scores = cross_val_score(best_unbalanced_model, X_train_embeddings, y_train, cv=5)
print("CV scores:", cv_scores)
print("Mean CV score:", cv_scores.mean())

y_pred = best_unbalanced_model.predict(X_test_embeddings)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

In [None]:
# Annotated heatmap of the confusion matrix `cm` computed in the previous cell.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## Let's now predict using MultinomialNB - Before Balancing!!!

In [None]:
# best_unbalanced_model is a Pipeline fitted on the raw embeddings and contains
# its own MinMaxScaler, so it must be given the raw test embeddings. Passing the
# pre-scaled matrix would scale the features a second time.
# Labels and probabilities are both derived here from the same input so the
# metrics below are mutually consistent.
y_pred = best_unbalanced_model.predict(X_test_embeddings)
# Column 1 is the probability of the second entry in the model's classes_.
# NOTE(review): assumes that entry is the positive class — confirm.
y_prob = best_unbalanced_model.predict_proba(X_test_embeddings)[:, 1]
evaluate_model(y_test, y_pred, y_prob)

In [None]:
# Percentage of test rows whose predicted label equals the true label.
total_predictions = len(y_test)
correct_predictions = (y_test == y_pred).sum()
accuracy_percentage = (correct_predictions / total_predictions) * 100
print("Accuracy percentage (for unbalanced):", accuracy_percentage)

## Conclusion for MultinomialNB on an unbalanced dataset.

**Cross-Validation Scores:**

The scores range from around 0.909 to 0.911, with a mean of approximately 0.910. This implies that, on average, the model is correctly predicting the sentiment of about 91% of the text in the dataset, when tested on 5 different splits.

While generally a good performance, in a healthcare practice where making a wrong prediction could have serious consequences, we might need an even higher accuracy.


**Confusion Matrix:**

- The top left cell (1) represents "True Positives" - these are the instances that were positive (PTE class) and were correctly identified as positive by the model.

- The top right cell (69) represents "False Negatives" - these are the instances that were positive (PTE class), but were incorrectly identified as negative (ADE class) by the model.

- The bottom left cell (7) represents "False Positives" - these are the instances that were negative (ADE class), but were incorrectly identified as positive (PTE class) by the model.

- The bottom right cell (697) represents "True Negatives" - these are the instances that were negative (ADE class) and were correctly identified as negative by the model.

From these numbers, it is clear that the model excels at identifying the negative class (ADE), with 697 correct predictions and only 7 incorrect ones. However, the model struggles with the positive class (PTE), correctly identifying only 1 instance and incorrectly classifying 69. These results can be explained by the fact that the dataset has not been balanced and carries a lot of negative signal due to a disproportionately high number of ADE sentiments.

This suggests that the model could be improved by collecting more data for the positive class (PTE), or by using techniques to better handle imbalanced data if the PTE class is underrepresented in your dataset.

The model currently excels at identifying negative sentiments (ADE class) but is having some difficulty correctly identifying positive sentiments (PTE class). The model's ability to correctly identify positive sentiments might potentially be improved by collecting more data about positive sentiments or by adjusting the model to better handle class imbalances. Despite this, the model is still correctly identifying the sentiment about 91% of the time across different tests, which is a strong result.

## Address class imbalance to improve MultinomialNB performance.

In [None]:
X = np.array(df['average_embeddings'].tolist())
y = df['sentiment']

# Split BEFORE oversampling. SMOTE synthesises minority samples by interpolating
# between neighbouring real ones, so resampling the full dataset first would
# leak information from test rows into training and would fill the test set
# with synthetic rows. The names X_test_over / y_test_over are kept for
# compatibility, but they now hold only real, untouched samples.
X_train_real, X_test_over, y_train_real, y_test_over = train_test_split(
    X, y, test_size=0.2, random_state=42)

print("Before oversampling: ", Counter(y_train_real))

# Oversample the training portion only; seeded so reruns give the same
# synthetic samples.
oversample = SMOTE(random_state=42)
X_over, y_over = oversample.fit_resample(X_train_real, y_train_real)

print("After oversampling: ", Counter(y_over))

# Scale the data (statistics are learned from the training portion only)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_over = scaler.fit_transform(X_over)
y_train_over = y_over
X_test_over = scaler.transform(X_test_over)

# And retrain model (from above) on the balanced training set.
# `grid_search` is the GridSearchCV object created in the unbalanced section.
grid_search.fit(X_train_over, y_train_over)

best_balanced_model = grid_search.best_estimator_

# NOTE: these folds are cut from already-oversampled data, so a synthetic point
# and the real points it was interpolated from can land in different folds.
# Treat the CV scores as optimistic; the held-out test set below is the
# leak-free estimate.
cv_scores = cross_val_score(best_balanced_model, X_train_over, y_train_over, cv=5)
print("CV scores (balanced):", cv_scores)
print("Mean CV score (balanced):", np.mean(cv_scores))

# Generate predictions on the (real, non-resampled) test set
y_pred_over = best_balanced_model.predict(X_test_over)

conf_mat = confusion_matrix(y_test_over, y_pred_over)
print("Confusion Matrix:")
print(conf_mat)


In [None]:
# Annotated heatmap of `conf_mat`, the confusion matrix of the balanced model.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

## Summary Notes for MultinomialNB Classifier for unbalanced & balanced datasets:

Here's a comparison table of the results before and after applying the class balancing technique (SMOTE):

In [None]:
# Hand-recorded results of the two runs. Each list is ordered to match `index`.
index = ['Minority class count', 'Majority class count', 'CV scores', 'Mean CV score', 'Confusion Matrix']

before_smote = [
    349,
    3517,
    [0.9095, 0.9095, 0.9093, 0.9093, 0.9110],
    0.9097,
    [[1, 69], [7, 697]],
]
after_smote = [
    3517,
    3517,
    [0.6147, 0.6119, 0.6353, 0.5998, 0.6152],
    0.6154,
    [[471, 214], [313, 409]],
]

data = {'Before SMOTE': before_smote, 'After SMOTE': after_smote}

# One column per run, one row per metric.
summary_report = pd.DataFrame(data, index=index)
summary_report

Before class balancing, the model showed a high mean CV score (**0.9097**), but the confusion matrix suggested that it wasn't performing well on the minority class (only 1 true positive instance).

After applying SMOTE to balance the classes, the count of the minority class increased to match the majority class (from 349 to 3517). However, the mean cross-validation score decreased (from 0.9097 to 0.6154), which indicates that overall model performance was reduced. But the model's ability to predict the minority class improved significantly, as shown by the increased count of true positives in the confusion matrix (from 1 to 471).

**Implication:**

Before we addressed the class imbalance, the model was heavily biased towards the majority class (ADE), leading to a high overall accuracy, but it struggled to identify the minority class (PTE).

After we balanced the classes, the model's overall accuracy decreased. However, its ability to correctly identify instances of the minority class (PTE) improved substantially. This suggests that the trade-off resulted in a model that is more useful in practice, as it is now more capable of identifying both PTE and ADE events, which was the goal of this analysis.

It's important to note that the objective in pharmacovigilance sentiment analysis is not merely to maximize accuracy, but also to effectively identify both positive and negative events. After balancing the classes, the model is better equipped to do this, despite the decrease in overall accuracy.


***NOTE: I have used only train + test datasets. I will be using the hold out data dev to see how the model is working on unseen data***

## Let's now predict using MultinomialNB - After Balancing!!!

In [None]:
# Load the held-out dev split and run it through the same loader and embedding
# step used for the train/test data.
dev_data = pd.read_csv('data/dev.csv')
dataframes = [dev_data]
dev_df = process_load_data(dataframes)


def _embed_dev_context(row):
    """Delegate to average_word_embeddings for one dev row's 'context' field."""
    return average_word_embeddings(row, 'context', model)


dev_df['average_embeddings'] = dev_df.apply(_embed_dev_context, axis=1)

In [None]:
# Preview five random dev rows; seeded so the displayed sample is reproducible.
dev_df.sample(5, random_state=42)

In [None]:
# Build the dev feature matrix and apply the scaler fitted in the balancing
# section before predicting with the balanced model.
dev_embeddings = np.array(dev_df['average_embeddings'].tolist())
dev_scaled = scaler.transform(dev_embeddings)
dev_pred = best_balanced_model.predict(dev_scaled)

# Side-by-side view of true and predicted labels.
comparison = pd.DataFrame({'Actual': dev_df['sentiment'], 'Predicted': dev_pred})

# Determine the number of matches
is_match = comparison['Actual'] == comparison['Predicted']
matches = comparison[is_match].shape[0]

# Compute the percentage of correct predictions
accuracy_percentage = (matches / comparison.shape[0]) * 100

print(f"Out of {comparison.shape[0]} entries in the dev set, {matches} were correctly predicted by the model.")
print(f"Accuracy percentage: {accuracy_percentage:.2f}%")

In [None]:
# Class-probability matrix for the dev set; column 1 is the second entry in the
# model's classes_.
dev_proba = best_balanced_model.predict_proba(dev_scaled)
y_prob = dev_proba[:, 1]
evaluate_model(dev_df['sentiment'], dev_pred, y_prob)

# SVM

In [None]:
# Only the names not already imported in the notebook's first cell are imported
# here; train_test_split, MinMaxScaler, accuracy_score, Pipeline, GridSearchCV
# and numpy are already in scope from that cell.
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Same 80/20 split (same seed) as the MultinomialNB experiment.
X_train, X_test, y_train, y_test = train_test_split(df.drop('sentiment', axis=1),
                                                    df['sentiment'], test_size=0.2, random_state=42)

scaler = MinMaxScaler(feature_range=(0, 1))

# Stack the per-row embedding vectors into 2-D feature matrices.
X_train_embeddings = np.array(X_train['average_embeddings'].tolist())
X_test_embeddings = np.array(X_test['average_embeddings'].tolist())

X_train_scaled = scaler.fit_transform(X_train_embeddings)
X_test_scaled = scaler.transform(X_test_embeddings)

# NOTE: the pipeline's own scaler is fitted on data that is already in [0, 1],
# so on the full training set it is effectively a no-op. Train and test inputs
# are both pre-scaled, which keeps them consistent.
pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('classifier', SVC())
])

param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],  # regularisation strengths to try
    'classifier__kernel': ['linear', 'rbf'],  # kernels to try
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# zero_division=0: a class the model never predicts gets precision 0 rather
# than 1, so an ignored minority class is not reported as perfectly precise.
report = classification_report(y_test, y_pred, zero_division=0)
print("Classification Report:")
print(report)