In [1]:
import random

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
train_data = pd.read_csv(filepath_or_buffer='data/train.csv')

In [None]:
# Define the label mapping in this cell. It is otherwise first assigned in a
# later cell, so a fresh "Restart & Run All" would raise NameError on the
# line below.
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}

# Event types that are not keys of the mapping become NaN.
# NOTE(review): assumes train_data already has an 'event_type' column at this
# point; the notebook only creates that name on the renamed frame `df` -- confirm.
train_data['sentiment'] = train_data['event_type'].map(sentiment_mapping)

In [None]:
# Pick a subset of the raw columns and give each one a shorter name.
# The dict keeps the raw-name -> short-name pairing explicit; insertion order
# is preserved, so the derived lists match the column order below.
column_renames = {
    'id': 'id',
    'context': 'context',
    'annotations/0/events/0/event_type': 'event_type',
    'annotations/0/events/0/Trigger/text/0/0': 'trigger_text',
    'annotations/0/events/0/Treatment/Drug/text/0/0': 'drug',
    'annotations/0/events/0/Effect/text/0/0': 'drug_effect',
}
selected_columns = list(column_renames.keys())
new_names = list(column_renames.values())

df = train_data[selected_columns].rename(columns=column_renames)

In [None]:
# Encode the event type as a numeric label: 'Adverse_event' -> 1,
# 'Potential_therapeutic_event' -> 0. Any other value maps to NaN.
sentiment_mapping = dict(Adverse_event=1, Potential_therapeutic_event=0)

# NOTE(review): 'event_type' is read from train_data, but the visible cells only
# create that column name on the renamed frame `df` -- confirm it exists here.
event_types = train_data['event_type']
train_data['sentiment'] = event_types.map(sentiment_mapping)

In [None]:
# Split the frame into the target column and everything else.
y_train = train_data['sentiment']
X_train = train_data.drop(columns=['sentiment'])

## tf–idf (Term Frequency–Inverse Document Frequency)

In [None]:
# Candidate TfidfVectorizer settings explored by the grid search below.
param_grid = dict(
    max_df=[0.9, 0.95, 0.99],       # floats: upper document-frequency cut-off
    min_df=[50, 100, 150],          # ints: lower document-frequency cut-off
    stop_words=[None, 'english'],   # with / without the built-in English stop list
)

In [None]:
# Tune the TF-IDF settings, then vectorise the training documents.
#
# Fixes relative to the previous version of this cell:
#   * A bare TfidfVectorizer has no `score` method, so GridSearchCV (with no
#     `scoring` argument) cannot rank parameter combinations. The vectorizer is
#     now wrapped in a pipeline with a simple classifier, and every combination
#     is scored by 5-fold cross-validated accuracy on `y_train`.
#   * The sparse matrix is built from the TF-IDF values, not from the raw
#     DataFrame.
#   * `tfidf_vectorizer` is rebound to the *fitted* vectorizer so later cells
#     can ask it for the vocabulary.
#
# NOTE(review): the classifier is only a proxy for choosing the vectorizer
# settings; it assumes `y_train` has no missing labels -- confirm.
# NOTE: this cell rebinds `X_train`, so the cell that builds `X_train` must be
# re-run before this one is executed a second time.
train_documents = X_train['context_clean']

tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Route the vectorizer parameters from `param_grid` to the 'tfidf' step.
pipeline_param_grid = {
    'tfidf__' + name: values for name, values in param_grid.items()
}

grid_search = GridSearchCV(tfidf_pipeline, pipeline_param_grid, cv=5)
grid_search.fit(train_documents, y_train)

# With the default refit=True the best pipeline is refitted on all training
# documents, so its vectorizer step is already fitted on the full corpus.
tfidf_vectorizer = grid_search.best_estimator_.named_steps['tfidf']

tfidf = tfidf_vectorizer.transform(train_documents)
tfidf = tfidf.toarray()

X_train = sparse.csr_matrix(tfidf)

print("X_train shape:", X_train.shape)

## TF-IDF Document-Term Matrix

In [None]:
# Show the vectorised training matrix as a sparse DataFrame whose columns are
# labelled with the vectorizer's vocabulary terms.
df = pd.DataFrame.sparse.from_spmatrix(X_train)

# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    df.columns = tfidf_vectorizer.get_feature_names_out()
else:
    df.columns = tfidf_vectorizer.get_feature_names()
df

## Model

In [None]:
# Number of latent topics to extract.
topics = 2

# Fix the seed so the learned topics are reproducible across
# "Restart & Run All"; without `random_state` every run can differ.
LDA_RANDOM_STATE = 42

lda_model = LatentDirichletAllocation(
    n_components=topics,
    random_state=LDA_RANDOM_STATE,
)

In [None]:
# Learn the topic-word distributions from the training matrix.
lda_model.fit(X=X_train)

## Topic Results

In [None]:
# Vocabulary terms, indexed by feature column.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tfidf_vectorizer.get_feature_names()

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort``).
        feature_names: Indexable mapping a feature index to its word.
        n_top_words: Number of words to list per topic.

    Output:
        One ``Topic #<idx>: w1 w2 ...`` line per topic, words ordered from
        highest to lowest weight, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending; slicing backwards from the end yields the
        # indices of the n largest weights, largest first.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_words = [feature_names[idx] for idx in top_indices]
        print("Topic #%d: " % topic_idx + " ".join(top_words))
    print()
    
# List the ten strongest words of each learned topic.
print_top_words(
    model=lda_model,
    feature_names=tf_feature_names,
    n_top_words=10,
)

## Test: Topic Inference on a Single Sample

In [None]:
# Index of the test example inspected in the cells below.
test_sample: int = 1

In [None]:
# Topic distribution of the selected test example.
# NOTE(review): `X_test` is not defined in the visible cells -- confirm it is
# vectorised with the same fitted vectorizer as the training matrix and that
# indexing it yields a 2-D row (as a sparse matrix does).
sample_features = X_test[test_sample]
p = lda_model.transform(sample_features)
print(p)

In [None]:
# The predicted topic is the one with the largest weight in `p`.
t = np.argmax(p)
print("Topic #{}".format(t))

In [None]:
# Show the raw text of the same test example for comparison with its topic.
# NOTE(review): `X_test_document` is not defined in the visible cells -- confirm.
sample_text = X_test_document[test_sample]
print(sample_text)