In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy import sparse

import numpy as np
import pandas as pd
import random

In [None]:
# Load the training annotations from the relative path data/train.csv.
train_data = pd.read_csv(filepath_or_buffer='data/train.csv')

In [None]:
# Binary label for the event type: 1 = adverse event, 0 = potential
# therapeutic event. The mapping is declared in this cell so that it runs on a
# fresh kernel (Restart & Run All) instead of raising NameError.
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}

# The raw CSV exposes the event type under its flattened annotation path; the
# short name 'event_type' is only used if the frame already carries it.
# NOTE(review): confirm which of the two columns the CSV actually contains.
event_type_column = (
    'event_type'
    if 'event_type' in train_data.columns
    else 'annotations/0/events/0/event_type'
)

# Labels missing from the mapping become NaN (pandas Series.map semantics).
train_data['sentiment'] = train_data[event_type_column].map(sentiment_mapping)

In [None]:
# Flattened annotation columns to keep from the raw frame.
selected_columns = [
    'id',
    'context',
    'annotations/0/events/0/event_type',
    'annotations/0/events/0/Trigger/text/0/0',
    'annotations/0/events/0/Treatment/Drug/text/0/0',
    'annotations/0/events/0/Effect/text/0/0',
]
# Short replacement names, matched by position to `selected_columns`.
new_names = ['id', 'context', 'event_type', 'trigger_text', 'drug', 'drug_effect']

# Pair every original column with its short name, then select and rename.
column_renames = {old: new for old, new in zip(selected_columns, new_names)}
df = train_data[selected_columns].rename(columns=column_renames)

In [None]:
# Binary label for the event type: 1 = adverse event, 0 = potential
# therapeutic event.
sentiment_mapping = {'Adverse_event': 1, 'Potential_therapeutic_event': 0}

# `train_data` is the raw CSV frame; the short name 'event_type' is only
# introduced on the renamed copy `df`, so fall back to the flattened
# annotation column when 'event_type' is absent instead of raising KeyError.
# NOTE(review): confirm which of the two columns the CSV actually contains.
event_type_column = (
    'event_type'
    if 'event_type' in train_data.columns
    else 'annotations/0/events/0/event_type'
)

# Labels missing from the mapping become NaN (pandas Series.map semantics).
train_data['sentiment'] = train_data[event_type_column].map(sentiment_mapping)

In [None]:
# Target vector: the encoded sentiment label.
y_train = train_data['sentiment']
# Feature frame: every remaining column of the training data.
X_train = train_data.drop(columns=['sentiment'])

In [None]:
# Vocabulary pruning: ignore terms present in more than 95% of documents or in
# fewer than 100 documents, and remove the built-in English stop words.
tfidf_options = dict(max_df=0.95, min_df=100, stop_words='english')
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# `data_samples` is not assigned anywhere earlier in the notebook, so this cell
# raised NameError on a fresh kernel. Build the corpus explicitly.
# NOTE(review): assumes the free-text `context` column is the intended corpus;
# confirm against the original experiment.
# Missing texts are replaced by empty strings because the vectorizer rejects NaN.
data_samples = train_data['context'].fillna('').astype(str)

# Learn the vocabulary / IDF weights and produce the document-term matrix.
tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [None]:
# Percentage of rows (taken from the top of the matrix) used for fitting.
# NOTE(review): this name was never defined in the notebook; 80 is a
# placeholder value, adjust to the split the experiment requires.
split_percentage = 80

# Slice the sparse matrix directly instead of densifying it with .toarray()
# and converting back: same rows end up in each part, without materialising
# the full dense array. Coercing to CSR (rather than rebinding `tfidf`) also
# keeps the cell safe to re-run.
tfidf_csr = sparse.csr_matrix(tfidf)
n_rows = tfidf_csr.shape[0]
split_at = int(n_rows * (split_percentage / 100))

X_train = tfidf_csr[:split_at]
X_test = tfidf_csr[split_at:]

print(X_train.shape)
print(X_test.shape)

In [None]:
# Tabular view of the training TF-IDF matrix, one column per vocabulary term.
df = pd.DataFrame.sparse.from_spmatrix(X_train)

# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_feature_names_getter = (
    getattr(tfidf_vectorizer, 'get_feature_names_out', None)
    or tfidf_vectorizer.get_feature_names
)
df.columns = list(_feature_names_getter())
df

In [None]:
# Number of latent topics to extract.
topics = 2
# LDA fitting is stochastic; a fixed random_state makes the discovered topics
# (and everything printed from them) reproducible across kernel restarts.
model = LatentDirichletAllocation(n_components=topics, random_state=0)

In [None]:
# Fit the LDA model on X_train.
model.fit(X_train)

In [None]:
# Vocabulary terms, indexed like the columns of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_feature_names_getter = (
    getattr(tfidf_vectorizer, 'get_feature_names_out', None)
    or tfidf_vectorizer.get_feature_names
)
tf_feature_names = list(_feature_names_getter())

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of each topic, one line per topic.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            vector per topic (each vector must support ``argsort()``).
        feature_names: Sequence mapping a term index to its display name.
        n_top_words: Number of terms to list per topic, strongest first.

    Prints one ``Topic #<idx>: term term ...`` line per topic followed by a
    single blank line. Returns None.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[term_idx] for term_idx in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()
    
# List the ten strongest terms of every fitted topic.
print_top_words(model=model, feature_names=tf_feature_names, n_top_words=10)

### Test

In [None]:
# Row index into X_test selecting the document scored in the next cell.
test_sample = 1

In [None]:
# Pick the chosen row of X_test and pass it through the fitted model's transform.
held_out_row = X_test[test_sample]
p = model.transform(held_out_row)
print(p)