In [1]:
import pandas as pd
df = pd.read_csv('Report Dataset.csv', encoding='latin-1')
df.sample(10)

Unnamed: 0,techreport,conclusion,Abnormal
4654,This EEG was done on a cooperative patient. \n...,Diffuse slow background 6-7 Hz consistent with...,1.0
5619,This EEG was done on a drowsy patient but stil...,The EEG showed frequent biposterior slow waves...,
70,"During wakefulness, the EEG showed regular alp...",The EEG is within normal limits.\n(11/02/2003),0.0
11439,This EEG was done on a 37 years old rather coo...,1. (R) temporal spikes suggestive of focal sei...,
4512,This EEG was done on a confused patient. \n\nG...,The EEG showed diffuse asynchronous theta and ...,1.0
9647,The EEG was done on a 23 year old mentally cha...,Abnormal EEG :\n1. Absent of background alpha....,
2784,This EEG was done on a drowsy patient. In wake...,Moderately abnormal\n1) Background is 7-8 Hz (...,1.0
8531,The EEG was done on a 7 months year old restle...,The background rhythm consists of 5-6 Hz waves...,
1206,The EEG was done on a drowsy patient who was a...,The EEG showed diffuse slow waves of theta ran...,1.0
13306,This EEG was done on a 26 years old cooperativ...,Abnormal EEG with two focal seizures presented...,


In [2]:
df = df.drop(columns=['techreport', 'Abnormal'])
df.dropna(inplace=True)
df.sample(10)

Unnamed: 0,conclusion
8073,The EEG is within normal limits.
11626,There is no evidence of sharp or spikes seen t...
9079,EEG within normal limits.
7165,Abnormal EEG. There are spike wave discharges ...
13196,Abnormal EEG.\nDiffusely slow background 3-4Hz...
3449,This EEG is within normal limits.
221,The EEG shows normal waking and sleeping backg...
9003,Multifocal seizure disorder on the background ...
4702,EEG is normal.\n(26/10/2007)
12007,Abnormal awake and sleep EEG showing stereotyp...


In [23]:
# import library
import string
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
import gensim.corpora as corpora
import gensim.models as models
from gensim.models.coherencemodel import CoherenceModel
from ast import literal_eval
from pprint import pprint
import pickle
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import gensim
import numpy as np

In [4]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
stopwords = nltk.corpus.stopwords.words('english')

In [5]:
def lemmatization(sent, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    doc = nlp(" ".join(sent))
    texts_out = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
    return texts_out

In [6]:
class AntonymReplacer(object):

    def replace(self, word, pos=None):
        
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
    
    def replace_negations(self, sent):

        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue
            
            words.append(word)
            i += 1
        
        return words

In [7]:
def preprocess(text):

    # Generate list of words (remove hastags and punctuations)
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)

    # Clean number
    text = re.sub(r'[0-9]', '', text)

    # Lower the text
    text = text.lower()

    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    # Tokenize text
    text = word_tokenize(text)

    # Remove empty token
    text = [t for t in text if len(t) > 0]

    # Remove non-alphabetical token
    text = [t for t in text if t.isalpha()]

    # Replace negation token
    replacer = AntonymReplacer()
    text = replacer.replace_negations(text)

    # Remove stopwords
    text = [i for i in text if i not in stopwords]

    # Lemmatize text
    text = lemmatization(text, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return text

In [8]:
%%time

cleaned_text = []

for text in df["conclusion"]:
    cleaned_text.append(preprocess(str(text)))

df["Clean"] = cleaned_text
df = df.dropna()

print('Preprocessing Done.')

Preprocessing Done.
CPU times: total: 26.4 s
Wall time: 27.4 s


In [9]:
df.to_csv("preprocessed_data.csv")

In [10]:
df = pd.read_csv("preprocessed_data.csv")
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [11]:
df.head(10)

Unnamed: 0,conclusion,Clean
0,The EEG showed evidence of diffuse cortical dy...,"['show', 'evidence', 'diffuse', 'cortical', 'd..."
1,The EEG showed a persistent (R ) frontal slow ...,"['show', 'persistent', 'r', 'frontal', 'slow',..."
2,The EEG revealed on attenuated (L) hemispheric...,"['reveal', 'attenuate', 'hemispheric', 'activi..."
3,"Diffuse slow (7-8Hz) background, consistent wi...","['diffuse', 'slow', 'background', 'consistent'..."
4,The EEG showed normal waking and sleep backgro...,"['show', 'normal', 'waking', 'sleep', 'backgro..."
5,1. Focal slowing both frontal and (R) temporal...,"['focal', 'slow', 'frontal', 'r', 'temporalpar..."
6,Abnormal EEG showing intermittent runs of gene...,"['abnormal', 'eeg', 'show', 'intermittent', 'r..."
7,MSLT showed 2 epochs of reduced sleep latency ...,"['show', 'epoch', 'reduce', 'sleep', 'latency'..."
8,This EEG was abnormal with diffuse low amplitu...,"['abnormal', 'diffuse', 'low', 'amplitude', 's..."
9,The EEG showed biposterior alpha activities of...,"['show', 'biposterior', 'alpha', 'activity', '..."


#### LDA Model Training

In [12]:
data_words = []
for x in df["Clean"]:
    data_words.append(literal_eval(x))

In [13]:
%%time

# Creat dictionary
id2word = corpora.Dictionary(data_words)
# Create corpus
texts = data_words
# Term document frequency
corpus = [id2word.doc2bow(text) for text in texts]

# number of topics
num_topics = 15
#num_words = 20
num_words = 20

# Build LDA model
lda_model = models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)

CPU times: total: 859 ms
Wall time: 11.4 s


In [14]:
topics = lda_model.print_topics(num_topics=num_topics, num_words=num_words)
topic_list = []

for topic in topics:
    topic_list.append(topic[1])

df_topics = pd.DataFrame(topic_list, columns = ['topics'])

In [15]:
df_topics.to_csv("LDA topics.csv")
pickle.dump(lda_model, open('LDA_model.model', 'wb'))#save lda model
pickle.dump(id2word, open('LDA_model.dict', 'wb'))#save dictionary
pickle.dump(corpus, open('LDA_model.corpus', 'wb'))#save corpus

In [16]:
pprint(lda_model.print_topics()) # print top topics 
doc_lda = lda_model[corpus] # display proportions of topics

[(0,
  '0.056*"wave" + 0.050*"eeg" + 0.049*"slow" + 0.037*"normal" + '
  '0.031*"background" + 0.031*"diffuse" + 0.025*"consistent" + 0.019*"sharp" + '
  '0.018*"discharge" + 0.018*"dysfunction"'),
 (1,
  '0.096*"normal" + 0.095*"eeg" + 0.059*"limit" + 0.034*"wave" + 0.027*"slow" '
  '+ 0.020*"show" + 0.018*"consistent" + 0.017*"activity" + 0.014*"sleep" + '
  '0.013*"epileptiform"'),
 (2,
  '0.133*"eeg" + 0.098*"normal" + 0.039*"sleep" + 0.036*"limit" + 0.029*"wave" '
  '+ 0.028*"awake" + 0.026*"slow" + 0.022*"abnormal" + 0.018*"diffuse" + '
  '0.016*"background"'),
 (3,
  '0.041*"discharge" + 0.037*"eeg" + 0.030*"see" + 0.029*"epileptiform" + '
  '0.029*"sleep" + 0.027*"normal" + 0.022*"background" + 0.020*"wave" + '
  '0.019*"slow" + 0.016*"consistent"'),
 (4,
  '0.051*"wave" + 0.044*"slow" + 0.044*"eeg" + 0.039*"abnormal" + '
  '0.037*"diffuse" + 0.032*"dysfunction" + 0.032*"cortical" + 0.031*"focal" + '
  '0.031*"r" + 0.025*"region"'),
 (5,
  '0.055*"eeg" + 0.050*"wave" + 0.032*"s

In [17]:
lda_model_file = 'LDA_model.model'
loaded_lda_model = pickle.load(open(lda_model_file, 'rb')) #load trained model
loaded_dictionary = pickle.load(open('LDA_model.dict', 'rb')) #load dictionary 
loaded_corpus = pickle.load(open('LDA_model.corpus', 'rb')) # load corp

In [18]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(loaded_lda_model, loaded_corpus, loaded_dictionary, n_jobs=1)

In [19]:
vis

In [20]:
# Set up coherence model
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')

# Calculate and print coherence
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score:', coherence_lda)

Coherence Score: 0.5063731071387734


In [25]:
def extract_topic_words(lda_model, num_topics):
    topic_words = []
    for i in range(num_topics):
        topic_words.append([word for word, _ in lda_model.show_topic(i, topn=20)])
    return topic_words

# Get the topic words
topic_words = extract_topic_words(lda_model, num_topics)