In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the two CSV inputs from the `medicalnotestask` input directory.
# `df` is the clinical-notes table; later cells read its `notes` and `category` columns.
df = pd.read_csv("../input/medicalnotestask/ClinNotes.csv")
# Medical-concepts table; in the cells visible here it is only previewed with .head().
MedConcep_df = pd.read_csv("../input/medicalnotestask/MedicalConcepts.csv")

In [3]:
# Preview the first rows of the clinical-notes table.
df.head()

In [4]:
# Preview the first rows of the medical-concepts table.
MedConcep_df.head()

In [5]:
# Missing-value count for each column of the notes table.
df.isnull().sum()

In [6]:
# (rows, columns) of the notes table.
df.shape

In [7]:
# Number of notes per category label.
df['category'].value_counts()

### Preprocessing

In [8]:
# Show the raw text of the first note (select the column first, then the first position).
df['notes'].iloc[0]

In [9]:
import re

def remove_sub_heading(text):
    """Lower-case a raw note and drop section headings, list numbering and dosage tokens.

    The note is split on commas and then on whitespace. Each word is
    lower-cased and cleaned with three regular expressions:

    * everything up to and including a colon is removed (``M-MODE:``, ``DOPPLER:``),
    * a trailing list number is removed (``1.``, ``12.``),
    * a ``<digits>_mg`` dosage is removed (``5_mg``, ``10_mg``, ``100_mg``).

    Words that are empty after cleaning are discarded.

    Args:
        text: Raw note text.

    Returns:
        The surviving lower-cased words joined by single spaces
        (an empty string when nothing survives).
    """
    output_text = []

    # Notes use commas as line separators, so split on them first.
    for sentence in text.split(','):
        for word in sentence.split():
            # Remove sub-headings that end with a colon, e.g. "M-MODE:", "DOPPLER:".
            output = re.sub(r".+?:", "", word.lower())
            # Remove list numbering, e.g. "1.", "12.". `\d+` (rather than a single `\d`)
            # so multi-digit numbers are removed whole instead of leaving a stray digit.
            output = re.sub(r"\d+\.$", "", output)
            # Remove dosages such as "5_mg", "10_mg", "100_mg" regardless of digit count.
            output = re.sub(r"\d+_mg", "", output)
            # str.split() never yields empty words, so only the cleaned result needs checking.
            if output:
                output_text.append(output)
    return " ".join(output_text)

In [10]:
# Try the cleaner on one sample note that contains sub-headings ("2-D M-MODE:", "DOPPLER:")
# and numbered findings ("1.", "2.", ...).
text = "2-D M-MODE: , ,1.  Left atrial enlargement with left atrial diameter of 4.7 cm.,2.  Normal size right and left ventricle.,3.  Normal LV systolic function with left ventricular ejection fraction of 51%.,4.  Normal LV diastolic function.,5.  No pericardial effusion.,6.  Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.,7.  PA systolic pressure is 36 mmHg.,DOPPLER: , ,1.  Mild mitral and tricuspid regurgitation.,2.  Trace aortic and pulmonary regurgitation."
remove_sub_heading(text)

In [11]:
# Clean every note in place; the function is passed directly, no wrapper lambda needed.
df['notes'] = df['notes'].apply(remove_sub_heading)

### Remove Punctuation

In [12]:
import string
def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` deleted."""
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation_chars)

In [13]:
# Quick check: "_" and "!" are both in string.punctuation, so both are stripped.
remove_punct("veerappan_ramanathan !")

In [14]:
# Strip punctuation from every note in place; the function is passed directly.
df['notes'] = df['notes'].apply(remove_punct)

### Remove stopwords and lemmatize

In [76]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer


# Fetch the NLTK resources used by remove_stopwords: the stopword list and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

def remove_stopwords(text):
    """Drop English stopwords from `text` and lemmatize the remaining words as verbs.

    Args:
        text: Whitespace-separated words. Matching against the stopword list is
            exact, so the text should already be lower-cased.

    Returns:
        The lemmatized non-stopwords joined by single spaces.
    """
    # Load the stopword list once per call and keep it in a set. The previous code
    # called stopwords.words('english') and scanned the resulting list for every
    # single word, and also built a new lemmatizer per word.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return " ".join(
        lemmatizer.lemmatize(word, 'v')  # 'v' = treat each word as a verb
        for word in text.split()
        if word not in english_stopwords
    )

In [77]:
# Quick check of stopword removal and verb lemmatization on a short sentence.
remove_stopwords("writing code for this interview")

In [17]:
%%time
df['notes'] = df['notes'].apply(remove_stopwords)  # function passed directly, no wrapper lambda

In [18]:
# Inspect the notes after the cleaning steps above.
df.head()

In [19]:
from sklearn.preprocessing import LabelEncoder
# Encode the string category labels as integer ids in a new `target` column.
# The fitted encoder stays available as `le`.
le = LabelEncoder()
df['target'] = le.fit_transform(df['category'])

In [20]:
# Confirm the new `target` column.
df.head()

In [21]:
from sklearn.model_selection import train_test_split
# 80/20 split of the cleaned notes (as a plain Python list) and the integer targets.
# The split is stratified on `target`, and random_state=42 makes it repeatable.
x_train, x_test, y_train, y_test = train_test_split(df['notes'].tolist(), df['target'], test_size=0.2, 
                                                    random_state=42, stratify=df['target'])

In [22]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer targets for the models trained with categorical cross-entropy.
# The encoder is left at its default (sparse) output and densified with .toarray():
# the `sparse=` keyword was renamed to `sparse_output` in newer scikit-learn releases and
# later removed, so this spelling produces the same dense arrays on old and new versions.
ohe = OneHotEncoder()
# reshape(-1, 1): the encoder expects a 2-D array with one column per feature.
train_labels_one_hot = ohe.fit_transform(y_train.to_numpy().reshape(-1,1)).toarray()
# The test labels reuse the categories learned from the training labels.
val_labels_one_hot = ohe.transform(y_test.to_numpy().reshape(-1,1)).toarray()

In [79]:
# Shape of the one-hot labels for the held-out split.
val_labels_one_hot.shape

In [23]:
# Number of notes in each split.
len(x_train), len(x_test)

### Creating Function for Getting Results

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def get_results(y_true, y_predicted):
    """Score predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_predicted: Predicted labels, aligned with `y_true`.

    Returns:
        dict with keys "accuracy", "f1", "precision" and "recall"; the last
        three are computed with ``average='weighted'``.
    """
    return {
        "accuracy": accuracy_score(y_true, y_predicted),
        "f1": f1_score(y_true, y_predicted, average='weighted'),
        "precision": precision_score(y_true, y_predicted, average='weighted'),
        "recall": recall_score(y_true, y_predicted, average='weighted'),
    }

### Method 1 using CountVectorizer

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Bag-of-words counts: the vocabulary is learned on the training notes
# and reused unchanged for the test notes.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(x_train)
vectors_test = vectorizer.transform(x_test)

# Multinomial Naive Bayes with alpha=.01; fit() hands back the fitted estimator.
model_0 = MultinomialNB(alpha=.01).fit(vectors, y_train)

# Score the held-out split with the shared metrics helper.
pred = model_0.predict(vectors_test)
results = get_results(y_test, pred)

print(
    f'Total accuracy score: {results["accuracy"]}',
    f'Total F1 score: {results["f1"]}',
    f'Total Precision score: {results["precision"]}',
    f'Total Recall score: {results["recall"]}',
    sep="\n",
)

### Method 2 using Tfidf

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: vocabulary and IDF weights are learned on the training notes
# and reused unchanged for the test notes.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(x_train)
vectors_test = vectorizer.transform(x_test)

# Multinomial Naive Bayes with alpha=.01; fit() hands back the fitted estimator.
model_1 = MultinomialNB(alpha=.01).fit(vectors, y_train)

# Score the held-out split with the shared metrics helper.
pred = model_1.predict(vectors_test)
results = get_results(y_test, pred)

print(
    f'Total accuracy score: {results["accuracy"]}',
    f'Total F1 score: {results["f1"]}',
    f'Total Precision score: {results["precision"]}',
    f'Total Recall score: {results["recall"]}',
    sep="\n",
)

### Method 3 using Gensim Word2Vec embeddings with an LSTM

In [28]:
import gensim

In [29]:
# Tokenise every cleaned note on whitespace: one list of words per note.
sent = list(map(str.split, df['notes']))

In [30]:
# The Word2Vec model is only configured here; the vocabulary is built in the next cell
# and training runs in the one after that.
# vector_size=100 matches the 100-column embedding matrix filled from this model further down.
w2v_model = gensim.models.Word2Vec(min_count=10,
                                   window=3,
                                   vector_size=100,
                                   negative=20,
                                   workers=7)

In [31]:
# Scan the tokenised notes to build the vocabulary before training.
w2v_model.build_vocab(sent, progress_per=1000)

In [32]:
# Train on the tokenised notes for 10 epochs.
# `total_examples` has to be the number of sentences actually passed in; build_vocab()
# recorded that as `corpus_count`, so use it instead of a hard-coded guess.
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=10)

In [33]:
# Spot-check the learned vectors: ten nearest neighbours of one drug name...
# NOTE(review): these lookups presumably fail if the token did not make it into the
# trained vocabulary (min_count=10 above) — confirm on the actual data.
w2v_model.wv.most_similar('plavix', topn=10)

In [34]:
# ...and pairwise similarity scores between a few drug names.
w2v_model.wv.similarity('aspirin', 'insulin')

In [35]:
w2v_model.wv.similarity('plavix', 'lipitor')

In [36]:
w2v_model.wv.similarity('plavix', 'coumadin')

In [37]:
%%time
# Build the word -> integer index used for the padded sequences and the embedding matrix.
# NOTE(review): the tokenizer is fitted on every note in `df`, i.e. including the rows that
# train_test_split assigned to x_test — consider fitting on x_train only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['notes'])

# +1 — presumably because word indices start at 1, leaving index 0 for padding; confirm.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

In [38]:
%%time
# Turn both splits into integer sequences, then pad/truncate each one at the end
# so every row has exactly 300 ids.
x_train_pad, x_test_pad = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=300, padding="post", truncating="post")
    for split in (x_train, x_test)
)

In [39]:
# First padded training sequence, as a sanity check.
x_train_pad[0]

In [40]:
# One 100-dimensional row per tokenizer index. Rows stay all-zero for index 0 and for
# any word the Word2Vec model has no vector for.
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

In [41]:
# Embedding initialised from the Word2Vec matrix and fine-tuned during training.
# mask_zero=True: the input sequences are post-padded with 0 up to length 300, so index 0
# is masked to keep the downstream LSTM from consuming the trailing padding steps.
embedding_layer = layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=300,
                                   trainable=True, mask_zero=True)

In [42]:
# Word2Vec-initialised embedding -> dropout -> single LSTM -> 3-way softmax.
model_2 = Sequential([
    embedding_layer,
    layers.Dropout(0.5),
    layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(3, activation='softmax'),
])

model_2.summary()

In [43]:
# Sparse categorical cross-entropy: the labels passed to fit() below are the integer
# `target` ids, not one-hot rows.
model_2.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
             optimizer=tf.keras.optimizers.Adam(),
             metrics=['accuracy'])

In [44]:
# validation_split=0.2 holds out a fifth of the training data for validation.
# NOTE(review): x_test_pad is built above but not used in the cells shown here, so this
# model is never scored on the test split — confirm whether that is intended.
model_2.fit(x_train_pad, y_train, epochs=3, batch_size=16, validation_split=0.2)

### Method 4 using TextVectorization

In [45]:
# Word count of every training note, and the mean length.
sent_lens = [len(sent.split()) for sent in x_train]
avg_sent_lens = np.mean(sent_lens)
avg_sent_lens

In [46]:
import matplotlib.pyplot as plt
# Distribution of note lengths (in words).
plt.hist(sent_lens, bins=7)

In [47]:
# Sequence length that covers 95% of the training notes; used as the
# TextVectorization output length below.
output_seq_len = int(np.percentile(sent_lens, 95))
output_seq_len

In [48]:
# Number of distinct whitespace-separated tokens in the training notes.
full_text = " ".join(x_train)
full_text_list = list(set(full_text.split()))
print(len(full_text_list))

In [49]:
# Vocabulary cap passed to TextVectorization below.
# NOTE(review): presumably chosen from the distinct-token count printed above — confirm.
max_tokens = 12000

In [50]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Maps raw strings to integer token ids: vocabulary capped at `max_tokens`, output
# length fixed to `output_seq_len`.
text_vectorizer = TextVectorization(max_tokens=max_tokens,
                                    output_mode='int',
                                    output_sequence_length=output_seq_len)

In [51]:
# Learn the vocabulary from the training notes only.
text_vectorizer.adapt(x_train)

In [52]:
text_vocabulary = text_vectorizer.get_vocabulary()

In [53]:
# Trainable 128-dimensional embedding, one row per vocabulary entry.
# mask_zero=True marks token id 0 as masked for downstream layers.
token_embed = layers.Embedding(input_dim=len(text_vocabulary),
                              output_dim=128,
                              mask_zero=True,
                              name='token_embedding')

In [54]:
# Pair the raw note strings with their one-hot labels.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((x_test, val_labels_one_hot))

In [55]:
# Batches of 32 with prefetching.
# NOTE(review): no shuffle is applied to the training dataset here — confirm that is intended.
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [56]:
# Raw string in -> token ids -> learned embeddings -> Conv1D -> average pooling -> 3-way softmax.
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs)
text_embeddings = token_embed(text_vectors)
x = layers.Conv1D(64, kernel_size=5, padding='same', activation='relu')(text_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(3, activation='softmax')(x)

model_3 = tf.keras.Model(inputs, outputs)

# Categorical cross-entropy to match the one-hot labels carried by the datasets.
model_3.compile(loss="categorical_crossentropy",
               optimizer=tf.keras.optimizers.Adam(),
               metrics=['accuracy'])

In [57]:
model_3.summary()

In [58]:
# steps_per_epoch / validation_steps are set to the number of batches in each dataset,
# i.e. one full pass over the data per epoch.
model_3.fit(train_dataset,
            steps_per_epoch = len(train_dataset),
            validation_data=valid_dataset,
            validation_steps=len(valid_dataset),
            epochs=5)

### Method 5 Using Universal Sentence Encoder

In [59]:
# Number of notes in each split.
len(x_train), len(x_test)

In [60]:
# Rebuild the string/one-hot datasets (same construction as in the TextVectorization section).
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, train_labels_one_hot))
valid_dataset = tf.data.Dataset.from_tensor_slices((x_test, val_labels_one_hot))

In [61]:
# Batches of 32 with prefetching.
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
valid_dataset = valid_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [62]:
import tensorflow_hub as hub
# Universal Sentence Encoder loaded from TF Hub as a frozen (trainable=False) layer.
# NOTE(review): this rebinds `embedding_layer`, the name used earlier for the
# Word2Vec-initialised Embedding added to model_2 — re-running that earlier cell after
# this one would pick up the hub layer instead.
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                trainable=False,
                                name='universal_sentence_encoder')

In [73]:
# Raw string in -> frozen sentence encoder -> two dense layers with dropout -> 3-way softmax.
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = embedding_layer(inputs)
x = layers.Dense(128, activation="relu")(pretrained_embedding)
x = layers.Dropout(0.2)(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(3, activation="softmax")(x)

model_4 = tf.keras.Model(inputs, outputs)

# Categorical cross-entropy to match the one-hot labels carried by the datasets.
model_4.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [74]:
model_4.summary()

In [70]:
# Number of training batches per epoch.
len(train_dataset)

In [75]:
# One full pass over each dataset per epoch, for 5 epochs.
model_4.fit(train_dataset,
            steps_per_epoch = len(train_dataset),
            validation_data=valid_dataset,
            validation_steps=len(valid_dataset),
            epochs=5)