<a href="https://colab.research.google.com/github/woodword-0/ML-Projects/blob/main/NLPSJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tensorflow import  keras
import tensorflow as tf
from tensorflow.keras import layers
import seaborn as sns
import pandas as pd
import numpy as np
import tensorflow_hub as hub
from matplotlib import pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string


In [None]:
data = pd.read_csv("/content/mtsamples.csv")

In [None]:
(data.to_numpy())[3][4]

In [None]:
df = data[['description', 'medical_specialty']]

In [None]:
specialties = df.groupby(df['medical_specialty'])

# Print a 1-based running number, the specialty label and its record count.
for i, (specialty, number) in enumerate(specialties, start=1):
    print(f"{i} {specialty} : {len(number)}")

In [None]:
df[df['medical_specialty']==' consult - history and phy.']

In [None]:
#convert all text into lower case
df = df.apply(lambda x: x.astype(str).str.lower())

In [None]:
#dropping record with no standard specialty
# Labels are written in lower case because the frame was lower-cased above.
non_specialty_labels = [
    ' soap / chart / progress notes',
    ' office notes',
    ' letters',
    ' lab medicine - pathology',
    ' ime-qme-work comp etc.',
    ' emergency room reports',
    ' discharge summary',
    ' consult - history and phy.',
]
# One drop over the union of matching rows replaces eight copy-pasted drops.
df = df.drop(df[df.medical_specialty.isin(non_specialty_labels)].index)


In [None]:
#dropping specialties with records<50
# NOTE(review): ' cosmetic/plastic surgery' has no spaces around '/', unlike the
# ' cosmetic / plastic surgery' label encoded later in this notebook — confirm
# which spelling the data actually uses.
# NOTE(review): ' surgery' is dropped here as well — confirm that it really
# belongs under the "<50 records" rule.
small_specialty_labels = [
    ' podiatry',
    ' dermatology',
    ' cosmetic/plastic surgery',
    ' dentistry',
    ' physical medicine - rehab',
    ' sleep medicine',
    ' endocrinology',
    ' bariatrics',
    ' chiropractic',
    ' diets and nutritions',
    ' rheumatology',
    ' speech - language',
    ' autopsy',
    ' allergy / immunology',
    ' hospice - palliative care',
    ' surgery',
]
# One drop over the union of matching rows replaces sixteen copy-pasted drops.
df = df.drop(df[df.medical_specialty.isin(small_specialty_labels)].index)


In [None]:
# Horizontal bar chart of the number of records per remaining specialty.
fig, ax = plt.subplots(figsize=(10,10))
sns.countplot(y='medical_specialty', data=df, ax=ax)
plt.show()

In [None]:
df.head()

In [None]:
def clean_text(description_):
    """Return `description_` with ASCII punctuation and digit characters removed.

    Punctuation is everything in `string.punctuation`; digits are all
    characters for which `str.isdigit()` is true.
    """
    # Mapping a code point to None makes str.translate delete that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    without_punctuation = description_.translate(punctuation_table)
    return ''.join(ch for ch in without_punctuation if not ch.isdigit())

In [None]:
#remove all punctuations and digits from the description column
df['description'] = df['description'].apply(clean_text)

In [None]:
df

In [None]:
#Lemmatizing with appropriate POS tag

def get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`, defaulting to noun.

    The word is tagged on its own with `nltk.pos_tag`; only the upper-cased
    first letter of the resulting tag is inspected.
    """
    first_letter = nltk.pos_tag([word])[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every other tag letter fall back to noun.
    return wordnet.NOUN


def lemmatize_text(description_):
    """Drop English stop words from `description_` and lemmatize what remains.

    The text is tokenized with `word_tokenize`; tokens found in NLTK's
    English stop-word list are removed, and each remaining token is
    lemmatized using the POS returned by `get_wordnet_pos`.

    Returns the lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))

    return ' '.join(
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in word_tokenize(description_)
        if word not in stop_words
    )

In [None]:
#remove English stop words and lemmatize the remaining words of the description column
df['description'] = df['description'].apply(lemmatize_text)

In [None]:
 from collections import Counter
 from tqdm import tqdm
 import math
 import operator
 import numpy as np 
 from scipy.sparse import csr_matrix

In [None]:
corpus_arr = df['description'].to_numpy()
corpus = corpus_arr.flatten().tolist()

In [None]:
def IDF(corpus, unique_words):
  """Compute a smoothed inverse document frequency for each word.

  Args:
    corpus: sequence of documents, each a whitespace-separated string.
    unique_words: iterable of words to score.

  Returns:
    dict mapping each word of `unique_words` (in iteration order) to
    log((1 + N) / (1 + df)) + 1, where N is the number of documents and
    df is the number of documents containing the word as a whole token.
    A word that occurs in no document (including the empty-corpus case)
    gets df = 0.
  """
  N = len(corpus)

  # Count, per token, the number of documents it occurs in. Each document is
  # split once; set() makes repeated tokens count once per document.
  document_frequency = {}
  for sen in corpus:
    for token in set(sen.split()):
      document_frequency[token] = document_frequency.get(token, 0) + 1

  # Every requested word gets an entry, even when the corpus is empty (the
  # original assigned inside the per-document loop and so returned {} then).
  return {
    word: math.log((1 + N) / (document_frequency.get(word, 0) + 1)) + 1
    for word in unique_words
  }

In [None]:
def fit(whole_data):
  """Build the vocabulary and IDF table for a corpus.

  Args:
    whole_data: list (or tuple) of documents, each a whitespace-separated
      string.

  Returns:
    (vocab, idf_values): `vocab` maps every distinct token of length >= 2 to
    its position in the alphabetically sorted token list; `idf_values` is the
    dict returned by `IDF` for those same tokens.

  Raises:
    TypeError: if `whole_data` is not a list or tuple. The original fell
      through and returned None, which only failed later when the caller
      unpacked the result.
  """
  if not isinstance(whole_data, (list, tuple)):
    raise TypeError(
      "fit() expects a list of document strings, got " + type(whole_data).__name__
    )

  # Single-character tokens are excluded from the vocabulary.
  unique_words = sorted(
    {token for document in whole_data for token in document.split() if len(token) >= 2}
  )
  vocab = {word: index for index, word in enumerate(unique_words)}
  Idf_values_of_all_unique_words = IDF(whole_data, unique_words)

  return vocab, Idf_values_of_all_unique_words
    
Vocabulary, idf_of_vocabulary=fit(corpus) 

In [None]:
 def transform(dataset,vocabulary,idf_values):
     """Turn documents into a sparse TF-IDF matrix.

     Args:
       dataset: sequence of documents, each a whitespace-separated string.
       vocabulary: dict mapping a word to its column index.
       idf_values: dict mapping a word to its IDF weight; it must contain
         every vocabulary word that occurs in `dataset`.

     Returns:
       scipy.sparse.csr_matrix of shape (len(dataset), len(vocabulary)) and
       dtype float64, where entry (row, col) is
       (occurrences of the word in the document / tokens in the document) * idf.
       Words missing from `vocabulary` are ignored.
     """
     row_indices, col_indices, values = [], [], []
     for row, document in enumerate(dataset):
         tokens = document.split()
         number_of_words_in_sentence = Counter(tokens)
         # Keyed by column so each (row, column) cell is emitted exactly once.
         row_entries = {}
         for word, occurrences in number_of_words_in_sentence.items():
             # Direct dict membership; the original rebuilt a list of all keys
             # for every token.
             if word in vocabulary:
                 row_entries[vocabulary[word]] = (occurrences / len(tokens)) * idf_values[word]
         for col, tf_idf_value in row_entries.items():
             row_indices.append(row)
             col_indices.append(col)
             values.append(tf_idf_value)

     # Build the matrix once from (value, (row, col)) triplets instead of
     # assigning single cells of a CSR matrix, which is slow.
     return csr_matrix(
         (
             np.asarray(values, dtype=np.float64),
             (np.asarray(row_indices, dtype=np.int64), np.asarray(col_indices, dtype=np.int64)),
         ),
         shape=(len(dataset), len(vocabulary)),
         dtype=np.float64,
     )
 final_output=transform(corpus,Vocabulary,idf_of_vocabulary)
 

In [None]:
set(df['medical_specialty'])

In [None]:
df['medical_specialty_code'] = df['medical_specialty']

In [None]:
df.shape

In [None]:
df['medical_specialty'].value_counts()

In [None]:
#manual encoding of y variable classes
# One mapping replaces seventeen copy-pasted replace() calls. Values that are
# not keys of the mapping are left untouched, as before.
# Code 0 is unused: the ' surgery' -> 0 mapping stays disabled.
specialty_codes = {
    ' cardiovascular / pulmonary': 1,
    ' cosmetic / plastic surgery': 2,
    ' ent - otolaryngology': 3,
    ' gastroenterology': 4,
    ' general medicine': 5,
    ' hematology - oncology': 6,
    ' nephrology': 7,
    ' neurology': 8,
    ' neurosurgery': 9,
    ' obstetrics / gynecology': 10,
    ' ophthalmology': 11,
    ' orthopedic': 12,
    ' pain management': 13,
    ' pediatrics - neonatal': 14,
    ' psychiatry / psychology': 15,
    ' radiology': 16,
    ' urology': 17,
}
df['medical_specialty_code'] = df['medical_specialty_code'].replace(specialty_codes)


In [None]:
set(df.medical_specialty_code)

In [None]:
# NOTE: sample(frac=1) returns a new, shuffled object. Neither result is
# assigned, so df keeps its row order; the second call is only shown as the
# cell output.
df.sample(frac=1)
df['medical_specialty_code'].sample(frac=1)

In [None]:
# Number of rows that share each distinct description text.
enc_nom_1 = (df.groupby('description').size()) 
# Bare expression in the middle of the cell: evaluated but not displayed.
enc_nom_1
# Replace every description with the number of rows that have the same text.
# NOTE(review): after this line df['description'] holds counts, not text —
# confirm that the later cells reading this column expect that.
df['description'] = df['description'].apply(lambda x : enc_nom_1[x])

In [None]:
# Split boundaries come from the frame length (80% / 10% / rest). The
# hard-coded 3033 / 3502 / 3791 bounds ran past the end of the filtered frame,
# leaving the validation and test slices empty, and y1_train used 3303 where
# X1_train used 3033.
n_rows = len(df)
train_end = int(0.8 * n_rows)
val_end = train_end + int(0.1 * n_rows)

# NOTE(review): X still contains 'medical_specialty_code', i.e. the target —
# confirm whether it should be dropped from the features as well.
X = df.drop(['medical_specialty'], axis=1)
# NOTE(review): the (1, -1) reshape of the training arrays is kept from the
# original; it flattens all rows into a single row — confirm this is intended.
X1_train = X.iloc[:train_end].to_numpy().reshape((1,-1))
X1_validate = X.iloc[train_end:val_end].to_numpy()#.reshape((1,-1))
X1_test = X.iloc[val_end:].to_numpy()#.reshape((1,-1))
y1_train = df['medical_specialty_code'].iloc[:train_end].to_numpy().reshape((1,-1))
y1_validate = df['medical_specialty_code'].iloc[train_end:val_end].to_numpy()
y1_test = df['medical_specialty_code'].iloc[val_end:].to_numpy()

In [None]:
X1_train.shape

(1, 5376)

In [None]:
y1_test

array([], dtype=int64)

In [None]:
# final_output_arr = final_output.toarray()

# X_train = final_output_arr[:3033]
# X_val = final_output_arr[3033:3502]
# X_test = final_output_arr[3502:3791]
# y_train = df['medical_specialty_code'].iloc[:3033].to_numpy()#.reshape((-1,1))
# y_val = df['medical_specialty_code'].iloc[3033:3502].to_numpy()#.reshape((-1,1))
# y_test = df['medical_specialty_code'].iloc[3502:3791].to_numpy()#.reshape((-1,1))

In [None]:
final_output_arr = final_output.toarray()
labels_arr = df['medical_specialty_code'].to_numpy()
# The TF-IDF rows were built from df['description'] in frame order, so both
# arrays must have one entry per row of df.
assert len(final_output_arr) == len(labels_arr), "features and labels are misaligned"

# Define a size for your train set 
train_size = int(0.8* len(df))
val_size = int(0.1* len(df))
# The test split takes whatever is left, so the three sizes add up to len(df).
test_size = len(df) - train_size - val_size

# Shuffle features and labels with the same seeded permutation before slicing;
# the earlier df.sample(frac=1) calls discard their result, so without this
# the split follows the row order of the file.
split_rng = np.random.default_rng(42)
shuffled_order = split_rng.permutation(len(labels_arr))
X_shuffled = final_output_arr[shuffled_order]
y_shuffled = labels_arr[shuffled_order]

X_train = X_shuffled[:train_size]
X_val = X_shuffled[train_size:(train_size+val_size)]
X_test = X_shuffled[(train_size+val_size):]
y_train = y_shuffled[:train_size]
y_val = y_shuffled[train_size:(train_size+val_size)]
y_test = y_shuffled[(train_size+val_size):]

In [None]:
X_train.shape

(2150, 4667)

In [None]:
y_val


In [None]:
def softmax(h):
  """Row-wise softmax of a 2-D array of scores.

  Args:
    h: array of shape (n_rows, n_classes).

  Returns:
    Array of the same shape whose rows are non-negative and sum to 1.
  """
  # Subtracting each row's maximum leaves the result unchanged mathematically
  # but keeps np.exp from overflowing to inf (and the ratio from becoming nan).
  shifted = h - np.max(h, axis=1, keepdims=True)
  exp_scores = np.exp(shifted)
  return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)


def cross_entropy(Y,P_hat):
  """Mean categorical cross-entropy.

  Args:
    Y: one-hot targets, shape (n_rows, n_classes).
    P_hat: predicted probabilities, same shape as Y.

  Returns:
    -(1 / n_rows) * sum(Y * log(P_hat)) as a float.
  """
  # Clip away exact zeros: log(0) is -inf and 0 * -inf is nan, which would
  # poison the whole loss even when the zero sits at a non-target class.
  eps = np.finfo(float).eps
  P_safe = np.clip(P_hat, eps, 1.0)
  return -(1/len(Y))* np.sum(Y*np.log(P_safe))


def accuracy(y, y_hat):
  """Fraction of predictions equal to the true labels.

  Args:
    y: true labels.
    y_hat: predicted labels of the same shape, or a 2-D array of per-class
      scores with one row per entry of a 1-D `y`, in which case the
      highest-scoring class of each row is used as the prediction.
  """
  y = np.asarray(y)
  y_hat = np.asarray(y_hat)
  # Only reduce scores to labels in the unambiguous (N,) vs (N, K) case.
  if y.ndim == 1 and y_hat.ndim == 2 and y_hat.shape[0] == y.shape[0]:
    y_hat = np.argmax(y_hat, axis=1)
  return np.mean(y == y_hat)

def indices_to_one_hot(data, nb_classes):
  """One-hot encode integer class indices.

  `data` is flattened to 1-D; the result has one row per index and
  `nb_classes` columns, with a 1.0 in the column given by the index.
  """
  flat_indices = np.ravel(np.array(data))
  identity = np.eye(nb_classes)
  return identity[flat_indices]

In [None]:
class MVLogisticRegression():
  """Multi-class logistic regression trained with full-batch gradient descent.

  The forward pass is softmax(X @ W + B); training minimises the value
  returned by `cross_entropy` on one-hot encoded labels.
  """

  def __init__(self,thresh=0.5):
    # Stored for interface compatibility; fit and predict do not read it.
    self.thresh = thresh


  def fit(self, X, y, eta=2e-1, epochs=1e3, show_curve = False):
    """Fit the weights W (D, K) and biases B (1, K) on X (N, D) and labels y.

    Args:
      X: feature matrix of shape (N, D).
      y: non-negative integer class labels, one per row of X.
      eta: learning rate.
      epochs: number of gradient steps (cast to int).
      show_curve: if true, plot the loss per epoch after training.
    """
    epochs = int(epochs)
    N,D = X.shape

    y = np.asarray(y).reshape(-1).astype(int)
    # One output column per label value 0..max(y). The previous
    # len(np.unique(y)) + 3 only covered the labels by luck and raised an
    # IndexError as soon as the largest label exceeded that count.
    K = int(y.max()) + 1
    Y = indices_to_one_hot(y, K).astype(int)
    self.W = np.random.randn(D, K)
    self.B = np.random.rand(1, K)
    J = np.zeros(epochs)

    for epoch in range(epochs):
      P_hat = self.__forward__(X)
      J[epoch]=cross_entropy(Y, P_hat)
      error = P_hat - Y
      self.W -= eta*(1/N)*X.T@error
      # Per-class bias gradient: sum over samples only. Summing over every
      # element collapsed it to a single scalar applied to all class biases
      # (and each row of P_hat - Y sums to ~0), so the biases never learned.
      self.B -= eta*(1/N)*np.sum(error, axis=0, keepdims=True)


    if show_curve:
      plt.figure()
      plt.plot(J)
      plt.xlabel('epochs')
      plt.ylabel(r"$\mathcal{J}$")
      plt.title("Training Curve")
      plt.show()

  def __forward__(self, X):
    """Return class probabilities of shape (N, K) for X of shape (N, D)."""
    return softmax(X@self.W + self.B)

  def predict(self, X):
    """Return the index of the most probable class for each row of X."""
    return np.argmax(self.__forward__(X), axis = 1)
  

Model 1

In [None]:
logreg = MVLogisticRegression()

In [None]:
logreg.fit(X_train,y_train,eta=5e-1, epochs=1e4, show_curve=True)

In [None]:
y_hat = logreg.predict(X_test)

In [None]:
accuracy(y_test,y_hat)

In [None]:
X_train.shape

Model 2

In [None]:
model = tf.keras.Sequential()
# model.add(hub_layer)
# The input width is read from the TF-IDF matrix instead of hard-coding 4667,
# so the model still builds when the vocabulary size changes.
model.add(tf.keras.layers.Dense(64,activation='relu',input_dim = X_train.shape[1]))
model.add(tf.keras.layers.Dense(32,activation='relu'))
model.add(tf.keras.layers.Dense(16,activation='relu'))
model.add(tf.keras.layers.Dense(32,activation='relu'))
# Labels are coded 1..17, so 18 units cover class indices 0..17.
model.add(tf.keras.layers.Dense(18,activation='softmax'))

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

# model.compile(optimizer='Adam',loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics=['accuracy'])

In [None]:
# An explicit validation set is passed, so validation_split is omitted: Keras
# ignores validation_split whenever validation_data is given.
history=model.fit(X_train,y_train,epochs=20, validation_data=(X_val, y_val),verbose=1)

In [None]:
results=model.evaluate(X_test,y_test, verbose=2)

In [None]:
y_hat = model.predict(X_test)


In [None]:
y_test.shape

(270,)

In [None]:
y_hat.shape

(270, 18)

In [None]:
def accuracy(y, y_hat):
  """Fraction of predictions equal to the true labels.

  Args:
    y: true labels.
    y_hat: predicted labels of the same shape, or a 2-D array of per-class
      scores with one row per entry of a 1-D `y`, in which case the
      highest-scoring class of each row is used as the prediction.
  """
  y = np.asarray(y)
  y_hat = np.asarray(y_hat)
  # Only reduce scores to labels in the unambiguous (N,) vs (N, K) case.
  if y.ndim == 1 and y_hat.ndim == 2 and y_hat.shape[0] == y.shape[0]:
    y_hat = np.argmax(y_hat, axis=1)
  return np.mean(y == y_hat)

In [None]:
# model.predict returns one probability per class, shape (n_samples, 18);
# take the most probable class before comparing with the integer labels.
accuracy(y_test, np.argmax(y_hat, axis=1))

In [None]:
# Vectorise the query the same way as the training text: clean, lemmatize,
# then TF-IDF with the fitted vocabulary and IDF table. transform() takes
# (documents, vocabulary, idf_values); the original passed X_train as the
# documents, a string as the vocabulary, and an undefined name for the IDF table.
query_text = lemmatize_text(clean_text('stomach pain'))
prediction = model.predict(transform([query_text], Vocabulary, idf_of_vocabulary).toarray())

In [None]:
prediction

Model 3

In [None]:
# Load the Universal Sentence Encoder (version 4) module from TF Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Wrap it as a Keras layer that takes scalar strings (input_shape=[], dtype=tf.string).
# trainable=True lets fit() update the encoder weights; False would freeze them.
hub_layer = hub.KerasLayer(embed, input_shape = [], dtype = tf.string, trainable= True)


In [None]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(4667,activation = 'relu')) 
model.add(tf.keras.layers.Dense(19, activation = "softmax")) #here we build a simple classifier

In [None]:
model.summary()

In [None]:
# The loss is selected by name and no from_logits argument is passed here.
# NOTE(review): the model's last layer is a softmax, so the loss receives probabilities — confirm that matches the loss default.
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics= 'accuracy'
              )

In [None]:
history = model.fit(X1_train,y1_train.T, epochs = 20,verbose = 1)

In [None]:
# X = np.random.randint(0,10, (1000,100))
# y = np.random.randint(0,3, 1000)

# `Dense` was never imported on its own, so it is referenced through the
# `layers` module imported at the top of the notebook. The input width is read
# from X_train rather than the hard-coded 4781, which does not match it.
model = tf.keras.Sequential([
    layers.Dense(128, input_dim = X_train.shape[1]),
    layers.Dense(18, activation='softmax'),
])
model.summary()
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=3)

In [None]:
X = np.random.randint(0,10, (1000,100))
y = np.random.randint(0,3, 1000)

In [None]:
X_train.shape

In [None]:
X.shape

In [None]:
y.shape