In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

In [None]:
# Fetch the NLTK resources used below; this only has to succeed once per environment.
for nltk_resource in ('punkt', 'stopwords'):  # tokenizer, then stopwords dictionary
    download(nltk_resource)
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the notes table from the CSV file sitting next to the notebook.
notes = pd.read_csv(filepath_or_buffer='Final_SWmerged.csv')

**Pre-Processing**

In [None]:
# Lower-case every note in place so later matching is case-insensitive.
notes["TEXT"] = notes["TEXT"].apply(lambda note: note.lower())

In [None]:
def clean_text(df):
  """Return the row's ``TEXT`` value as a single-line, single-spaced string.

  Args:
    df: A mapping-like row (e.g. the Series passed by ``DataFrame.apply``
      with ``axis=1``) that has a ``"TEXT"`` entry. The value is passed
      through ``str`` first, so non-string values are accepted.

  Returns:
    The text with every run of whitespace (newlines, tabs, repeated spaces)
    collapsed to one space and leading/trailing whitespace removed.
  """
  text = str(df["TEXT"])
  # Substitute a space rather than the empty string: deleting a newline or a
  # double space outright glues the neighbouring words into one token.
  return re.sub(r"\s+", " ", text).strip()

In [None]:
# Run clean_text over each row and store the result as a new column.
notes["TEXT_CLEAN"] = notes.apply(func=clean_text, axis="columns")

In [None]:
# Reuse the cleaned column computed in the previous cell instead of running the
# row-wise clean_text pass over the whole frame a second time. The copy keeps
# this working Series independent of the `notes` frame.
text_clean = notes["TEXT_CLEAN"].copy()

In [None]:
#Tokenize words and remove punctuation
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str`` and tokenised by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

In [None]:
# Materialise the generator: one token list per cleaned note.
sw_words = [tokens for tokens in sent_to_words(text_clean)]

In [None]:
#Building bigrams and trigrams
# Phrase model fitted on the tokenised notes.
bigram = gensim.models.Phrases(sw_words, min_count=5, threshold=100)
# Second phrase model, fitted on the notes after the bigram model has been applied to them.
# No min_count is passed here, so the library default is used.
trigram = gensim.models.Phrases(bigram[sw_words], threshold=100)

# Phraser wrappers around the two fitted models; make_bigrams/make_trigrams index into these.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
#Functions for creating bigrams and trigrams in documents

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each token list in ``texts``.

    Returns a new list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each token list in ``texts``.

    Returns a new list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [None]:
# Make bigrams and trigrams
sw_words_bigrams = make_bigrams(sw_words)

# NOTE(review): make_trigrams already applies bigram_mod before trigram_mod, so passing the
# bigrammed documents runs bigram_mod over them a second time. Confirm this is intended;
# make_trigrams(sw_words) would apply each model exactly once.
sw_words_trigrams = make_trigrams(sw_words_bigrams)

## W2V Model

In [None]:
# Build a Word2Vec model from the phrase-merged token lists.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4.x renamed it `vector_size`) —
# confirm the pinned gensim version before upgrading.
# NOTE(review): sg=1 presumably selects skip-gram; no seed is passed, so runs are not
# expected to be reproducible — verify against the gensim documentation.
w2v_model = gensim.models.Word2Vec(
        sw_words_trigrams,
        size=100,
        window=5,
        min_count=5,
        workers=10,
        sg=1)

In [None]:
# Run 10 epochs of training over the same corpus the model was constructed with.
# NOTE(review): the constructor above was already given this corpus, which in gensim
# normally trains the model; this call then looks like additional training on top —
# confirm that is intended.
w2v_model.train(sw_words_trigrams,total_examples=len(sw_words_trigrams),epochs=10)

(2395472, 3553050)

In [None]:
#Checking avg length of notes

def text_splitter(df):
    """Split the row's ``TEXT_CLEAN`` string on whitespace and return the pieces as a list."""
    return df["TEXT_CLEAN"].split()

def avg_doc_length(corp):
    """Print the summed length of all documents in ``corp`` and return the mean length.

    ``corp`` must support ``len`` and iteration, and each document must support ``len``.
    """
    token_count = sum(len(document) for document in corp)
    print (token_count)
    return token_count/len(corp)

## Convert Notes into Embeddings

In [None]:
# Keep a handle on the model's `.wv` attribute; the helpers below receive it as `word2vec_model`.
word_vectors = w2v_model.wv

In [None]:
def preprocess(text):
    """Tokenise ``text`` and keep only alphabetic tokens that are not stop words.

    Tokenisation uses ``word_tokenize``; stop words come from the module-level
    ``stop_words`` collection.
    """
    tokens = word_tokenize(text)
    # A token survives only if it is not a stop word and is purely alphabetic.
    return [
        token
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]

In [None]:
# One filtered token list per cleaned note.
corpus = list(map(preprocess, text_clean))
# Label columns are selected by position: columns 11 through 14 of `notes`.
labels = notes.iloc[:, 11:15]

In [None]:
#Filtering documents to make sure at least one word in a note has a vector representation

In [None]:
def filter_docs(corpus, texts, condition_on_doc):
    """
    Filter corpus and texts with the predicate condition_on_doc, which takes a doc.

    A document doc is kept if condition_on_doc(doc) is true; the text in the
    same position is kept or dropped along with it. Labels are NOT handled
    here: callers that hold a parallel label structure must filter it
    themselves with the same predicate.

    Args:
        corpus: iterable of documents.
        texts: iterable parallel to corpus, or None to skip text filtering.
        condition_on_doc: callable returning a truthy value for documents to keep.

    Returns:
        Tuple (corpus, texts) of filtered lists; texts is None if None was passed.
    """
    # Materialise the corpus and evaluate the predicate exactly once per document,
    # so both outputs are guaranteed to use the same keep/drop decisions.
    corpus = list(corpus)
    keep = [bool(condition_on_doc(doc)) for doc in corpus]

    if texts is not None:
        texts = [text for (text, kept) in zip(texts, keep) if kept]

    corpus = [doc for (doc, kept) in zip(corpus, keep) if kept]

    return (corpus, texts)

In [None]:
#Condition used for function above
def has_vector_representation(word2vec_model, doc):
    """Return True when at least one word of ``doc`` is a key of
    ``word2vec_model.vocab``; an empty ``doc`` gives False."""
    return any(word in word2vec_model.vocab for word in doc)

In [None]:
# Decide once, per note, whether any of its tokens has a learned vector.
keep_note = np.array([has_vector_representation(word_vectors, doc) for doc in corpus], dtype=bool)

corpus, text_clean  = filter_docs(corpus, text_clean, lambda doc: has_vector_representation(word_vectors, doc))

# Drop the same rows from the labels; otherwise any removed note would leave the
# label rows shifted relative to the embeddings built from `corpus`.
labels = labels.loc[keep_note]
assert len(labels) == len(corpus), "labels and corpus fell out of alignment"

In [None]:
def document_vector(word2vec_model, doc):
    """Return the element-wise mean of the vectors of the in-vocabulary words of ``doc``.

    Words that are not keys of ``word2vec_model.vocab`` are skipped; the
    remaining words are looked up together via ``word2vec_model[...]``.
    """
    # Only words the model knows about can be looked up.
    known_words = list(filter(lambda word: word in word2vec_model.vocab, doc))
    word_matrix = word2vec_model[known_words]
    # Collapse the per-word rows into a single note-level vector.
    return np.mean(word_matrix, axis=0)

In [None]:
# One averaged word-vector per note that survived filtering.
doc_embedding = [document_vector(word_vectors, tokens) for tokens in corpus]

In [None]:
# Stack the per-note vectors into one array (a row per note).
note_w2v = np.asarray(doc_embedding)

## Train Classifier

In [None]:
#Convert y to np arrays
# Independent NumPy copy of the label table so it can be indexed by fold positions.
label_array = labels.to_numpy(copy=True)

In [None]:
def _score_band(center, spread, z=1.96):
  """Return ``[center - z*spread, center + z*spread]`` as a two-element list."""
  return [center - (z*spread), center + (z*spread)]


def LR_trainer(X_trn,y_trn,l):
  """Cross-validate a multi-output logistic regression and summarise precision, recall and F1.

  A ``MultiOutputClassifier`` wrapping ``LogisticRegression`` is fitted on each
  training split of a shuffled 5-fold ``KFold`` (``random_state=1``) and scored
  on the held-out split, both macro-averaged and per label column.

  Args:
    X_trn: Feature array; it is indexed with the integer index arrays that
      ``KFold.split`` yields, so it must support NumPy-style fancy indexing.
    y_trn: Label array, indexed the same way as ``X_trn``.
      # assumes one column per label, rows aligned with X_trn — TODO confirm
    l: Used only to set ``C=1/l`` on the estimator (see the NOTE in the body).

  Returns:
    A flat tuple. First, for precision, recall and F1 in that order: the
    per-label mean over folds (array) followed by the macro mean over folds.
    Then, again for precision, recall and F1 in that order: one ``[lower, upper]``
    list per label column followed by one ``[lower, upper]`` list for the macro
    score. With four label columns this is the same 21-entry layout the
    function has always returned (the original code named the four columns
    ES, CF, C and PA); other label counts now produce a correspondingly
    shorter or longer tuple instead of failing or dropping columns.

    Each band is ``mean ± 1.96 * np.std(fold scores)``.
    NOTE(review): that is the spread of the fold scores, not a standard-error
    based interval for the mean — confirm this is what should be reported.
  """
  # NOTE(review): scikit-learn documents that C is ignored when penalty='none', so `l`
  # probably has no effect on the fitted model as written — confirm the intended penalty.
  # No max_iter is passed either, so the solver runs with the library default.
  lr = LogisticRegression(penalty='none',C=1/l) #solver='saga',l1_ratio=0.5
  mt_lr = MultiOutputClassifier(lr, n_jobs=1)

  kf = KFold(n_splits=5, shuffle=True,random_state= 1)

  # One slot per metric, in the order the results are returned.
  metrics = (precision_score, recall_score, f1_score)
  macro_folds = [[] for _ in metrics]   # per fold: macro-averaged score
  label_folds = [[] for _ in metrics]   # per fold: array with one score per label

  for train_index, test_index in kf.split(X_trn):
    X_train, X_test = X_trn[train_index], X_trn[test_index]
    y_train, y_test = y_trn[train_index], y_trn[test_index]

    fit_model = mt_lr.fit(X_train,y_train)
    y_pred = fit_model.predict(X_test)

    for slot, metric in enumerate(metrics):
      macro_folds[slot].append(metric(y_test, y_pred, average='macro'))
      label_folds[slot].append(metric(y_test, y_pred, average=None))

  means = []
  bands = []
  for macro_scores, label_scores in zip(macro_folds, label_folds):
    # Divide by the number of folds actually scored rather than a literal 5,
    # so the averages stay correct if n_splits above is ever changed.
    n_folds = len(macro_scores)
    macro_mean = sum(macro_scores)/n_folds
    label_mean = sum(label_scores)/n_folds
    label_std = np.std(label_scores,axis=0)

    means += [label_mean, macro_mean]
    # Per-label bands first, then the macro band, matching the original return order.
    bands += [_score_band(mean, std) for mean, std in zip(label_mean, label_std)]
    bands.append(_score_band(macro_mean, np.std(macro_scores)))

  return tuple(means + bands)

In [None]:
# Run the 5-fold evaluation on the note embeddings. The third argument only feeds
# C=1/l inside LR_trainer, whose estimator is constructed with penalty='none'.
LR_trainer(note_w2v,label_array,1) #No regularization

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

(array([0.74462592, 0.52129926, 0.35245098, 0.57458272]),
 0.5482397203297614,
 array([0.70279152, 0.37493673, 0.17383227, 0.40182191]),
 0.41334560849693247,
 array([0.72227498, 0.4329697 , 0.23107938, 0.47201988]),
 0.4645859852912982,
 [0.6878486914914329, 0.8014031525549318],
 [0.3500493785117351, 0.6925491347813966],
 [0.14976419847290395, 0.5551377623114098],
 [0.3994045829137326, 0.7497608616005492],
 [0.4469607255192356, 0.6495187151402873],
 [0.6508195563501222, 0.7547634909911567],
 [0.2335951921582677, 0.5162782757553736],
 [0.09867837430049238, 0.24898616321885458],
 [0.2776001851661295, 0.5260436300350628],
 [0.343562272006384, 0.48312894498748093],
 [0.6962720081454788, 0.7482779511141066],
 [0.29928550807070997, 0.566653885868684],
 [0.12036218937818266, 0.3417965699991685],
 [0.3322053609841673, 0.6118344087698881],
 [0.3843319312660298, 0.5448400393165665])