In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

In [None]:
# Load the notes table from a CSV expected in the current working directory.
csv_path = 'Final_SWmerged.csv'
notes = pd.read_csv(csv_path)

**Pre-processing**

In [None]:
# Lower-case every note in place; each entry must be a str (a non-string such
# as NaN has no .lower() and raises AttributeError here).
notes["TEXT"] = notes["TEXT"].map(lambda note: note.lower())

In [None]:
def clean_text(df):
  """Return one row's note text with its whitespace normalised.

  Args:
    df: a single row (anything indexable by the key "TEXT", e.g. the Series
      that ``DataFrame.apply(..., axis=1)`` passes in).

  Returns:
    The "TEXT" value converted with ``str()``, with every run of whitespace
    (newlines, repeated spaces, tabs) collapsed to one space and the ends
    stripped.
  """
  text = str(df["TEXT"])
  # Substitute a single space rather than the empty string: deleting the
  # separator outright fuses the neighbouring words ("seen\nby" -> "seenby"),
  # which corrupts the tokens the vectorizer later counts.
  return re.sub(r"\s+", " ", text).strip()

In [None]:
# Run clean_text over each row (axis=1) and keep the result as a new column.
cleaned_notes = notes.apply(clean_text, axis=1)
notes["TEXT_CLEAN"] = cleaned_notes

In [None]:
# Take the columns at positions 11-14 (end-exclusive slice) as the targets.
# NOTE(review): presumably the four label columns scored downstream (ES, CF,
# C, PA) - confirm against the CSV header, since this relies on column order.
label_column_positions = slice(11, 15)
labels = notes.iloc[:, label_column_positions]

## Classifier

In [None]:
#Vectorize text
countvec_bow = CountVectorizer(ngram_range = (1,3), stop_words='english')
list_notes = list(notes["TEXT_CLEAN"])
notes_bow = countvec_bow.fit_transform(list_notes)

In [None]:
#Convert x and y to np arrays
label_array = np.array(labels)

In [None]:
def _spread_interval(center, spread):
  """Return ``[center - 1.96*spread, center + 1.96*spread]`` as a two-item list."""
  half_width = 1.96*spread
  return [center - half_width, center + half_width]


def LR_trainer(X_trn,y_trn,l,n_splits=5,random_state=1,**lr_params):
  """Cross-validate a multi-output logistic regression and summarise its scores.

  One LogisticRegression is wrapped in a MultiOutputClassifier, refitted on
  each shuffled K-fold split, and scored on the held-out fold with precision,
  recall and F1 (macro-averaged and per label).

  Args:
    X_trn: feature matrix indexable by an integer index array.
    y_trn: label array indexable the same way; columns 0-3 are reported
      individually under the names ES, CF, C and PA.
    l: regularisation strength, used as ``C = 1/l``. It only matters when a
      penalty is enabled through ``lr_params``: with the default
      ``penalty='none'`` scikit-learn ignores ``C``, so it is not passed.
    n_splits: number of folds (default 5, as before).
    random_state: seed for the fold shuffle (default 1, as before).
    **lr_params: extra LogisticRegression keyword arguments, e.g.
      ``penalty='elasticnet', solver='saga', l1_ratio=0.5``. The default
      ``penalty='none'`` is kept for backward compatibility.
      NOTE(review): newer scikit-learn releases spell this ``penalty=None`` -
      confirm against the installed version.

  Returns:
    A 21-tuple in this order:
      per-label precision means, macro precision mean,
      per-label recall means, macro recall mean,
      per-label F1 means, macro F1 mean,
      precision intervals for ES, CF, C, PA, then macro precision interval,
      recall intervals for ES, CF, C, PA, then macro recall interval,
      F1 intervals for ES, CF, C, PA, then macro F1 interval.
    Each interval is a ``[lower, upper]`` list.
  """
  n_labels = 4  # ES, CF, C, PA - the fixed slots of the return tuple

  lr_settings = {'penalty': 'none'}
  lr_settings.update(lr_params)
  # Only a penalised model uses C; skipping it otherwise avoids a pointless
  # 1/l (and a ZeroDivisionError for l=0) in the unpenalised default.
  if lr_settings['penalty'] not in (None, 'none'):
    lr_settings.setdefault('C', 1/l)

  lr = LogisticRegression(**lr_settings)
  mt_lr = MultiOutputClassifier(lr, n_jobs=1)

  kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

  # Parallel per-fold score lists, one slot per metric in this order.
  scorers = (precision_score, recall_score, f1_score)
  macro_by_fold = [[] for _ in scorers]
  label_by_fold = [[] for _ in scorers]

  for train_index, test_index in kf.split(X_trn):
    fit_model = mt_lr.fit(X_trn[train_index], y_trn[train_index])
    y_test = y_trn[test_index]
    y_pred = fit_model.predict(X_trn[test_index])

    for slot, scorer in enumerate(scorers):
      macro_by_fold[slot].append(scorer(y_test, y_pred, average='macro'))
      label_by_fold[slot].append(scorer(y_test, y_pred, average=None))

  summaries = []
  for macro_scores, label_scores in zip(macro_by_fold, label_by_fold):
    # Average over however many folds were run instead of a hard-coded 5.
    macro_mean = np.mean(macro_scores)
    label_mean = np.mean(label_scores, axis=0)
    label_std = np.std(label_scores, axis=0)

    # NOTE(review): these intervals are mean +/- 1.96 * the population standard
    # deviation (np.std, ddof=0) of the per-fold scores, i.e. the fold-to-fold
    # spread, not a standard-error interval, and the bounds are not clipped
    # to [0, 1]. Kept unchanged so reported numbers stay comparable.
    macro_ci = _spread_interval(macro_mean, np.std(macro_scores))
    label_cis = [_spread_interval(label_mean[k], label_std[k]) for k in range(n_labels)]

    summaries.append((label_mean, macro_mean, label_cis, macro_ci))

  (p_idv_cv, p_avg_cv, p_label_cis, p_ci) = summaries[0]
  (r_idv_cv, r_avg_cv, r_label_cis, r_ci) = summaries[1]
  (f1_idv_cv, f1_avg_cv, f1_label_cis, f1_ci) = summaries[2]

  return (p_idv_cv, p_avg_cv, r_idv_cv, r_avg_cv, f1_idv_cv, f1_avg_cv,
          *p_label_cis, p_ci,
          *r_label_cis, r_ci,
          *f1_label_cis, f1_ci)

In [None]:
# Original note: results are with elastic net (l=0.001, i.e. C=1/l=1000).
# NOTE(review): LR_trainer defaults to penalty='none', for which scikit-learn
# ignores C - confirm the elastic-net settings were active for these results.
LR_trainer(X_trn=notes_bow, y_trn=label_array, l=0.001)

In [None]:
# No regularization
LR_trainer(X_trn=notes_bow, y_trn=label_array, l=1)

(array([0.86872482, 0.63037836, 0.44694444, 0.77529221]),
 0.6803349583321645,
 array([0.77238014, 0.43858748, 0.14373182, 0.43568554]),
 0.4475962459186184,
 array([0.81687138, 0.51408271, 0.21468628, 0.55561112]),
 0.5253128702968197,
 [0.7768105421823455, 0.960639095415154],
 [0.5535244480782523, 0.7072322765082596],
 [0.048597098650306425, 0.8452917902385826],
 [0.6481385786680296, 0.9024458369163859],
 [0.5694654751843837, 0.7912044414799453],
 [0.7421095060955312, 0.8026507759127359],
 [0.3102985121467259, 0.5668764418308837],
 [-0.01614988287354352, 0.3036135263081488],
 [0.3774175934525042, 0.49395349447596076],
 [0.37600145209870095, 0.5191910397385359],
 [0.7798611198362634, 0.8538816403180843],
 [0.42689473844746534, 0.6012706765558311],
 [-0.007545899910601506, 0.436918454474238],
 [0.5108462252611035, 0.6003760073921738],
 [0.45772726170751665, 0.5928984788861228])