In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit
import re

In [None]:
from azure.ai.ml import MLClient#, Input, command
from azure.identity import DefaultAzureCredential
import sys
sys.path.append("..")
from utils import azure_ml_configs

# Workspace coordinates, read from the azure_ml_configs module imported above.
subscription_id = azure_ml_configs.subscription_id
resource_group = azure_ml_configs.resource_group
workspace_name = azure_ml_configs.workspace_name
workspace_id = azure_ml_configs.workspace_id

# Build the client that serves as the handle to the workspace.
workspace_target = dict(
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)
ml_client = MLClient(credential=DefaultAzureCredential(), **workspace_target)

# Fetch version 1 of the named data asset and show the path it resolves to.
data_asset = ml_client.data.get(name="clinicalNote_AcuteReadmission", version=1)
print(f"Data asset URI: {data_asset.path}")

In [None]:
# Run configuration.
# discharge_notes_only: when True, the data-loading cell keeps only rows whose
# "Type" column matches the discharge-summary pattern; when False, every note
# type is kept.
discharge_notes_only = False

In [None]:
def preproc(text):
    """Normalise one note: lower-case, collapse whitespace, strip some punctuation.

    Steps, in order:
      1. lower-case the text;
      2. replace every run of whitespace with a single space;
      3. delete the characters , . : ; and the double quote.

    Because punctuation is removed *after* whitespace is collapsed, a
    punctuation mark surrounded by spaces leaves two consecutive spaces behind.

    Parameters
    ----------
    text : str or missing value
        Raw note text. ``None`` and float NaN (what pandas yields for an empty
        CSV cell) are treated as an empty note; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The normalised text ("" for missing input).
    """
    # `text != text` is only true for NaN; avoids needing pandas/numpy here.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    punct = ",.:;\""
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text.lower())
    return text.translate(str.maketrans("", "", punct))

In [None]:
# Loading and preparing data
cols = ["text_names_removed_step2", "Acute", "set", "Type", "PatientDurableKey", "EncounterKey", "CreationInstant"]
df = (
    pd.read_csv(data_asset.path, usecols=cols)
    # make sure the data is sorted by patient id, encounter and date
    .sort_values(by=["PatientDurableKey", "EncounterKey", "CreationInstant"])
    # rename main columns of interest
    .rename(columns={"text_names_removed_step2": "text", "Acute": "label"})
)

# Number of distinct encounters before any filtering.
print(len(df.EncounterKey.unique()))

if discharge_notes_only:
    # Match both spellings of the note type (with and without the accented
    # final letter). The accented alternative was previously stored as the
    # mis-decoded byte pair "√©" and therefore could never match.
    # na=False makes rows with a missing "Type" count as non-matching.
    is_discharge_note = df["Type"].str.contains(
        "Udskrivningsresume|Udskrivningsresumé", na=False
    )
    df = df[is_discharge_note].copy()

# Normalise every note with preproc (defined in an earlier cell).
df["text"] = df["text"].map(preproc)

In [None]:
def _texts_and_labels(split_name):
    """Return (text array, label array) for the rows of `df` whose `set` equals `split_name`."""
    rows = df[df.set == split_name]
    return rows.text.values, rows.label.values


# One (texts, labels) pair per predefined partition.
X_train, y_train = _texts_and_labels("train")
X_val, y_val = _texts_and_labels("val")
X_test, y_test = _texts_and_labels("test")

In [None]:
# Fold markers for PredefinedSplit so the grid search validates on the
# predefined validation set: -1 for every training sample, 0 for every
# validation sample, in the same order as the stacked arrays below.

n_train, n_val = len(X_train), len(X_val)
split_index = [-1] * n_train + [0] * n_val

# Stack train first, then validation (concatenation is along axis 0).
X = np.concatenate([X_train, X_val])
y = np.concatenate([y_train, y_val])

In [None]:
# Grid search over vectorizer / tf-idf settings

# Single train/validation fold defined by split_index.
pds = PredefinedSplit(split_index)

# Classifier at the end of the pipeline. C is left at its default;
# passing C=np.inf would fit without regularization.
log_reg = LogisticRegression(
    random_state=0,
    solver='saga',
    max_iter=10000,
    class_weight="balanced",
)

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', log_reg),
])

# Candidate settings: vocabulary size is fixed, n-gram range and idf weighting vary.
param_grid = {
    "vect__max_features": [5000],
    "vect__ngram_range": [(1,1), (1,2), (1,3), (1,4)],
    "tfidf__use_idf": [True, False],
}

# Two metrics are tracked; the "AP" one decides which candidate is refit.
scoring = {"AP": "average_precision", "AUC": "roc_auc"}

search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    cv=pds,  # specifying the predefined split for validation
    refit="AP",
    return_train_score=True,
    verbose=10,
    n_jobs=40,
)

search.fit(X, y)

print('Best parameter set: %s ' % search.best_params_)
print('Best score: ',search.best_score_)

In [None]:
# Re-fitting best and saving results (coefficients)
# NOTE(review): the vectorizer settings below are hard-coded rather than read
# from search.best_params_ — confirm they match the grid-search winner.

vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_val_vect = vectorizer.transform(X_val)

# Feature names for the coefficient table. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); fall back to the old
# method only when the new one does not exist (scikit-learn < 1.0).
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out()
else:
    vocab = vectorizer.get_feature_names()
# Join multi-word n-grams with underscores so each feature is a single token.
feats = ['_'.join(s.split()) for s in vocab]

clf = LogisticRegression(random_state=0,
                         solver='saga',
                         #C=np.inf, #C=np.inf to fit without regularization
                         max_iter=10000,
                         class_weight="balanced",
                         n_jobs=40)
clf.fit(X_train_vect, y_train)

# One row per coefficient vector in clf.coef_, one column per feature;
# written tab-separated without the index.
allcoefs = pd.DataFrame.from_records(clf.coef_, columns=feats)
allcoefs.to_csv("logreg_all_coefs_NOT_dedup.csv", sep="\t", index=False)

In [None]:
# Vectorise the held-out test notes with the already-fitted vectorizer.
X_test_vect = vectorizer.transform(X_test)

# Hard class predictions for each split.
preds_train, preds_val, preds_test = (
    clf.predict(matrix) for matrix in (X_train_vect, X_val_vect, X_test_vect)
)

# Class-probability matrices for each split.
probs_train, probs_val, probs_test = (
    clf.predict_proba(matrix) for matrix in (X_train_vect, X_val_vect, X_test_vect)
)

# Drop the first probability column and flatten what remains into a 1-D score
# array (for a two-class model this is the probability of the second class).
pos_probs_train, pos_probs_val, pos_probs_test = (
    proba[:, 1:].flatten() for proba in (probs_train, probs_val, probs_test)
)



def get_temp_df(df, pos_probs, labels, split, c=2):
    """Aggregate note-level scores into one row per patient-encounter.

    Rows of `df` belonging to `split` are identified by the string
    "<PatientDurableKey>_<EncounterKey>". `pos_probs` and `labels` must be
    aligned, element by element, with those rows in their current order.

    For every ID the function computes the mean score, the max score, the
    number of scores and the max label, then blends mean and max into

        p = (p_max + p_mean * n / c) / (1 + n / c)

    so that p is close to p_max when an encounter has few notes relative to
    `c` and approaches p_mean as the note count grows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `set`, `PatientDurableKey` and `EncounterKey`.
    pos_probs : array-like
        One score per row of the selected split.
    labels : array-like
        One label per row of the selected split.
    split : str
        Value of `df.set` selecting the rows to aggregate.
    c : number, default 2
        Scaling constant applied to the note count in the blend above.

    Returns
    -------
    pandas.DataFrame
        Columns ID, target, p_mean, p_max, n, p — one row per ID, sorted by ID.

    Raises
    ------
    ValueError
        Raised by pandas when `pos_probs` / `labels` do not have the same
        length as the selected rows.
    """
    rows = df[df.set == split]
    ids = [
        f"{patient}_{encounter}"
        for patient, encounter in zip(rows.PatientDurableKey.values, rows.EncounterKey.values)
    ]

    votedf = pd.DataFrame({"ID": ids,
                           "Label": labels,
                           "pred_score": pos_probs})

    # A single grouped pass yields every per-encounter statistic.
    temp = votedf.groupby("ID", as_index=False).agg(
        target=("Label", "max"),
        p_mean=("pred_score", "mean"),
        p_max=("pred_score", "max"),
        n=("pred_score", "count"),
    )

    # Vectorised blend of max and mean, weighted by the scaled note count.
    weight = temp["n"] / c
    temp["p"] = (temp["p_max"] + temp["p_mean"] * weight) / (1 + weight)

    return temp

In [None]:
# Encounter-level scores for the test split, written to CSV (index column included).
temp_test = get_temp_df(df=df, pos_probs=pos_probs_test, labels=y_test, split="test")
temp_test.to_csv(path_or_buf="LogRegAllNotDeDupBest_temp_test.csv")

In [None]:
import sys
# setting path
sys.path.append('..')
from utils.eval_utils import plot_auc_curve, MCCF1_threshold_and_metrics

# Label handed to the plotting helper below.
model_name = "Logistic Regression (All notes)"

# Encounter-level score tables for every split.
# NOTE(review): temp_val is built here but not used in this cell, and
# temp_test repeats the computation from the previous cell.
temp_train = get_temp_df(df,pos_probs_train, y_train, "train")
temp_val = get_temp_df(df,pos_probs_val, y_val, "val")
temp_test = get_temp_df(df,pos_probs_test, y_test, "test")

# Decision threshold passed to MCCF1_threshold_and_metrics.
t=0.5

print("TEST RESULTS")
# Both helpers come from utils.eval_utils; their exact output is not visible
# here — presumably an AUC plot and threshold-based metrics (TODO confirm).
plot_auc_curve(temp_test, model_name)
MCCF1_threshold_and_metrics(temp_train, temp_test,threshold=t, show_train_performance=True)