<a href="https://colab.research.google.com/github/szhou52/HS-651/blob/main/30_day_readmission_longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q pyspark==3.1.3 spark-nlp

In [None]:
import sparknlp

spark = sparknlp.start(gpu=True)

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

spark

Spark NLP version:  3.4.4
Apache Spark version:  3.1.3


In [None]:
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
from pyspark.ml.feature import SQLTransformer
from pyspark.ml.feature import StringIndexer

In [None]:
from pyspark.sql import SQLContext

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
readmission=pd.read_csv('gdrive/MyDrive/Colab_notebook/df_adm_dis_sum_not_cleaned.csv')

In [None]:
from sklearn.model_selection import train_test_split
df_train, df_test=train_test_split(readmission,test_size=0.2, random_state=42)

# sub-sampling the negatives (non-readmitted) on the training set
df_train_readm=df_train[df_train.READMISSION_STATUS=='Readmitted']
df_train_non_readm=df_train[df_train.READMISSION_STATUS=='Non-readmitted']
df_train_sub = pd.concat([df_train_readm, df_train_non_readm.sample(n = len(df_train_readm), random_state = 42)],axis = 0)

In [None]:
# Convert the pandas df to a spark df
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
train = spark.createDataFrame(df_train_sub)
test= spark.createDataFrame(df_test)

In [None]:
len(df_test)

9262

In [None]:
len(df_train_sub)

4522

Pipeline

In [None]:
%%time
# Produce pipeline for data cleaning and sentence(discharge summary) embedding
document_assembler = DocumentAssembler() \
      .setInputCol("TEXT_AGG") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
    
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

lemmatizer = Lemmatizer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma") \
    .setDictionary("gdrive/MyDrive/Colab_notebook/AntBNC_lemmas_ver_001.txt", value_delimiter ="\t", key_delimiter = "->")

longformer_embeddings = LongformerEmbeddings\
      .pretrained("longformer_base_4096")\
      .setInputCols(["document", "lemma"])\
      .setOutputCol("embeddings")\
      .setCaseSensitive(False)\
      .setMaxSentenceLength(4096)

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "embeddings"]) \
      .setOutputCol("document_embeddings") \
      .setPoolingStrategy("AVERAGE")
    
embeddings_finisher = EmbeddingsFinisher() \
      .setInputCols(["document_embeddings"]) \
      .setOutputCols(["finished_sentence_embeddings"]) \
      .setOutputAsVector(True)\
      .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

label_stringIdx = StringIndexer(inputCol = "READMISSION_STATUS", outputCol = "label")

nlp_pipeline_longformer = Pipeline(
stages=[document_assembler, 
          tokenizer,
            normalizer,
            stopwords_cleaner,
            lemmatizer,
            longformer_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
            label_stringIdx])

longformer_base_4096 download started this may take some time.
Approximate size to download 343.3 MB
[OK!]
CPU times: user 376 ms, sys: 53.9 ms, total: 430 ms
Wall time: 1min 10s


In [None]:
# nlp_longformer_model=nlp_pipeline_longformer.fit(train)

In [None]:
# nlp_longformer_model.write().overwrite().save('gdrive/MyDrive/Colab_notebook/Models_Pipelines/longformer2')

In [None]:
from pyspark.ml.pipeline import PipelineModel
nlp_longformer_model= PipelineModel.load("gdrive/MyDrive/Colab_notebook/Models_Pipelines/longformer2/")

In [None]:
processed_train=nlp_longformer_model.transform(train)

In [None]:
processed_test=nlp_longformer_model.transform(test)

In [None]:
processed_train=processed_train.select('features','label')

In [None]:
processed_test=processed_test.select('features','label')

In [None]:
processed_train=processed_train.repartition(100)

In [None]:
processed_test=processed_test.repartition(100)

In [None]:
# Reduce the size of test set for faster processing
# processed_test_sm,processed_test_re=processed_test.randomSplit([0.15,0.85])

In [None]:
# processed_train1,processed_train2,processed_train3,processed_train4=processed_train.randomSplit([0.25,0.25,0.25,0.25])

In [None]:
# processed_train1.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_train1")

In [None]:
# processed_train2.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_train2")

In [None]:
# processed_train3.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_train3")

In [None]:
# processed_train4.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_train4")

In [None]:
# processed_test1,processed_test2=processed_test_sm.randomSplit([0.5,0.5])

In [None]:
# processed_test1.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_test1")

In [None]:
# processed_test2.write.parquet("gdrive/MyDrive/Colab_notebook/transformed_data/long_test2")

In [3]:
# long_train1=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_train1')

In [4]:
# long_train2=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_train2')

In [5]:
# long_train3=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_train3')

In [6]:
# long_train4=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_train4')

In [8]:
# train_list=[long_train1,long_train2,long_train3,long_train4]

In [9]:
# pd_train=pd.concat(train_list)

In [11]:
# long_test1=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_test1')
# long_test2=pd.read_parquet('gdrive/MyDrive/Colab_notebook/transformed_data/long_test2')
# test_list=[long_test1,long_test2]
# pd_test=pd.concat(test_list)

In [12]:
# pd_train.to_csv('gdrive/MyDrive/Colab_notebook/transformed_data/long_train.csv')
# pd_test.to_csv('gdrive/MyDrive/Colab_notebook/transformed_data/long_test.csv')

In [13]:
pd_train=pd.read_csv('gdrive/MyDrive/Colab_notebook/transformed_data/long_train.csv')
pd_test=pd.read_csv('gdrive/MyDrive/Colab_notebook/transformed_data/long_test.csv')

In [14]:
X_train=pd_train.features
y_train=pd_train.label
X_test=pd_test.features
y_test=pd_test.label

In [35]:
X_train_trans=[]
for doc in X_train:
  begin=X_train[1].index('([')+2
  end=-3
  ori_embedding=X_train[1][begin:end]
  # Remove\n and space
  ori_embedding=ori_embedding.replace('\n','')
  ori_embedding=ori_embedding.replace(' ','')
  embedding_list=ori_embedding.split(",")
  str_to_num_list=[]
  for num_str in embedding_list:
    str_to_num_list.append(float(num_str))
  X_train_trans.append(str_to_num_list)

In [38]:
X_test_trans=[]
for doc in X_test:
  begin=X_test[1].index('([')+2
  end=-3
  ori_embedding=X_test[1][begin:end]
  # Remove\n and space
  ori_embedding=ori_embedding.replace('\n','')
  ori_embedding=ori_embedding.replace(' ','')
  embedding_list=ori_embedding.split(",")
  str_to_num_list=[]
  for num_str in embedding_list:
    str_to_num_list.append(float(num_str))
  X_test_trans.append(str_to_num_list)

Modeling

In [42]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l[K     |███▎                            | 10 kB 23.2 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 10.2 MB/s eta 0:00:01[K     |█████████▉                      | 30 kB 8.5 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 4.4 MB/s eta 0:00:01[K     |████████████████▍               | 51 kB 4.3 MB/s eta 0:00:01[K     |███████████████████▋            | 61 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████▉         | 71 kB 5.3 MB/s eta 0:00:01[K     |██████████████████████████▏     | 81 kB 5.3 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 92 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████████| 100 kB 4.1 MB/s 
Collecting pyaml>=16.9
  Downloading pyaml-21.10.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-21.10.1 scikit-optimize-0.9.0


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from skopt import BayesSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [45]:
# longformer vs Logistic regression
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

param= dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

logistic_clf_long = BayesSearchCV(estimator=LogisticRegression(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [46]:
logistic_clf_long.fit(X_train_trans,y_train)



BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=1),
              estimator=LogisticRegression(), n_jobs=-1, scoring='roc_auc',
              search_spaces={'C': [100, 10, 1.0, 0.1, 0.01], 'penalty': ['l2'],
                             'solver': ['newton-cg', 'lbfgs', 'liblinear']})

In [47]:
logistic_clf_long_best=logistic_clf_long.best_score_

In [48]:
logistic_clf_long_best

0.500523424086338

In [50]:
y_prob_logistic_clf_long = logistic_clf_long.predict_proba(X_test_trans)
roc_auc_y_prob_logistic_clf_long=roc_auc_score(y_test, y_prob_logistic_clf_long[:,1])

In [51]:
roc_auc_y_prob_logistic_clf_long

0.5

In [52]:
# longformer vs linear svm
c_values=[100, 10, 1.0, 0.1, 0.01]

param= dict(C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

lsvc_clf_long = BayesSearchCV(estimator=LinearSVC(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [53]:
lsvc_clf_long.fit(X_train_trans,y_train)



BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=1),
              estimator=LinearSVC(), n_jobs=-1, scoring='roc_auc',
              search_spaces={'C': [100, 10, 1.0, 0.1, 0.01]})

In [54]:
lsvc_clf_long_best=lsvc_clf_long.best_score_

In [55]:
lsvc_clf_long_best

0.5008146512958874

In [56]:
y_dec_func_lsvc_clf_long=lsvc_clf_long.decision_function(X_test_trans)
roc_auc_lsvc_clf_long=roc_auc_score(y_test, y_dec_func_lsvc_clf_long)

In [57]:
roc_auc_lsvc_clf_long

0.501901717465462

In [64]:
from sklearn.ensemble import RandomForestClassifier
max_depth=[5, 10, 15, 20]
min_samples_leaf=[5, 10, 20, 50, 100]
criterion=["gini", "entropy"]
n_estimators=[10,50,100,150]

param= dict(max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion,n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scoring='roc_auc'

rand_for_clf_long = BayesSearchCV(estimator=RandomForestClassifier(), search_spaces=param, scoring=scoring, n_jobs=-1, cv=cv)

In [65]:
rand_for_clf_long.fit(X_train_trans,y_train)



BayesSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=1),
              estimator=RandomForestClassifier(), n_jobs=-1, scoring='roc_auc',
              search_spaces={'criterion': ['gini', 'entropy'],
                             'max_depth': [5, 10, 15, 20],
                             'min_samples_leaf': [5, 10, 20, 50, 100],
                             'n_estimators': [10, 50, 100, 150]})

In [66]:
y_prob_rand_for_clf_long = rand_for_clf_long.predict_proba(X_test_trans)
roc_auc_rand_for_clf_long=roc_auc_score(y_test,y_prob_rand_for_clf_long[:,1])

In [67]:
roc_auc_rand_for_clf_long

0.5