# Config

In [1]:
file_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/bq-results-20231215-074521-1702626519432.csv"
output_folder_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/processed_set"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# import package and load the data

In [3]:
import numpy as np
import torch
import os
import pandas as pd
import string
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

In [4]:
X_train = pd.read_csv(os.path.join(output_folder_path, 'X_train_raw.csv'))
X_test = pd.read_csv(os.path.join(output_folder_path, 'X_test_raw.csv'))
X_val = pd.read_csv(os.path.join(output_folder_path, 'X_val_raw.csv'))

X_train.drop_duplicates(subset=['icustay_id'])['icustay_id']

0         200006
12        200009
24        200014
36        200025
48        200028
           ...  
248736    299981
248748    299984
248760    299986
248772    299988
248784    299995
Name: icustay_id, Length: 20733, dtype: int64

In [5]:
X_train_tfidf = X_train[['icustay_id', 'slice_start', 'processed_text']]
X_test_tfidf = X_test[['icustay_id', 'slice_start', 'processed_text']]
X_val_tfidf = X_val[['icustay_id', 'slice_start', 'processed_text']]
X_train_tfidf

Unnamed: 0,icustay_id,slice_start,processed_text
0,200006,2159-09-03 11:28:14,missing
1,200006,2159-09-03 13:28:14,missing
2,200006,2159-09-03 15:28:14,missing
3,200006,2159-09-03 17:28:14,hospital ward name 4 icu nursing admitprogress...
4,200006,2159-09-03 19:28:14,patient admitted hospital ward name 6 ett stom...
...,...,...,...
248791,299995,2116-03-05 07:44:39,respiratory care pt extubated today 0745 post ...
248792,299995,2116-03-05 09:44:39,missing
248793,299995,2116-03-05 11:44:39,missing
248794,299995,2116-03-05 13:44:39,missing


In [6]:
X_train_tfidf['processed_text'] = X_train_tfidf['processed_text'].fillna('missing')
X_test_tfidf['processed_text'] = X_test_tfidf['processed_text'].fillna('missing')
X_val_tfidf['processed_text'] = X_val_tfidf['processed_text'].fillna('missing')
X_train_tfidf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_tfidf['processed_text'] = X_train_tfidf['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_tfidf['processed_text'] = X_test_tfidf['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_tfidf['processed_text'] = X_val_t

Unnamed: 0,icustay_id,slice_start,processed_text
0,200006,2159-09-03 11:28:14,missing
1,200006,2159-09-03 13:28:14,missing
2,200006,2159-09-03 15:28:14,missing
3,200006,2159-09-03 17:28:14,hospital ward name 4 icu nursing admitprogress...
4,200006,2159-09-03 19:28:14,patient admitted hospital ward name 6 ett stom...
...,...,...,...
248791,299995,2116-03-05 07:44:39,respiratory care pt extubated today 0745 post ...
248792,299995,2116-03-05 09:44:39,missing
248793,299995,2116-03-05 11:44:39,missing
248794,299995,2116-03-05 13:44:39,missing


In [7]:
# null_rows = X_train_tfidf_bert['processed_text'].isnull()
# null_indices = null_rows[null_rows].index

# print("Indices of rows with null 'processed_text':")
# print(null_indices)

In [8]:
# # Inspect row 34179 of the 'processed_text' column in X_train
# row_index = 34179
# suspect_text = X_train.iloc[row_index]

# print("Content at row 34179:")
# print(suspect_text)
# print("Type of the content:", type(suspect_text))


# TF-IDF embedding

In [9]:
def create_tfidf_features(df, column_name, max_features):
    """
    Apply TF-IDF vectorization to a specified column in the DataFrame and add the result as a new column.

    Args:
    df (pd.DataFrame): DataFrame containing the text data.
    column_name (str): Name of the column to vectorize.
    max_features (int): Maximum number of features to create with TF-IDF.

    Returns:
    pd.DataFrame: DataFrame with the new TF-IDF dense column.
    """
    # Initialize TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=max_features)

    # Fit and transform the specified text column
    tfidf_matrix = vectorizer.fit_transform(df[column_name])

    # Convert the TF-IDF sparse matrix to a dense format
    dense_tfidf = tfidf_matrix.todense()

    return dense_tfidf.tolist()

In [10]:
X_train_tfidf['tfidf_dense'] = create_tfidf_features(X_train_tfidf, 'processed_text', 20)
X_val_tfidf['tfidf_dense'] = create_tfidf_features(X_val_tfidf, 'processed_text', 20)
X_test_tfidf['tfidf_dense'] = create_tfidf_features(X_test_tfidf, 'processed_text', 20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_tfidf['tfidf_dense'] = create_tfidf_features(X_train_tfidf, 'processed_text', 20)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_tfidf['tfidf_dense'] = create_tfidf_features(X_val_tfidf, 'processed_text', 20)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_tfidf['tfidf_dense'] =

In [11]:
X_train_tfidf = X_train_tfidf.drop(['processed_text'], axis=1)
X_test_tfidf = X_test_tfidf.drop(['processed_text'], axis=1)
X_val_tfidf = X_val_tfidf.drop(['processed_text'], axis=1)

# Save training, testing and validation set to folder

In [None]:
# Save the datasets
X_train_tfidf.to_csv(os.path.join(output_folder_path, 'X_train_tfidf.csv'), index=False)
X_test_tfidf.to_csv(os.path.join(output_folder_path, 'X_test_tfidf.csv'), index=False)
X_val_tfidf.to_csv(os.path.join(output_folder_path, 'X_val_tfidf.csv'), index=False)

In [None]:
X_train_tfidf

Unnamed: 0,icustay_id,slice_start,tfidf_dense
0,200006,2159-09-03 11:28:14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,200006,2159-09-03 13:28:14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,200006,2159-09-03 15:28:14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,200006,2159-09-03 17:28:14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2487702556589..."
4,200006,2159-09-03 19:28:14,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
248791,299995,2116-03-05 07:44:39,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
248792,299995,2116-03-05 09:44:39,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
248793,299995,2116-03-05 11:44:39,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
248794,299995,2116-03-05 13:44:39,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
import pickle
# Save X_train_tfidf
with open(os.path.join(output_folder_path, 'X_train_tfidf.pkl'), 'wb') as file:
    pickle.dump(X_train_tfidf, file)

# Save X_test_tfidf
with open(os.path.join(output_folder_path, 'X_test_tfidf.pkl'), 'wb') as file:
    pickle.dump(X_test_tfidf, file)

# Save X_val_tfidf
with open(os.path.join(output_folder_path, 'X_val_tfidf.pkl'), 'wb') as file:
    pickle.dump(X_val_tfidf, file)