### Imports

In [None]:
import pandas as pd

import os
import csv

from process_text import text_to_words, text_to_vectors

import gensim
from gensim.models import Doc2Vec

### 1. Read data

In [None]:
# Load the admissions table; the admission start and end columns are parsed as datetimes
admissions = pd.read_csv(
    "data/source/admissions.csv",
    sep=";",
    parse_dates=['start_datetime', 'end_datetime'],
)

# Load the incidents table; the incident timestamp column is parsed as a datetime
incidents = pd.read_csv(
    "data/source/incidents.csv",
    sep=";",
    parse_dates=['datetime'],
)

In [None]:
# Load the notes table; the note timestamp column is parsed as a datetime
notes = pd.read_csv(
    "data/source/notes.csv",
    sep=";",
    parse_dates=['datetime'],
)

# Load the previously trained paragraph2vec (gensim Doc2Vec) model from disk
paragraph2vec_model = Doc2Vec.load('models/paragraph2vec_model')

### 2. Process incidents
We integrate incidents with admissions, based on the definition of a positive outcome: at least one violence incident that occurs at least 24 hours after the start of the admission and within the first 28 days of the admission.

In [None]:
# Pair every admission with every incident of the same patient (inner join on patient_id)
adm_incidents = admissions[['patient_id', 'start_datetime', 'admission_id']].merge(
    incidents[['patient_id', 'datetime', 'incident_id']],
    how='inner',
    on='patient_id',
)

# Time between start of admission and each incident, expressed in days
adm_incidents['day_diff'] = (
    (adm_incidents['datetime'] - adm_incidents['start_datetime']) / pd.Timedelta("24 hour")
)

# Only retain incidents after the first 24 hours, and up to the first 28 days of admission
in_window = (adm_incidents['day_diff'] >= 1) & (adm_incidents['day_diff'] <= 28)
adm_incidents = adm_incidents[in_window]

# Keep one row per admission: GroupBy.first() takes, per column, the first non-null
# value in row order (rows are not re-sorted by time here).
# Grouping already yields exactly one row per admission_id, so no drop_duplicates()
# follows. drop_duplicates() ignores the index, which would hold admission_id at that
# point, and could therefore discard an admission whose remaining column values
# happen to be identical to those of another admission.
adm_incidents = adm_incidents.groupby("admission_id", as_index=False).first()

# Attach the selected incident (if any) to every admission
admissions = admissions.merge(adm_incidents[['admission_id', 'day_diff', 'incident_id']], how='left')

# Determine outcome: 1 when a qualifying incident was found (day_diff is not empty), else 0
admissions['outcome'] = admissions['day_diff'].notnull().map({False: 0, True: 1})

# Admissions without a qualifying incident get an empty string as incident_id
admissions['incident_id'] = admissions['incident_id'].fillna("")

### 3. Process notes

In order to select a relevant subset of notes, we join the notes to the admissions and filter each note on its time relative to the start date of the admission. 

In [None]:
# Attach admission info to every note of the same patient (inner join on patient_id)
notes = notes.merge(
    admissions[['patient_id', 'admission_id', 'start_datetime', 'transfer', 'outcome', 'incident_id']],
    left_on='patient_id',
    right_on='patient_id',
    how='inner',
)

# Days from each note to the start of the admission
# (positive: note written before the admission started; negative: after)
notes['day_diff'] = (notes['start_datetime'] - notes['datetime']) / pd.Timedelta("24 hour")

# Look-back window for retrospective notes: 7 days when `transfer` is truthy, otherwise 28 days
notes['threshold'] = notes['transfer'].apply(lambda is_transfer: 7 if is_transfer else 28)

# Keep notes written at most `threshold` days before the admission started
# and less than 24 hours after it started
within_lookback = notes['day_diff'] <= notes['threshold']
within_first_day = notes['day_diff'] > -1
notes = notes[within_lookback & within_first_day]

For each admission, if multiple notes are present, they are concatenated.

In [None]:
# Per admission, glue all note texts together with a newline between consecutive notes
notes_concat = (
    notes.groupby("admission_id")['text']
    .agg(lambda texts: "\n".join(texts))
    .reset_index()
)

# Word count = number of pieces when splitting on single spaces
# (one more than the number of space characters)
notes_concat['no_words'] = notes_concat['text'].apply(lambda text: text.count(" ") + 1)

# Only keep concatenated texts with more than 100 words
notes_concat = notes_concat.loc[notes_concat['no_words'] > 100]

In [None]:
# Sanity check: every retained admission must have exactly one distinct gender value
# among its notes, so that gender can be carried into the final file
for adm_id in notes_concat['admission_id'].to_list():
    genders = set(notes.loc[notes['admission_id'] == adm_id, 'Geslacht'].to_list())
    if len(genders) == 1:
        continue
    raise Exception(adm_id, genders)

In [None]:
# Sanity check: the stored word count of each note must equal the number of
# pieces obtained by splitting its text on single spaces
if 'aantal_woorden_x' in notes.columns:
    # Normalise the '_x'-suffixed column name to 'aantal_woorden'
    notes = notes.rename(columns={'aantal_woorden_x': 'aantal_woorden'})
counted_words = notes['text'].apply(lambda text: text.count(' ') + 1)
n_word_len_mismatch = int((counted_words != notes['aantal_woorden']).sum())
if n_word_len_mismatch:
    raise Exception(n_word_len_mismatch)

In [None]:
def get_gender(dataframe):
    """Return the gender recorded for each admission.

    Groups ``dataframe`` by ``admission_id`` and takes the first ``Geslacht``
    value of every group; no check is made here that the values within a
    group agree.

    Returns a Series named ``Geslacht`` indexed by ``admission_id``.
    """
    genders_by_admission = dataframe.groupby('admission_id')['Geslacht']
    return genders_by_admission.agg(lambda values: values.iloc[0])

def merge_series_with_aggreggated_notes(agg, a_series):
    """Left-join ``a_series`` onto ``agg`` using ``admission_id`` as the key.

    Every row of ``agg`` is kept; rows without a match in ``a_series`` get a
    missing value in the added column.
    """
    return pd.merge(agg, a_series, how='left', on='admission_id')

In [None]:
# Self-test for get_gender and merge_series_with_aggreggated_notes on a tiny hand-made frame
test_df = pd.DataFrame({
    'Geslacht': ['Man', 'Vrouw', 'Vrouw', 'Man', 'Man'],
    'admission_id': [1, 1, 7, 11, 11],
})
genders = get_gender(test_df)
# Expect one entry per admission, holding the first gender listed for it
if not (len(genders) == 3 and genders[1] == 'Man' and genders[7] == 'Vrouw' and genders[11] == 'Man'):
    raise Exception('get_gender does not do what it is supposed to do')

# Admission 13 has no entry in `genders`, so the left join must leave it without a value
agg = pd.DataFrame({'admission_id': [1, 7, 13]})
merged = merge_series_with_aggreggated_notes(agg, genders)


def get_gender_agg(adm_id):
    """Return the single gender stored in ``merged`` for ``adm_id``."""
    the_genders = merged.loc[merged['admission_id'] == adm_id, 'Geslacht'].to_list()
    if len(the_genders) == 1:
        return the_genders[0]
    raise Exception('Too many genders')


# The last condition relies on NaN being unequal to itself: the value for the
# unmatched admission 13 passes only if it does not compare equal to itself
if not (
    len(merged) == 3
    and get_gender_agg(1) == 'Man'
    and get_gender_agg(7) == 'Vrouw'
    and get_gender_agg(13) != get_gender_agg(13)
):
    raise Exception(merged)

In [None]:
# Left-join the per-admission gender (column 'Geslacht') onto the concatenated notes
agg_notes_by_adm = merge_series_with_aggreggated_notes(notes_concat, get_gender(notes))

In [None]:
# Sanity check: the merged 'Geslacht' column must contain exactly two distinct values
if len(set(agg_notes_by_adm.Geslacht.to_list())) != 2:
    raise Exception('Incorrect genders')

We also want to add the datetimes of the first and the last note for each admission.

In [None]:
def get_min_and_max(a_list):
    """Return the smallest and largest element of ``a_list`` as ``(min, max)``.

    Args:
        a_list: Non-empty sequence of mutually comparable items.

    Returns:
        A tuple ``(the_min, the_max)``. For a one-element input both entries
        are that element.

    Raises:
        Exception: If ``a_list`` is empty.
    """
    if len(a_list) == 0:
        raise Exception('Cannot get min or max in empty list')
    # The built-ins replace the hand-written scan; like that scan, they keep
    # the earliest element when several compare equal.
    return (min(a_list), max(a_list))

In [None]:
# Smoke-test get_min_and_max on lists of one, two and three elements
single_passed = (3, 3) == get_min_and_max([3])
two_passed = (2, 3) == get_min_and_max([3, 2])
three_passed = (1, 3) == get_min_and_max([3, 1, 2])
if not (single_passed and two_passed and three_passed):
    raise Exception('Wrong min or max')

In [None]:
def get_note_datetimes(dataframe):
    """Return, per admission, the earliest and latest note datetime.

    Groups ``dataframe`` by ``admission_id`` and reduces the ``datetime``
    values of each group with ``get_min_and_max``, so every entry of the
    returned Series is a ``(first, last)`` tuple.
    """
    datetimes_by_admission = dataframe.groupby('admission_id')['datetime']
    return datetimes_by_admission.agg(lambda stamps: get_min_and_max(stamps.to_list()))

In [None]:
# Self-test for get_note_datetimes on a tiny hand-made frame with three admissions
timestamps = ['2020-04-01 12:59', '2020-03-19 13:00', '2020-02-13 11:00', '2020-04-01 11:00', '2020-04-02 11:00', '2020-03-30 11:00']
test_df = pd.DataFrame({
    'datetime': [pd.Timestamp(el) for el in timestamps],
    'admission_id': [1, 1, 7, 11, 11, 11],
})
note_datetimes = get_note_datetimes(test_df)
passed_len = len(note_datetimes) == 3


def tuple_matches_expected(admission_id, left, right):
    """Compare the day-of-month of the (first, last) tuple with the expected days."""
    first_note, last_note = note_datetimes[admission_id]
    return (first_note.day, last_note.day) == (left, right)


passed_1 = tuple_matches_expected(1, 19, 1)
passed_2 = tuple_matches_expected(7, 13, 13)
passed_3 = tuple_matches_expected(11, 30, 2)
if not (passed_len and passed_1 and passed_2 and passed_3):
    raise Exception(passed_len, passed_1, passed_2, passed_3)

In [None]:
# Per admission, the (first, last) note datetime tuple computed over the filtered notes
datetimes = get_note_datetimes(notes)

In [None]:
# Left-join the (first, last) note datetime tuple of every admission onto the aggregated notes
with_datetimes = merge_series_with_aggreggated_notes(agg_notes_by_adm, datetimes)

# Split the tuple into two separate columns
with_datetimes['first_note_datetime'] = with_datetimes['datetime'].apply(lambda x: x[0])
with_datetimes['last_note_datetime'] = with_datetimes['datetime'].apply(lambda x: x[1])

# Drop the tuple column. The column is selected by keyword (`columns=`): passing the
# axis positionally, as in drop([...], 1), is deprecated since pandas 1.3 and no
# longer accepted by pandas 2.x.
with_datetimes = with_datetimes.drop(columns=['datetime'])

In [None]:
# Write the aggregated notes (before any vector columns are added) to disk:
# semicolon-separated, without the index, with every field quoted
with_datetimes.to_csv('data/processed/notes_without_vectors.csv', sep = ';', index=False, quoting=csv.QUOTE_ALL)

Add a vector representation, by first converting text into words (with additional stemming), and then using a paragraph2vec model to obtain vectors.

In [None]:
# Continue on the same DataFrame object under the name notes_concat (no copy is made)
notes_concat = with_datetimes

# Convert each text to words (stop-word filtering, stemming and period filtering
# switched on) and store the words joined by single spaces
notes_concat['words_stemmed'] = notes_concat['text'].apply(
    lambda text: ' '.join(
        text_to_words(
            text,
            filter_stopwords=True,
            stemming=True,
            filter_periods=True,
        )
    )
)

# Move the current index into a regular column and renumber the rows from 0
notes_concat = notes_concat.reset_index()

In [None]:
# Convert the stemmed text in column 'words_stemmed' to vectors using the loaded paragraph2vec model
# NOTE(review): the meaning of the last argument (10) is defined in process_text.text_to_vectors — confirm there
note_vectors = text_to_vectors(notes_concat, 'words_stemmed', paragraph2vec_model, 10)

In [None]:
# Concatenate the vectors column-wise to the original dataframe.
# pd.concat(axis=1) aligns rows on the index; this relies on notes_concat having
# been re-indexed from 0 so that it matches the default index of the new DataFrame.
notes_concat = pd.concat([notes_concat, pd.DataFrame(note_vectors)], axis=1)

Finally, merge the patient_id and outcome from the `admissions` table. 

In [None]:
# Merge outcome and patient_id from the admissions table.
# No join key is given, so pandas joins on all columns the two frames share
# (presumably only admission_id — verify against the column lists).
notes_concat = notes_concat.merge(admissions[['outcome', 'admission_id', 'patient_id']])

# Write processed data to file for other notebooks:
# semicolon-separated, without the index, with every field quoted
notes_concat.to_csv("data/processed/notes.csv", 
                    sep=";", 
                    index=False, 
                    quoting=csv.QUOTE_ALL)

### 4. Descriptive statistics of dataset
Now that source files have been integrated, we can print some descriptive statistics of the dataset. 

In [None]:
# Compute length of stay in whole calendar days: both timestamps are truncated to
# midnight before subtracting. normalize() keeps the values as datetimes, so the
# subtraction and division are vectorised timedelta operations producing floats,
# rather than element-wise arithmetic on Python date objects; a missing timestamp
# yields a missing length.
admissions['length'] = (
    admissions['end_datetime'].dt.normalize() - admissions['start_datetime'].dt.normalize()
) / pd.Timedelta("1 day")

print("Number of admissions = {}".format(len(admissions)))
print("Number of unique patients = {}".format(admissions['patient_id'].nunique()))
print("Median length of admission = {}".format(admissions['length'].median()))
# outcome is 0/1, so its mean is the fraction of admissions with a positive outcome
print("Admissions with positive outcome = {:.2f}%".format(100 * admissions['outcome'].mean()))
print("Median number of words in notes = {}".format(notes_concat['no_words'].median()))

Compute some additional statistics for incidents by integrating them with admissions

In [None]:
# Pair each admission with all incidents of the same patient
adminc = admissions.merge(incidents, left_on='patient_id', right_on='patient_id')

# Keep only incidents that fall between start and end of the admission (bounds included)
started_before_incident = adminc['start_datetime'] <= adminc['datetime']
ended_after_incident = adminc['end_datetime'] >= adminc['datetime']
adminc = adminc[started_before_incident & ended_after_incident]

# Time from start of admission to the incident, in days
adminc['days_after_admission'] = (adminc['datetime'] - adminc['start_datetime']) / pd.Timedelta('1 day')

print("Number of incidents during admission = {} ".format(len(adminc)))
print("Number of incidents within 28 days = {}".format((adminc['days_after_admission'] <= 28).sum()))
print("Number of incidents within 24 hours = {}".format((adminc['days_after_admission'] <= 1).sum()))

In [None]:
# Count the in-admission incidents that happened at most 7 days after the start of admission
print("Number of incidents within 1 week = {}".format(sum(adminc['days_after_admission'] <= 7)))

Maybe because these numbers are small, Vincent carried on with a 28-day cut. I think what Marco is suggesting is that now that we can remove the cut on the end date of the data, we might find more incidents and therefore a bigger training sample.