### Imports

In [None]:
import pandas as pd

import os
import csv

from process_text import text_to_words, text_to_vectors

import gensim
from gensim.models import Doc2Vec

### 1. Read data

In [None]:
# Admissions table (semicolon-separated); both admission boundaries are parsed as timestamps
admissions = pd.read_csv(
    "data/source/admissions.csv",
    sep=";",
    parse_dates=['start_datetime', 'end_datetime'],
)

# Incidents table (semicolon-separated); the incident moment is parsed as a timestamp
incidents = pd.read_csv(
    "data/source/incidents.csv",
    sep=";",
    parse_dates=['datetime'],
)

# Notes table (semicolon-separated); the note moment is parsed as a timestamp
notes = pd.read_csv(
    "data/source/notes.csv",
    sep=";",
    parse_dates=['datetime'],
)

# Previously trained paragraph2vec model (a gensim Doc2Vec object), loaded from disk
paragraph2vec_model = Doc2Vec.load('models/paragraph2vec_model')

### 2. Process incidents
We integrate incidents with admissions, based on a definition of a positive outcome: at least one violence incident after 24 hours of admission, and up to the first 28 days of admission.

In [None]:
# Derive a binary outcome per admission: 1 when at least one incident of the same patient
# falls between 24 hours and 28 days after the start of the admission, 0 otherwise.

# Pair every admission with every incident of the same patient (inner join on patient_id)
adm_incidents = admissions[['patient_id', 'start_datetime', 'admission_id']].merge(
    incidents[['patient_id', 'datetime']], how='inner', on='patient_id')

# Time between start of admission and each incident, expressed in (fractional) days
adm_incidents['day_diff'] = (
    (adm_incidents['datetime'] - adm_incidents['start_datetime']) / pd.Timedelta("24 hour")
)

# Only retain incidents after the first 24 hours, and up to the first 28 days of admission
adm_incidents = adm_incidents[(adm_incidents['day_diff'] >= 1) & (adm_incidents['day_diff'] <= 28)]

# Collapse to one row per admission by simply taking the first incident if multiple are present.
# No drop_duplicates() here: the groupby already guarantees a unique admission_id per row, and
# de-duplicating while admission_id is the index would ignore it and could silently discard a
# distinct admission whose remaining columns happen to equal those of another admission.
adm_incidents = adm_incidents.groupby("admission_id").first().reset_index()

# Merge this dataframe back to the original; admissions without a qualifying incident
# keep an empty day_diff because of the left join
admissions = admissions.merge(adm_incidents[['admission_id', 'day_diff']], how='left')

# Determine outcome (i.e. the day_diff variable is not empty), encoded as 0/1
admissions['outcome'] = admissions['day_diff'].notnull().map({False: 0, True: 1})

### 3. Process notes

In order to select a relevant subset of notes, we integrate them with admissions based on the start date of each admission.

In [None]:
# Attach admission info to every note of the same patient (inner join on patient_id)
notes = pd.merge(
    notes,
    admissions[['patient_id', 'admission_id', 'start_datetime', 'transfer', 'outcome']],
    on='patient_id',
    how='inner',
)

# Days between each note and the start of the admission;
# a positive value means the note was written before the admission started
notes['day_diff'] = (notes['start_datetime'] - notes['datetime']) / pd.Timedelta("24 hour")

# Look-back window for retrospective notes, in days:
# one week when `transfer` is truthy, four weeks otherwise
notes['threshold'] = notes['transfer'].apply(lambda is_transfer: 7 if is_transfer else 28)

# Keep notes written at most `threshold` days before the start of the admission,
# and less than 24 hours after it
notes = notes[(notes['day_diff'] > -1) & (notes['day_diff'] <= notes['threshold'])]

For each admission, if multiple notes are present, they are concatenated.

In [None]:
# One text per admission: all of its notes glued together with a newline in between
notes_concat = (
    notes
    .groupby("admission_id")['text']
    .agg("\n".join)
    .reset_index()
)

# Word count, taken as the number of pieces obtained by splitting on a single space
notes_concat['no_words'] = [len(text.split(" ")) for text in notes_concat['text']]

# Keep only the texts that count more than 100 words
notes_concat = notes_concat.loc[notes_concat['no_words'] > 100]

Add a vector representation, by first converting text into words (with additional stemming), and then using a paragraph2vec model to obtain vectors.

In [None]:
# Turn every concatenated text into words via text_to_words, with its
# stopword filtering, stemming and period filtering options switched on
notes_concat['words_stemmed'] = notes_concat['text'].apply(
    lambda note_text: text_to_words(
        note_text,
        filter_stopwords=True,
        stemming=True,
        filter_periods=True,
    )
)

In [None]:
# Collapse the words of each row back into one string, separated by single spaces
notes_concat['words_stemmed'] = notes_concat['words_stemmed'].apply(' '.join)

In [None]:
# Renumber the rows 0..n-1; the previous index is kept as a regular column named 'index'.
# NOTE(review): presumably needed so rows line up with the vectors concatenated further on — confirm
notes_concat = notes_concat.reset_index(drop=False)

In [None]:
# Convert the stemmed text of every row to a vector, using the paragraph2vec model
# NOTE(review): the meaning of the last argument (10) is defined by text_to_vectors — confirm there
note_vectors = text_to_vectors(
    notes_concat,
    'words_stemmed',
    paragraph2vec_model,
    10,
)

In [None]:
# Place the vector columns next to the existing columns; rows are matched on their index
notes_concat = pd.concat(
    [notes_concat, pd.DataFrame(note_vectors)],
    axis="columns",
)

Finally, merge the patient_id and outcome from the `admissions` table.

In [None]:
# Bring in outcome and patient_id from the admissions table;
# the join runs on the columns both frames share
notes_concat = pd.merge(
    notes_concat,
    admissions[['outcome', 'admission_id', 'patient_id']],
)

# Persist the processed notes for use by other notebooks:
# semicolon-separated, every field quoted, no index column
notes_concat.to_csv(
    "data/processed/notes.csv",
    sep=";",
    quoting=csv.QUOTE_ALL,
    index=False,
)

### 4. Descriptive statistics of dataset
Now that source files have been integrated, we can print some descriptive statistics of the dataset. 

In [None]:
# Compute length of stay as the number of calendar days between start and end of admission.
# Both timestamps are truncated to midnight with .dt.normalize() instead of being converted
# to Python date objects: the subtraction then stays in pandas' native timedelta dtype, so the
# result is a float column and a missing end date simply yields a missing length.
admissions['length'] = (
    admissions['end_datetime'].dt.normalize() - admissions['start_datetime'].dt.normalize()
) / pd.Timedelta("1 day")

# Headline figures of the integrated dataset
print("Number of admissions = {}".format(len(admissions)))
print("Number of unique patients = {}".format(admissions['patient_id'].nunique()))
print("Median length of admission = {}".format(admissions['length'].median()))
# outcome is coded 0/1, so its mean is the fraction of positive admissions
print("Admissions with positive outcome = {:.2f}%".format(100 * admissions['outcome'].mean()))
print("Median number of words in notes = {}".format(notes_concat['no_words'].median()))

Compute some additional statistics for incidents by integrating them with admissions.

In [None]:
# Pair every admission with every incident of the same patient
adminc = pd.merge(admissions, incidents, on='patient_id')

# Time from the start of the admission to the incident, in days
adminc['days_after_admission'] = (adminc['datetime'] - adminc['start_datetime']) / pd.Timedelta('1 day')

# Keep only incidents that took place between start and end of the admission (bounds included)
adminc = adminc[adminc['datetime'].between(adminc['start_datetime'], adminc['end_datetime'])]

# Counts of incidents during admission, overall and within the first 28 days / first 24 hours
print("Number of incidents during admission = {} ".format(len(adminc)))
print("Number of incidents within 28 days = {}".format((adminc['days_after_admission'] <= 28).sum()))
print("Number of incidents within 24 hours = {}".format((adminc['days_after_admission'] <= 1).sum()))