In [None]:
import pandas as pd
import os

from process_text import text_to_words
import TaggedDocumentStream

import gensim
from gensim.models import Doc2Vec

### 1. Read clinical notes
We use a large set of clinical notes in unprocessed form, with the following columns:
- `text`  The text
- `label` A label, required for training; it can be arbitrary (e.g. a counter)

The notes should be read into a Pandas dataframe, from any source (csv, excel, database, etc.).

In [None]:
# Load the raw clinical notes into a DataFrame; the preprocessing step reads
# its `text` and `label` columns.
notes_source_path = 'data/source/notes_full.csv'
notes = pd.read_csv(notes_source_path)

### 2. Preprocess text
We preprocess by tokenizing, removing stopwords, stemming, and removing remaining punctuation. We then write the preprocessed text to a `txt` file. This allows using a Python generator object for training the model, which is more memory-efficient than holding the whole corpus in memory.

In [None]:
# Processed notes and labels are written to two separate files that stay
# line-aligned: line k of the labels file is the label of the note on line k
# of the notes file.
notes_file_path = os.path.join('data', 'processed_notes', 'notes.txt')
label_file_path = os.path.join('data', 'processed_notes', 'labels.txt')

# Make sure the output directory exists, so the cell also runs on a fresh checkout
os.makedirs(os.path.dirname(notes_file_path), exist_ok=True)

# Open file handles for preprocessed notes and labels. The files are opened in
# write mode rather than append mode: re-running this cell then regenerates the
# corpus instead of silently duplicating every note and label in it.
with open(notes_file_path, 'w') as notes_file, open(label_file_path, 'w') as label_file:

    # Iterate over records (== notes). Walking the two columns in parallel avoids
    # a per-row `.loc` lookup, which is slow and yields several rows at once when
    # the index contains duplicate labels.
    for text, label in zip(notes['text'], notes['label']):

        # Records without any text (e.g. an empty csv field read as NaN) cannot
        # be tokenized and would never reach the 2-word minimum below
        if pd.isna(text):
            continue

        # Convert text to words
        words = text_to_words(text,
                              filter_stopwords=True,
                              stemming=True,
                              filter_periods=True
                             )

        # Only texts with at least 2 words
        if len(words) <= 1:
            continue

        # Write the note and its label to the same line number of the two files
        notes_file.write("{}\n".format(' '.join(words)))
        label_file.write("{}\n".format(label))

### 3. Train `paragraph2vec` model
We use a `TaggedDocumentStream` to read lines from the training corpus, then train a `paragraph2vec` model (called `Doc2Vec` in the `gensim` implementation).

In [None]:
# Define TaggedDocumentStream over the files written by the preprocessing step.
# The stream receives the file *paths* (`notes_file_path`, `label_file_path`);
# the handles used for writing are already closed at this point.
# NOTE(review): assumes `note_files` / `label_files` take lists of paths - confirm
# against the TaggedDocumentStream implementation.
# NOTE(review): the notebook does `import TaggedDocumentStream`, which binds a
# module and a module cannot be called. If it exposes a class of the same name
# that class is used; otherwise the name is used exactly as imported.
stream_factory = getattr(TaggedDocumentStream, 'TaggedDocumentStream', TaggedDocumentStream)
notes_stream = stream_factory(note_files=[notes_file_path], label_files=[label_file_path])

# Train paragraph2vec model on the stream defined above.
# NOTE(review): training runs several epochs, so the stream presumably has to be
# re-iterable rather than a one-shot generator - confirm.
pragraph2vec_model = Doc2Vec(notes_stream,
                             epochs=20,
                             min_count=20,
                             dm=1,
                             sample=1e-3,
                             vector_size=300,
                             window=2,
)

# Save model to disk (create the target directory first, in case it does not exist yet)
os.makedirs("models", exist_ok=True)
pragraph2vec_model.save("models/paragraph2vec_model")