___
# Train word2vec/doc2vec
___

In [None]:
import re
import time

import gensim
import numpy as np
import pandas as pd
import spacy
from gensim.models.doc2vec import Word2Vec, Doc2Vec, TaggedDocument

import nlp_utils as utils
from custom_tokenizer import combined_rule_tokenizer

### Load raw data

In [None]:
# Read the prepared ED triage notes from CSV into a DataFrame,
# then show its first rows as the cell output
df = pd.read_csv(filepath_or_buffer="./data/rmh_data_prepared.csv")
df.head()

### Pre-process

In [None]:
# Ordered (pattern, replacement) rewrite rules used by ``preprocess``.
# The order is significant: later rules rely on the output of earlier ones
# (e.g. duplicated punctuation is collapsed only after abbreviations have been
# expanded, and whitespace is normalised last).  Patterns are compiled once
# here instead of on every call.
_PREPROCESS_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # "l)" to "left"
        (r"(^|\W)l\)", r"\1 left "),
        # "r)" to "right"
        (r"(^|\W)r\)", r"\1 right "),
        # "@" to "at"
        (r"@", r" at "),
        # "#" to "fracture" unless it is followed by a digit
        (r"#(?!\d)", r" fracture "),
        # "+ve" to "positive"
        (r"\+ve(?![a-z])", r" positive "),
        # "-ve" to "negative"
        (r"-ve(?![a-z])", r" negative "),
        # Remove "?" and "!"
        (r"\?|!", r" "),
        # Arrows of any length ("->", "-->", ...) to a single " -> "
        (r"-+>", r" -> "),
        # Remove "+" directly after a digit
        (r"(\d)\+", r"\1 "),
        # Replace each parenthesised group with commas.  Non-greedy, so that
        # several groups in one note are handled separately instead of being
        # merged into one span from the first "(" to the last ")".
        (r"\((.*?)\)[,.]?", r" , \1, "),
        # Replace each curly-bracketed group with dots
        (r"\{(.*?)\}", r" . \1. "),
        # Collapse duplicated punctuation marks.  "?" needs no rule here:
        # every "?" has already been removed above.
        (r"-{2,}", r"-"),
        (r"/{2,}", r"/"),
        (r"\+{2,}", r"+"),
        (r"_{2,}", r"_"),
        (r",{2,}", r","),
        (r"\.{2,}", r"."),
        # Collapse any run of whitespace into a single space
        (r"\s+", r" "),
    )
)


def preprocess(text):
    """
    Normalise a free-text note before tokenisation.

    The text is lower-cased and then passed through a fixed sequence of
    regex substitutions: shorthand is expanded ("l)" -> "left",
    "r)" -> "right", "@" -> "at", "#" -> "fracture", "+ve"/"-ve" ->
    "positive"/"negative"), "?" and "!" are removed, arrows are normalised
    to "->", bracketed groups are rewritten with commas or dots, and
    duplicated punctuation and whitespace are collapsed.

    Leading and trailing whitespace is collapsed to a single space but is
    not stripped.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    str
        The normalised text.
    """
    # Convert to lower case (the rules above only match lower-case letters)
    text = text.lower()

    for pattern, replacement in _PREPROCESS_RULES:
        text = pattern.sub(replacement, text)

    return text

In [None]:
# Run every triage note through preprocess() and keep the result
# in a new "text_clean" column alongside the raw text
df["text_clean"] = df["text"].apply(preprocess)

### Tokenise

> Alternatively, the cleaned text can simply be split on whitespace:
>
> `df.text_clean = df.text_clean.apply(lambda x: x.split())`

In [None]:
# Only the tokeniser of the scispacy model is needed here, so the tagger,
# parser and NER components are disabled when loading it
nlp = spacy.load(
    "en_core_sci_sm",
    disable=['tagger', 'parser', 'ner'],
)
# Swap in the custom rule-based tokeniser
nlp.tokenizer = combined_rule_tokenizer(nlp)

# Run the pipeline over the cleaned notes; each note becomes a spacy Doc
df["text_clean"] = list(nlp.pipe(df["text_clean"]))

In [None]:
def doc2list(doc):
    """Return the ``text`` of every token in ``doc`` as a list, in order."""
    return [tok.text for tok in doc]

# Replace each spacy Doc in "text_clean" with a plain list of its token texts
df["text_clean"] = df["text_clean"].apply(doc2list)

### Train and save the model
**Word2Vec**

In [None]:
# Train word2vec on the tokenised notes (min_count=1), print the model
# summary and save the model to disk
model = Word2Vec(sentences=df["text_clean"], min_count=1)
print(model)
model.save('./models/rmh_cleaned_w2v_model.bin')

**Doc2Vec**

In [None]:
# Tag each tokenised note with its position in the column, train doc2vec
# on the tagged notes (min_count=1), print the summary and save the model
tagged_docs = [
    TaggedDocument(words=tokens, tags=[idx])
    for idx, tokens in enumerate(df["text_clean"])
]
model = Doc2Vec(documents=tagged_docs, min_count=1)
print(model)
model.save('./models/rmh_cleaned_d2v_model.bin')

### Run the pre-trained model

In [None]:
def get_vectorizer(vectorizer_mode):
    """
    Build the vectorizer for the requested embedding model.

    Parameters
    ----------
    vectorizer_mode : str
        ``"word2vec"`` to get a ``MeanEmbeddingVectorizer`` or ``"doc2vec"``
        to get a ``DocEmbeddingVectorizer``.  Each one loads its saved model
        from the ``./models`` directory.

    Returns
    -------
    MeanEmbeddingVectorizer or DocEmbeddingVectorizer

    Raises
    ------
    ValueError
        If ``vectorizer_mode`` is not one of the supported modes.  Raising
        here avoids silently handing ``None`` back to the caller.
    """
    if vectorizer_mode == "word2vec":
        model_path = "./models/rmh_cleaned_w2v_model.bin"
        return MeanEmbeddingVectorizer(model_path)
    if vectorizer_mode == "doc2vec":
        model_path = "./models/rmh_cleaned_d2v_model.bin"
        return DocEmbeddingVectorizer(model_path)
    raise ValueError(
        "Unknown vectorizer_mode {!r}; expected 'word2vec' or 'doc2vec'".format(
            vectorizer_mode
        )
    )
    
    
class MeanEmbeddingVectorizer(object):
    """
    Vectorizer that represents a document as the mean of its word2vec vectors.

    Exposes ``fit`` / ``transform`` / ``fit_transform``; the embedding model
    is loaded from disk, so ``fit`` does nothing.
    """
    def __init__(self, model_path):
        """
        Load a saved Word2Vec model.

        Parameters
        ----------
        model_path : str
            Path of a model previously written with ``Word2Vec.save``.
        """
        self.model_path = model_path
        self.word2vec = Word2Vec.load(model_path)
        # Dimensionality of the word vectors (and of every output row)
        self.dim = self.word2vec.wv.vector_size

    def fit(self, X, y=None):
        """No-op: the model is already trained. Returns ``self``."""
        return self

    def transform(self, X):
        """
        Convert documents to mean word-embedding vectors.

        Parameters
        ----------
        X : iterable
            Documents. Each one is either a string, which is split on
            whitespace, or an already tokenised sequence of strings.

        Returns
        -------
        numpy.ndarray
            One row per document: the mean of the vectors of the words found
            in the model vocabulary, or a zero vector when none is found.
            The result has shape ``(0, dim)`` when ``X`` is empty.
        """
        wv = self.word2vec.wv
        rows = []
        for doc in X:
            words = doc.split() if isinstance(doc, str) else doc
            known = [wv[w] for w in words if w in wv]
            # Documents without any in-vocabulary word map to the zero vector
            rows.append(np.mean(known, axis=0) if known else np.zeros(self.dim))

        if not rows:
            # Keep the output two-dimensional even when there are no documents
            return np.zeros((0, self.dim))
        return np.array(rows)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)
    
    
class DocEmbeddingVectorizer(object):
    """
    Vectorizer that represents a document by its inferred doc2vec vector.

    Exposes ``fit`` / ``transform`` / ``fit_transform``; the embedding model
    is loaded from disk, so ``fit`` does nothing.
    """
    def __init__(self, model_path):
        """
        Load a saved Doc2Vec model.

        Parameters
        ----------
        model_path : str
            Path of a model previously written with ``Doc2Vec.save``.
        """
        self.model_path = model_path
        self.doc2vec = Doc2Vec.load(model_path)
        # Dimensionality of the model's vectors
        self.dim = self.doc2vec.wv.vector_size

    def fit(self, X, y=None):
        """No-op: the model is already trained. Returns ``self``."""
        return self

    def transform(self, X):
        """
        Convert documents to doc2vec vectors.

        Parameters
        ----------
        X : iterable
            Documents. Each one is either a string, which is split on
            whitespace, or an already tokenised sequence of strings.

        Returns
        -------
        numpy.ndarray
            One row per document, holding the result of
            ``infer_vector`` for that document's tokens.  The result has
            shape ``(0, dim)`` when ``X`` is empty.
        """
        rows = [
            self.doc2vec.infer_vector(
                doc.split() if isinstance(doc, str) else list(doc)
            )
            for doc in X
        ]
        if not rows:
            # Keep the output two-dimensional even when there are no documents
            return np.zeros((0, self.dim))
        return np.array(rows)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; ``y`` is ignored."""
        return self.transform(X)