## import packages

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_fscore_support

from matplotlib import pyplot as plt
from IPython.display import clear_output
from sklearn.metrics import classification_report
import keras
from keras.models import load_model
import xgboost, numpy, string, pandas
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
# import preproce_text 
import pandas as pd
from imblearn.over_sampling import RandomOverSampler 
import numpy as np
import pickle
import sklearn

## feature selection class

In [None]:
class FeatureSelect:
    """Build text features for a train/test pair of document collections.

    Every vectorizer and tokenizer is fitted on ``trainDF`` and then applied
    to ``train_x`` and ``test_x``.

    Args:
        train_x: Iterable of training documents (strings).
        test_x: Iterable of test documents (strings).
        trainDF: Corpus the vectorizers are fitted on. ``nlp_based`` also
            indexes it with ``'text'`` and writes new columns into it, so
            that method needs a DataFrame with a ``'text'`` column.
    """

    def __init__(self, train_x, test_x, trainDF):
        self.train_x = train_x
        self.test_x = test_x
        self.trainDF = trainDF

    def numpy_fillna(self, data):
        """Convert a sparse matrix to a dense, zero-padded 2-D array.

        Args:
            data: Object exposing ``toarray()`` (e.g. a scipy sparse matrix).

        Returns:
            A dense ``numpy`` array with the dtype of ``data.toarray()``.
        """
        data = data.toarray()
        # Length of every row; the mask below marks the valid cells per row.
        lens = numpy.array([len(row) for row in data])
        mask = numpy.arange(lens.max()) < lens[:, None]

        # Copy the row values into the masked (valid) positions.
        out = numpy.zeros(mask.shape, dtype=data.dtype)
        out[mask] = numpy.concatenate(data)
        return out

    def _fit_count_vectorizer(self):
        """Return a word-level ``CountVectorizer`` fitted on ``self.trainDF``."""
        count_vect = CountVectorizer(analyzer="word", token_pattern=r'\w{1,}')
        count_vect.fit(self.trainDF)
        return count_vect

    def count_vectors(self):
        """Return ``(train, test)`` word-count matrices."""
        count_vect = self._fit_count_vectorizer()
        xtrain_count = count_vect.transform(self.train_x)
        xtest_count = count_vect.transform(self.test_x)
        return xtrain_count, xtest_count

    def tf_idf(self, select):
        """Return TF-IDF features limited to the 500 top features.

        Args:
            select: ``"word"`` (word unigrams), ``"ngram"`` (word 2-3 grams)
                or ``"characters"`` (character 2-3 grams).

        Returns:
            ``(train, test)`` matrices for ``"word"`` and ``"characters"``.
            For ``"ngram"`` only the train matrix is returned; existing
            callers rely on that single-value result.

        Raises:
            ValueError: If ``select`` is not one of the supported modes
                (previously this silently returned ``None``).
        """
        if select not in ("word", "ngram", "characters"):
            raise ValueError(
                "select must be 'word', 'ngram' or 'characters', got %r" % (select,))

        if select == "word":
            vectorizer = TfidfVectorizer(
                analyzer='word', token_pattern=r'\w{1,}', max_features=500)
        elif select == "ngram":
            vectorizer = TfidfVectorizer(
                analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3),
                max_features=500)
        else:
            # token_pattern is only consulted by the 'word' analyzer, so it is
            # omitted here; passing it merely triggers a scikit-learn warning.
            vectorizer = TfidfVectorizer(
                analyzer='char', ngram_range=(2, 3), max_features=500)

        vectorizer.fit(self.trainDF)
        xtrain = vectorizer.transform(self.train_x)
        if select == "ngram":
            return xtrain
        return xtrain, vectorizer.transform(self.test_x)

    def word_embedding(self):
        """Tokenize the texts and build a GloVe-initialised embedding matrix.

        Reads ``./glove.twitter.27B.100d.txt`` from the working directory.

        Returns:
            ``(train_seq_x, valid_seq_x, embedding_matrix, word_index)``:
            the two padded (length 500) token-id sequences, a
            ``(len(word_index) + 1, 100)`` matrix whose row ``i`` holds the
            pre-trained vector of the word with index ``i`` (zeros when the
            word has no pre-trained vector), and the tokenizer's word index.
        """
        # Load the pre-trained vectors; the context manager closes the file
        # and the explicit encoding avoids platform-dependent decoding.
        embeddings_index = {}
        with open('./glove.twitter.27B.100d.txt', encoding='utf-8') as glove_file:
            for line in glove_file:
                values = line.split()
                if not values:
                    continue  # tolerate blank lines
                embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

        token = text.Tokenizer()
        token.fit_on_texts(self.trainDF)
        word_index = token.word_index

        # Convert texts to token-id sequences padded to a common length.
        train_seq_x = sequence.pad_sequences(token.texts_to_sequences(self.train_x), maxlen=500)
        valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(self.test_x), maxlen=500)

        # Row i of the matrix is the embedding of the word with index i.
        embedding_matrix = numpy.zeros((len(word_index) + 1, 100))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        return train_seq_x, valid_seq_x, embedding_matrix, word_index

    def nlp_based(self):
        """Add surface and part-of-speech count columns to ``self.trainDF``.

        Reads ``self.trainDF['text']`` and writes, in place, the columns
        ``char_count``, ``word_count``, ``word_density``,
        ``punctuation_count``, ``title_word_count``,
        ``upper_case_word_count`` and ``noun_count``, ``verb_count``,
        ``adj_count``, ``adv_count``, ``pron_count``.

        Raises:
            ImportError: If the ``textblob`` package is not installed.
        """
        # The tagger was referenced without ever being imported, so the POS
        # counts were silently all zero; import it where it is needed.
        import textblob

        frame = self.trainDF
        frame['char_count'] = frame['text'].apply(len)
        frame['word_count'] = frame['text'].apply(lambda x: len(x.split()))
        # +1 keeps the division defined for texts without any word.
        frame['word_density'] = frame['char_count'] / (frame['word_count'] + 1)
        frame['punctuation_count'] = frame['text'].apply(
            lambda x: sum(1 for ch in x if ch in string.punctuation))
        frame['title_word_count'] = frame['text'].apply(
            lambda x: sum(1 for wrd in x.split() if wrd.istitle()))
        frame['upper_case_word_count'] = frame['text'].apply(
            lambda x: sum(1 for wrd in x.split() if wrd.isupper()))

        pos_family = {
            'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
            'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
            'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            'adj': ['JJ', 'JJR', 'JJS'],
            'adv': ['RB', 'RBR', 'RBS', 'WRB'],
        }

        def check_pos_tag(x, flag):
            """Count the tokens of *x* whose POS tag belongs to family *flag*."""
            try:
                tags = textblob.TextBlob(x).tags
            except Exception:
                # Best effort, as before: a text that cannot be tagged
                # contributes a count of 0 instead of aborting the run.
                return 0
            return sum(1 for tup in tags if list(tup)[1] in pos_family[flag])

        for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
            frame[family + '_count'] = frame['text'].apply(
                lambda x, family=family: check_pos_tag(x, family))

    def lda(self):
        """Fit a 20-topic LDA model on the training word counts.

        Returns:
            ``(X_topics, topic_summaries)``: the document-topic matrix from
            ``fit_transform`` and, per topic, a space-joined string of its
            10 highest-weighted vocabulary words.
        """
        # Build the count features here; the method previously relied on
        # names (xtrain_count, count_vect) that were never defined in scope.
        count_vect = self._fit_count_vectorizer()
        xtrain_count = count_vect.transform(self.train_x)

        lda_model = decomposition.LatentDirichletAllocation(
            n_components=20, learning_method='online', max_iter=20)
        X_topics = lda_model.fit_transform(xtrain_count)
        topic_word = lda_model.components_

        # get_feature_names() is gone from newer scikit-learn releases.
        if hasattr(count_vect, 'get_feature_names_out'):
            vocab = numpy.array(count_vect.get_feature_names_out())
        else:
            vocab = numpy.array(count_vect.get_feature_names())

        n_top_words = 10
        topic_summaries = [
            ' '.join(vocab[numpy.argsort(topic_dist)][:-(n_top_words + 1):-1])
            for topic_dist in topic_word
        ]
        return X_topics, topic_summaries


## machine learning & deep learning class

In [None]:
class ModelSelect:
    """Train and score classifiers on one fixed train/test split.

    Args:
        xtrain_x: Training features.
        xtest_x: Test features.
        ytrain_y: Training labels.
        ytest_y: Test labels.
        vector_name: Optional tag stored on the instance; not used by any
            method of this class.
    """

    def __init__(self, xtrain_x, xtest_x, ytrain_y, ytest_y, vector_name=None):
        self.xtrain_x = xtrain_x
        self.xtest_x = xtest_x
        self.ytrain_y = ytrain_y
        self.ytest_y = ytest_y
        self.vector_name = vector_name

    def _classes(self):
        """Return the sorted distinct training labels."""
        return numpy.unique(self.ytrain_y)

    def _one_hot(self, labels, classes):
        """One-hot encode *labels* with one column per entry of *classes*.

        Encoding against a fixed class list keeps the train and test targets
        the same width even when a split lacks one of the classes.
        """
        labels = numpy.asarray(labels)
        return (labels[:, None] == numpy.asarray(classes)[None, :]).astype('float32')

    def train_model(self, classifier, is_neural_net=False, GB=False, epochs=None):
        """Fit *classifier* on the training split and score it on the test split.

        Args:
            classifier: Estimator with ``fit``/``predict`` (or a compiled
                Keras model when ``is_neural_net`` is true).
            is_neural_net: Train through the Keras path (one-hot targets,
                arg-max decoding of the predictions).
            GB: Convert the feature matrices with ``tocsc()`` before fitting
                and predicting. Takes precedence over ``is_neural_net``.
            epochs: Number of epochs forwarded to Keras ``fit``.

        Returns:
            ``(scores, classifier)`` where ``scores`` is the weighted
            ``precision_recall_fscore_support`` result on the test split.
        """
        if GB:
            classifier.fit(self.xtrain_x.tocsc(), self.ytrain_y)
            predictions = classifier.predict(self.xtest_x.tocsc())
        elif is_neural_net:
            classes = self._classes()
            transfered_train_y = self._one_hot(self.ytrain_y, classes)
            transfered_test_y = self._one_hot(self.ytest_y, classes)
            # `plot_losses` is the module-level PlotLosses instance created
            # in the notebook before any model is trained.
            classifier.fit(self.xtrain_x,
                           transfered_train_y,
                           epochs=epochs,
                           batch_size=32,
                           callbacks=[plot_losses],
                           validation_data=(self.xtest_x, transfered_test_y))
            probabilities = classifier.predict(self.xtest_x)
            # Map the winning column back to its label so the comparison
            # also holds for labels that are not already 0..K-1.
            predictions = classes[numpy.argmax(probabilities, axis=1)]
        else:
            classifier.fit(self.xtrain_x, self.ytrain_y)
            predictions = classifier.predict(self.xtest_x)

        scores = precision_recall_fscore_support(self.ytest_y, predictions, average='weighted')
        return scores, classifier

    def nb(self):
        """Train multinomial naive Bayes; return ``(scores, classifier)``."""
        accuracy, classifier = self.train_model(naive_bayes.MultinomialNB())
        return accuracy, classifier

    def linear_reg(self):
        """Train logistic regression; return ``(scores, classifier)``."""
        accuracy, classifier = self.train_model(linear_model.LogisticRegression())
        return accuracy, classifier

    def svm(self):
        """Train a support vector classifier; return ``(scores, classifier)``."""
        accuracy, classifier = self.train_model(svm.SVC())
        return accuracy, classifier

    def random_forest(self):
        """Train a random forest; return ``(scores, classifier)``."""
        accuracy, classifier = self.train_model(ensemble.RandomForestClassifier())
        return accuracy, classifier

    def gradient_boost(self):
        """Train an XGBoost classifier; return ``(scores, classifier)``."""
        accuracy, classifier = self.train_model(xgboost.XGBClassifier(), GB=True)
        return accuracy, classifier

    def _embedded_input(self, embedding_matrix, word_index):
        """Return ``(input_layer, embedded)`` for token-id sequence inputs.

        The embedding is initialised from *embedding_matrix*, frozen, and
        followed by spatial dropout.
        """
        input_layer = layers.Input((numpy.size(self.xtrain_x, 1), ))
        # The embedding width must equal the width of the supplied matrix.
        embedding_dim = numpy.shape(embedding_matrix)[1]
        embedded = layers.Embedding(len(word_index) + 1, embedding_dim,
                                    weights=[embedding_matrix],
                                    trainable=False)(input_layer)
        embedded = layers.SpatialDropout1D(0.3)(embedded)
        return input_layer, embedded

    def _fit_with_head(self, input_layer, features, hidden_units, dropout_rate,
                       optimizer, epochs):
        """Attach dense/dropout/softmax layers, compile, train and score.

        The softmax width equals the number of distinct training labels so
        it always matches the one-hot targets built in ``train_model``.
        """
        hidden = layers.Dense(hidden_units, activation="relu")(features)
        hidden = layers.Dropout(dropout_rate)(hidden)
        output = layers.Dense(len(self._classes()), activation="softmax")(hidden)

        model = models.Model(inputs=input_layer, outputs=output)
        # Keras cannot interpret None as an optimizer; use its usual default.
        model.compile('rmsprop' if optimizer is None else optimizer,
                      loss='categorical_crossentropy', metrics=['accuracy'])

        accuracy = self.train_model(model, is_neural_net=True, epochs=epochs)
        return accuracy, model

    def create_model_architecture(self, epochs=None):
        """Train a one-hidden-layer dense network on sparse feature input.

        Returns:
            ``(result, classifier)`` where ``result`` is the complete
            ``train_model`` return value, i.e. a ``(scores, classifier)``
            pair. The nesting is kept for backward compatibility.
        """
        input_layer = layers.Input((numpy.size(self.xtrain_x, 1), ), sparse=True)
        hidden_layer = layers.Dense(100, activation="relu")(input_layer)
        output_layer = layers.Dense(len(self._classes()), activation="softmax")(hidden_layer)

        classifier = models.Model(inputs=input_layer, outputs=output_layer)
        classifier.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

        accuracy = self.train_model(classifier, is_neural_net=True, epochs=epochs)
        return accuracy, classifier

    def create_cnn(self, epochs=None, embedding_matrix=None, word_index=None, optimizer='rmsprop'):
        """Train embedding -> Conv1D -> global max pooling -> dense head.

        Returns:
            ``(result, model)``; ``result`` is the ``(scores, classifier)``
            pair returned by ``train_model``.
        """
        input_layer, embedded = self._embedded_input(embedding_matrix, word_index)
        conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedded)
        pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
        return self._fit_with_head(input_layer, pooling_layer, 64, 0.2, optimizer, epochs)

    def create_rnn_lstm(self, epochs=None, embedding_matrix=None, word_index=None, optimizer=None):
        """Train embedding -> LSTM(128) -> dense head.

        Returns:
            ``(result, model)``; ``result`` is the ``(scores, classifier)``
            pair returned by ``train_model``.
        """
        input_layer, embedded = self._embedded_input(embedding_matrix, word_index)
        lstm_layer = layers.LSTM(128)(embedded)
        return self._fit_with_head(input_layer, lstm_layer, 50, 0.25, optimizer, epochs)

    def create_rnn_gru(self, epochs=None, embedding_matrix=None, word_index=None, optimizer=None):
        """Train embedding -> GRU(100) -> dense head.

        Returns:
            ``(result, model)``; ``result`` is the ``(scores, classifier)``
            pair returned by ``train_model``.
        """
        input_layer, embedded = self._embedded_input(embedding_matrix, word_index)
        gru_layer = layers.GRU(100)(embedded)
        return self._fit_with_head(input_layer, gru_layer, 50, 0.25, optimizer, epochs)

    def create_bidirectional_rnn(self, epochs=None, embedding_matrix=None, word_index=None, optimizer=None):
        """Train embedding -> bidirectional GRU(128) -> dense head.

        Returns:
            ``(result, model)``; ``result`` is the ``(scores, classifier)``
            pair returned by ``train_model``.
        """
        input_layer, embedded = self._embedded_input(embedding_matrix, word_index)
        recurrent_layer = layers.Bidirectional(layers.GRU(128))(embedded)
        return self._fit_with_head(input_layer, recurrent_layer, 50, 0.25, optimizer, epochs)

    def create_rcnn(self, epochs=None, embedding_matrix=None, word_index=None, optimizer=None):
        """Train embedding -> bidirectional GRU -> Conv1D -> pooling -> dense head.

        Returns:
            ``(result, model)``; ``result`` is the ``(scores, classifier)``
            pair returned by ``train_model``.
        """
        input_layer, embedded = self._embedded_input(embedding_matrix, word_index)
        rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedded)
        # Convolve over the recurrent outputs; feeding the raw embedding
        # here left the recurrent layer disconnected from the model.
        conv_layer = layers.Convolution1D(100, 3, activation="relu")(rnn_layer)
        pooling_layer = layers.GlobalMaxPool1D()(conv_layer)
        return self._fit_with_head(input_layer, pooling_layer, 50, 0.25, optimizer, epochs)

## utility tools

In [None]:
class PlotLosses(keras.callbacks.Callback):
    """Keras callback that redraws the loss curves after every epoch.

    The cell output is cleared before each redraw, so the plot appears to
    update in place inside the notebook.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of a training run."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []

        self.fig = plt.figure()

        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's ``loss``/``val_loss`` and redraw the curves."""
        # A fresh dict per call replaces the shared mutable default and
        # also covers an explicit ``logs=None``.
        logs = {} if logs is None else logs

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.i += 1

        clear_output(wait=True)
        plt.plot(self.x, self.losses, label="loss")
        plt.plot(self.x, self.val_losses, label="val_loss")
        plt.legend()
        plt.show()
        
def preprocess_text(row):
    """Return the sanitized string form of ``row['clean_text']``."""
    # NOTE(review): `preproce_text` is only referenced by a commented-out
    # import at the top of this file, so calling this raises NameError
    # unless that module is brought into scope elsewhere.
    sanitize = preproce_text.sanitize
    return sanitize(str(row['clean_text']))

def trans_row(row):
    """Return the ``'text'`` entry of *row* converted to ``str``."""
    text_value = row['text']
    return str(text_value)

def train_test_split_(inputfile, target_size=None, output=None):
    """Draw a label-proportional sample of about ``target_size`` rows.

    Despite the name, only a training sample is produced. Each ``label``
    group contributes ``int(len(group) * target_size / total_rows)`` rows;
    if truncation leaves the sample short, the gap is filled with rows
    re-drawn from the sample itself (so those rows are duplicated).

    Args:
        inputfile: Path of a CSV file that has a ``label`` column.
        target_size: Desired number of sampled rows.
        output: Optional CSV path the sample is written to (no index).

    Returns:
        The sampled ``DataFrame``.

    Raises:
        ValueError: If ``target_size`` is not given, or if pandas cannot
            draw the requested number of rows without replacement.
    """
    if target_size is None:
        raise ValueError("target_size is required")

    # target_size/output used to be passed positionally into read_csv,
    # where they would have been taken for parser options.
    train_test_data = pd.read_csv(inputfile)
    total_rows = len(train_test_data)

    if total_rows == 0:
        # Nothing to sample, and the fraction below would divide by zero.
        train_data = train_test_data.copy()
    else:
        sample_size = target_size / total_rows
        samples = [
            group.sample(int(len(group) * sample_size))
            for _, group in train_test_data.groupby('label')
        ]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        train_data = pd.concat(samples) if samples else train_test_data.iloc[0:0]

        shortfall = target_size - len(train_data)
        if shortfall > 0 and len(train_data) > 0:
            train_data = pd.concat([train_data, train_data.sample(shortfall)])

    if output is not None:
        train_data.to_csv(output, index=False)
    return train_data


## classifier Demo

In [None]:
# Load the annotated corpus from the Excel workbook in the working directory;
# the next cell reads its "text" and "label" columns.
corpus = pd.read_excel("./palliative_care_annotation.xlsx")

## text preprocessing

In [None]:
# Raw documents and their annotations as plain Python lists.
labels = corpus["label"].tolist()
texts = corpus["text"].tolist()

# Shared loss-plotting callback used by the neural-network training path.
plot_losses = PlotLosses()

# Working frame: original text, label, and the text forced to str.
trainDF = pandas.DataFrame({'text': texts, 'label': labels})
trainDF['text_str'] = trainDF.apply(trans_row, axis=1)

# The same string column serves as train set, test set and fitting corpus;
# the "ngram" mode returns a single feature matrix covering every row.
text_column = trainDF['text_str']
featureSelect = FeatureSelect(text_column, text_column, text_column)
xtrain_word = featureSelect.tf_idf(select="ngram")

## train test split

In [None]:
# Hold out 20% of the rows, stratified on the label column. The labels come
# straight from trainDF (the previously used `trainDF_label` was never defined).
xtrain_word_old, xtest_word, train_y, test_y = model_selection.train_test_split(
    xtrain_word, trainDF['label'], stratify=trainDF['label'], test_size=0.2)
# Fit the encoder on the training labels only and reuse that mapping for the
# test labels; refitting on the test split could assign different codes.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

## train models

In [None]:
# Train on the split produced above. The names used here before
# (`xtrain_word_new`, `ml_train_y`) are not defined anywhere in this notebook.
# NOTE(review): they may have come from a removed oversampling step
# (RandomOverSampler is imported but unused) - confirm whether the training
# split should be resampled before fitting.
modelSelect_word = ModelSelect(xtrain_word_old, xtest_word, train_y, test_y)
model_accuracy, model = modelSelect_word.linear_reg()