In [1]:
import numpy as np
import pandas as pd

This Jupyter notebook implements an NLP pipeline based on Bag of Words. It is divided into two parts: the tool functions and the final main NLP function. The NLP can be run either on each diagnostic (1, 2 and 3) individually, after which the resulting vectors are combined, or on the text of the three diagnostics (1, 2 and 3) combined into a single document. Different types of Bag of Words (simple or advanced) and of vocabulary extraction (simple or advanced) can also be selected with this NLP function.  

# Tool functions for tokenizing, stemming ...

In [2]:
def extract_and_combine(data, mode):
    """Build one text document per row from the diagnostic description columns.

    Parameters
    ----------
    data : object accepted by ``pd.DataFrame(data, columns=[...])``
        Table holding the ``diag_1_desc``, ``diag_2_desc`` and ``diag_3_desc`` columns.
    mode : int
        1, 2 or 3 keeps only that diagnostic description, 0 keeps all three.
        Any other value keeps none of them.

    Returns
    -------
    list of str
        One string per row, always laid out as ``"<diag 1> <diag 2> <diag 3>"``.
        A description that is not selected is replaced by an empty string, so
        the two separating spaces are always present.
    """
    def column_as_text(column_name):
        # Select the column and cast every cell to a unicode string
        frame = pd.DataFrame(data, columns=[column_name])
        return frame[column_name].values.astype('U')

    texts_1 = column_as_text('diag_1_desc')
    texts_2 = column_as_text('diag_2_desc')
    texts_3 = column_as_text('diag_3_desc')

    # Which descriptions take part in the combined document
    keep_1 = mode in (0, 1)
    keep_2 = mode in (0, 2)
    keep_3 = mode in (0, 3)

    return [
        " ".join((
            str(texts_1[row]) if keep_1 else "",
            str(texts_2[row]) if keep_2 else "",
            str(texts_3[row]) if keep_3 else "",
        ))
        for row in range(len(texts_1))
    ]

In [3]:
from string import punctuation
from nltk.stem import PorterStemmer

def clean_document(document, min_word_length):
    """Turn a raw text document into a list of cleaned, stemmed tokens.

    The document is split on whitespace, punctuation is stripped from every
    token, non-alphabetic tokens and English stop words are dropped, the
    remaining tokens are stemmed with the Porter stemmer and, finally, stems
    that are not strictly longer than ``min_word_length`` are removed.

    Parameters
    ----------
    document : str
        Raw text of one document.
    min_word_length : int
        Only stems with ``len(stem) > min_word_length`` are kept.

    Returns
    -------
    list of str
        The cleaned tokens, in their order of appearance (duplicates kept).
    """
    # Imported locally so this function does not depend on a later notebook
    # cell (the one importing ``stopwords``) having been executed first.
    from nltk.corpus import stopwords

    # Split on whitespace and strip punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in document.split()]

    # Remove remaining tokens that are not alphabetic
    tokens = [w for w in tokens if w.isalpha()]

    # Filter out stop words. The comparison is done on the lower-cased token:
    # NLTK's Porter stemmer lower-cases its output by default, so a
    # capitalised stop word ("The") would otherwise survive as "the".
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w.lower() not in stop_words]

    # Extract the stem of the words
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(w) for w in tokens]

    # Filter out short tokens (length is measured on the stem)
    tokens = [w for w in tokens if len(w) > min_word_length]

    return tokens

In [4]:
def filter_document_from_vocabulary(document, vocabulary, min_word_length):
    """Clean a document and keep only the tokens found in the vocabulary.

    Parameters
    ----------
    document : str
        Raw text of one document.
    vocabulary : container of str
        Accepted tokens; only membership (``in``) is used.
    min_word_length : int
        Forwarded to ``clean_document``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, in their original order.
    """
    kept_tokens = []
    for token in clean_document(document, min_word_length):
        if token in vocabulary:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [5]:
def extract_and_filter_corpus_from_vocabulary(data, vocabulary, mode, min_word_length):
    """Extract the corpus from ``data`` and filter every document by the vocabulary.

    Parameters
    ----------
    data : object accepted by ``extract_and_combine``
        Table holding the diagnostic description columns.
    vocabulary : iterable of str
        Accepted (cleaned and stemmed) tokens.
    mode : int
        Forwarded to ``extract_and_combine`` (0 = all diagnostics, 1/2/3 = one of them).
    min_word_length : int
        Forwarded to the document cleaning step.

    Returns
    -------
    list of str
        One filtered document per row of ``data``; each one is the
        space-joined list of its tokens that belong to the vocabulary.
    """
    # Extract corpus from data
    extracted_corpus = extract_and_combine(data, mode)

    # The vocabulary is only used for membership tests, once per token of every
    # document: build a set once so each test is a hash lookup instead of a
    # scan of the whole vocabulary list.
    vocabulary_lookup = set(vocabulary)

    # Filter all documents in corpus
    return [
        filter_document_from_vocabulary(document, vocabulary_lookup, min_word_length)
        for document in extracted_corpus
    ]
        

# Tool functions for vocabulary extraction (simple or advanced)

In [6]:
from collections import Counter
from nltk.corpus import stopwords

def construct_simple_vocabulary(data, min_freq_occurence, max_freq_occurence, mode, min_word_length):
    """Select vocabulary words from their occurrence count in the corpus.

    Parameters
    ----------
    data : object accepted by ``extract_and_combine``
        Table holding the diagnostic description columns.
    min_freq_occurence, max_freq_occurence : float
        Inclusive bounds on the ratio computed for each word (see below).
    mode : int
        Forwarded to ``extract_and_combine``.
    min_word_length : int
        Forwarded to ``clean_document``.

    Returns
    -------
    list of str
        Words whose ratio lies within the two bounds, in order of first appearance.
    """
    # Count every cleaned token over the whole corpus
    word_counts = Counter()
    for document in extract_and_combine(data, mode):
        word_counts.update(clean_document(document, min_word_length))

    # NOTE(review): the count is divided by the number of DISTINCT words, not
    # by the number of tokens or documents, so the ratio is not bounded by 1
    # -- confirm this normalisation is intended.
    nb_distinct_words = len(word_counts)

    selected_words = []
    for word, count in word_counts.items():
        ratio = count / nb_distinct_words
        if min_freq_occurence <= ratio <= max_freq_occurence:
            selected_words.append(word)

    return selected_words


In [7]:
def construct_advanced_vocabulary(data, threshold_proba_0, threshold_proba_1, 
                                  min_freq_occurence, max_freq_occurence, mode, min_word_length):
    """Select vocabulary words from their document frequency and class probability.

    For every cleaned word, ``compute_probability_and_frequency`` provides
    its probability (share of the documents containing it that are not in the
    ``readmitted == 0`` class) and its normalised document frequency. A word is
    kept when its frequency lies strictly between the two frequency bounds and
    its probability is discriminative enough:

    * probability < 0.5  -> kept if probability < ``threshold_proba_0``
    * probability >= 0.5 -> kept if probability > ``threshold_proba_1``

    Parameters
    ----------
    data : object accepted by ``extract_and_combine`` and ``compute_readmitted_0``
        Table holding the diagnostic descriptions and the ``readmitted`` column.
    threshold_proba_0, threshold_proba_1 : float
        Probability thresholds described above.
    min_freq_occurence, max_freq_occurence : float
        Exclusive bounds on the normalised document frequency.
    mode : int
        Forwarded to ``extract_and_combine``.
    min_word_length : int
        Forwarded to the document cleaning step.

    Returns
    -------
    list of str
        The selected words, in order of first appearance in the corpus.
    """
    # Row positions of the documents with readmitted = 0
    readmitted_0 = compute_readmitted_0(data)

    # Extract corpus from data
    corpus = extract_and_combine(data, mode)

    # For each word: (doc ids with class 0, doc ids with class 1)
    word_to_doc_class = compute_word_to_doc_class(corpus, min_word_length, readmitted_0)

    # Nothing survived the cleaning: there is nothing to select from
    if not word_to_doc_class:
        return list()

    # Probability and normalised frequency of every word
    (probability, frequency) = compute_probability_and_frequency(word_to_doc_class, len(corpus))

    # The statistics are produced by iterating over the dictionary, so its keys
    # line up with them one to one. Using the keys directly avoids cleaning and
    # stemming the whole corpus a second time just to rebuild the same list.
    candidate_words = list(word_to_doc_class)
    probabilities = np.ravel(probability)
    frequencies = np.ravel(frequency)

    vocabulary = list()
    for word, proba, freq in zip(candidate_words, probabilities, frequencies):

        # Too rare or too frequent
        if not (min_freq_occurence < freq < max_freq_occurence):
            continue

        if proba < 0.5:
            add_potential_word = proba < threshold_proba_0
        else:
            add_potential_word = proba > threshold_proba_1

        if add_potential_word:
            vocabulary.append(word)

    return vocabulary

In [8]:
import math

def compute_probability_and_frequency(word_to_doc_class, nb_documents):
    """Compute, for every word, its class-1 probability and its document frequency.

    Parameters
    ----------
    word_to_doc_class : dict
        Maps each word to a pair ``(doc_ids_0, doc_ids_1)`` of collections
        holding the ids of the documents of each class that contain the word.
    nb_documents : int
        Total number of documents in the corpus.

    Returns
    -------
    (probability, frequency) : tuple of numpy.ndarray, each of shape (nb_words, 1)
        ``probability`` is ``len(doc_ids_1) / (len(doc_ids_0) + len(doc_ids_1))``.
        ``frequency`` is the share of documents containing the word, divided by
        the largest such share so that the most frequent word gets 1.
        Rows follow the iteration order of ``word_to_doc_class``. Both arrays
        have shape (0, 1) when the dictionary is empty.
    """
    probability = list()
    frequency = list()
    for doc_ids_0, doc_ids_1 in word_to_doc_class.values():

        nb_docs_with_word = len(doc_ids_0) + len(doc_ids_1)
        frequency.append(nb_docs_with_word / nb_documents)
        probability.append(len(doc_ids_1) / nb_docs_with_word)

    # Column vectors
    probability = np.asarray(probability, dtype=float).reshape(-1, 1)
    frequency = np.asarray(frequency, dtype=float).reshape(-1, 1)

    # Normalise by the largest frequency; with no word at all there is nothing
    # to normalise (and taking the maximum of an empty array would fail).
    if frequency.size > 0:
        frequency = frequency / frequency.max()

    return probability, frequency

In [9]:
def compute_word_to_doc_class(corpus, min_word_length, readmitted_0):
    """Record, for every word, which documents of each class contain it.

    Parameters
    ----------
    corpus : iterable of str
        The documents; a document's id is its position in this iterable.
    min_word_length : int
        Forwarded to ``clean_document``.
    readmitted_0 : array-like or set of int
        Ids of the documents belonging to class 0; every other document is
        treated as class 1.

    Returns
    -------
    dict
        ``word -> (doc_ids_0, doc_ids_1)`` where both values are sets of
        document ids. Keys appear in order of first occurrence in the corpus.
    """
    # Membership is tested for every document: use a set of plain ints so the
    # test is a hash lookup rather than a scan of the whole id array.
    if isinstance(readmitted_0, (set, frozenset)):
        class_0_ids = readmitted_0
    else:
        class_0_ids = set(np.asarray(readmitted_0).ravel().tolist())

    # Dictionary: word -> (doc ids with class 0, doc ids with class 1)
    word_to_doc_class = dict()
    for doc_id, document in enumerate(corpus):

        # The class depends on the document only, so decide it once per document
        class_index = 0 if doc_id in class_0_ids else 1

        for w in clean_document(document, min_word_length):
            if w not in word_to_doc_class:
                word_to_doc_class[w] = (set(), set())
            word_to_doc_class[w][class_index].add(doc_id)

    return word_to_doc_class

In [10]:
def compute_readmitted_0(data):
    """Return the row positions whose ``readmitted`` value equals 0.

    Parameters
    ----------
    data : object accepted by ``pd.DataFrame(data, columns=['readmitted'])``
        Table holding the ``readmitted`` column.

    Returns
    -------
    numpy.ndarray
        1-D array of the positions (0-based) of the rows with ``readmitted == 0``.
    """
    # Flat array of the readmitted labels
    readmitted_frame = pd.DataFrame(data, columns=['readmitted'])
    labels = readmitted_frame.to_numpy().flatten()

    # Positions of the labels equal to 0
    is_class_0 = labels == 0
    return np.argwhere(is_class_0).flatten()

# Tool functions for Bag of Words (simple or advanced)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def simple_bag_of_words(filtered_data, vocabulary, min_word_length):
    """Build a binary presence matrix of the vocabulary words in each document.

    Parameters
    ----------
    filtered_data : sequence of str
        Documents that were ALREADY cleaned and filtered: each one is a
        space-separated list of vocabulary tokens.
    vocabulary : sequence of str
        Vocabulary words; column ``j`` of the result describes ``vocabulary[j]``.
    min_word_length : int
        Unused; kept so existing callers keep working. The documents are
        already cleaned, and cleaning them again would stem the stems a second
        time (the Porter stemmer is not idempotent) and could drop words that
        are present.

    Returns
    -------
    numpy.ndarray of shape (nb_documents, nb_vocab_words)
        1.0 where the word occurs in the document, 0.0 elsewhere.
    """
    # Lengths
    nb_documents = len(filtered_data)
    nb_vocab_words = len(vocabulary)

    # word -> columns describing it (a list, in case a word is repeated)
    columns_of_word = {}
    for j, word in enumerate(vocabulary):
        columns_of_word.setdefault(word, []).append(j)

    # Mark the words present in each document
    bag_of_words = np.zeros((nb_documents, nb_vocab_words))
    for i, document in enumerate(filtered_data):
        for token in set(document.split()):
            columns = columns_of_word.get(token)
            if columns is not None:
                bag_of_words[i, columns] = 1

    return bag_of_words

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

def advanced_bag_of_words(filtered_data, global_vocabulary, max_ngram_size):
    """Build a TF-IDF matrix whose columns follow the global vocabulary.

    A ``TfidfVectorizer`` using n-grams of size 1 to ``max_ngram_size`` is
    fitted on ``filtered_data``; the scores of the terms that belong to
    ``global_vocabulary`` are then copied into a matrix with one column per
    vocabulary word.

    Parameters
    ----------
    filtered_data : sequence of str
        The documents to vectorize (the vectorizer is fitted on these documents).
    global_vocabulary : sequence of str
        Vocabulary words; column ``j`` of the result describes word ``j``.
    max_ngram_size : int
        Upper bound of the ``ngram_range`` given to the vectorizer.

    Returns
    -------
    numpy.ndarray of shape (nb_documents, nb_vocab_words)
        TF-IDF score of each vocabulary word in each document; a word unknown
        to the fitted vectorizer gets a column of zeros. For an empty
        vocabulary a single zero column is returned, as before.
    """
    # Number of documents 
    nb_documents = len(filtered_data)

    # Compute the TFIDF scores (kept sparse until the needed columns are selected)
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, max_ngram_size))
    tfidf_scores = tfidf_vectorizer.fit_transform(filtered_data)

    # get_feature_names() no longer exists in recent scikit-learn releases;
    # fall back to it only when the replacement is not available.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        local_vocabulary = tfidf_vectorizer.get_feature_names_out()
    else:
        local_vocabulary = tfidf_vectorizer.get_feature_names()

    # term -> column in the TF-IDF matrix, built once
    column_of_term = {term: column for column, term in enumerate(local_vocabulary)}

    global_vocabulary = list(global_vocabulary)
    if not global_vocabulary:
        # Historical behaviour for an empty vocabulary: one column of zeros
        return np.zeros((nb_documents, 1))

    # Format the scores to the vocabulary: allocate once, fill the known words
    bag_of_words = np.zeros((nb_documents, len(global_vocabulary)))
    target_columns = []
    source_columns = []
    for target, word in enumerate(global_vocabulary):
        source = column_of_term.get(word)
        if source is not None:
            target_columns.append(target)
            source_columns.append(source)

    if target_columns:
        bag_of_words[:, target_columns] = tfidf_scores[:, source_columns].toarray()

    return bag_of_words
    

# Other tool functions 

In [13]:
from csv import reader

def load_csv_file(filename, is_numpy):
    """Load either a numeric matrix or a single row of strings from a CSV file.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    is_numpy : bool
        True to parse the whole file as a comma-separated numeric array with
        ``np.genfromtxt``; False to read only the first CSV row.

    Returns
    -------
    numpy.ndarray or list of str
        The numeric array, or the fields of the first row. An empty file gives
        an empty list.
    """
    if is_numpy:
        return np.genfromtxt(filename, delimiter=',')

    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends (needed for quoted fields containing newlines).
    with open(filename, 'r', newline='') as read_obj:
        # Only the first row is needed: do not materialise the whole file
        return next(reader(read_obj), [])

In [14]:
def combine_score(score_1, score_2, score_3):
    """Place the three score matrices side by side.

    Parameters
    ----------
    score_1, score_2, score_3 : array-like
        Matrices concatenated along axis 1, in this order.

    Returns
    -------
    numpy.ndarray
        The column-wise concatenation of the three inputs.
    """
    return np.concatenate((score_1, score_2, score_3), axis=1)

In [15]:
import csv

def write_list_to_file(list_data, list_filename, is_numpy):
    """Write either a numeric array or a single row of values to a CSV file.

    Parameters
    ----------
    list_data : array-like or iterable
        Data to write.
    list_filename : str or path-like
        Destination file (overwritten).
    is_numpy : bool
        True to save ``list_data`` with ``np.savetxt`` using a comma delimiter;
        False to write it as one CSV row with every field quoted.
    """
    # Numeric matrix: delegate to numpy and stop here
    if is_numpy:
        np.savetxt(list_filename, list_data, delimiter=",")
        return

    # Plain list: a single, fully quoted CSV row
    with open(list_filename, 'w', newline='') as out_file:
        csv.writer(out_file, quoting=csv.QUOTE_ALL).writerow(list_data)

In [16]:
def core_nlp(train_data, test_data, is_advanced_vocabulary, is_advanced_bag_of_words, min_freq_occurence, 
             max_freq_occurence, min_word_length, max_ngram_size, diag_number, 
             threshold_proba_0, threshold_proba_1):
    """Run one NLP pass: build the vocabulary, filter the corpora and score them.

    Parameters
    ----------
    train_data, test_data : tables holding the diagnostic description columns
        The vocabulary is built from ``train_data`` only.
    is_advanced_vocabulary : bool
        Selects ``construct_advanced_vocabulary`` (True) or
        ``construct_simple_vocabulary`` (False).
    is_advanced_bag_of_words : bool
        Selects ``advanced_bag_of_words`` (True) or ``simple_bag_of_words`` (False).
    min_freq_occurence, max_freq_occurence : float
        Frequency bounds forwarded to the vocabulary construction.
    min_word_length : int
        Forwarded to the cleaning steps.
    max_ngram_size : int
        Forwarded to ``advanced_bag_of_words``.
    diag_number : int
        Extraction mode: 0 for the three diagnostics combined, 1, 2 or 3 for one of them.
    threshold_proba_0, threshold_proba_1 : float
        Forwarded to ``construct_advanced_vocabulary``.

    Returns
    -------
    (vocabulary, score_train_data, score_test_data)
        The vocabulary and the score matrices of the train and test corpora.
    """
    # Vocabulary, always built from the train data
    if is_advanced_vocabulary:
        vocabulary = construct_advanced_vocabulary(
            train_data, threshold_proba_0, threshold_proba_1,
            min_freq_occurence, max_freq_occurence, diag_number, min_word_length)
    else:
        vocabulary = construct_simple_vocabulary(
            train_data, min_freq_occurence, max_freq_occurence, diag_number, min_word_length)

    # Keep only the vocabulary words in the train corpus, then in the test corpus
    filtered_train_data, filtered_test_data = (
        extract_and_filter_corpus_from_vocabulary(dataset, vocabulary, diag_number, min_word_length)
        for dataset in (train_data, test_data)
    )

    # Pick the scoring function once, then apply it to both corpora.
    # NOTE(review): the advanced Bag of Words is called separately for train
    # and test and fits its TF-IDF vectorizer on the documents it receives, so
    # the test scores use weights learned on the test set -- confirm this is intended.
    if is_advanced_bag_of_words:
        def vectorize(documents):
            return advanced_bag_of_words(documents, vocabulary, max_ngram_size)
    else:
        def vectorize(documents):
            return simple_bag_of_words(documents, vocabulary, min_word_length)

    score_train_data = vectorize(filtered_train_data)
    score_test_data = vectorize(filtered_test_data)

    return vocabulary, score_train_data, score_test_data

# NLP main function

The parameters of the NLP are:
    
"train_data" = The train dataset

"test_data" = The test dataset 

"min_freq_occurence" = Filters out words appearing less than "min_freq_occurence" (frequency between 0 and 1)

"max_freq_occurence" = Filters out words appearing more than "max_freq_occurence" (frequency between 0 and 1)

"train_data_filename" = Output train data file name 

"test_data_filename" = Output test data file name

"output_vocabulary_filename" = Output vocabulary file name 

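"is_corpus_combined" = True if we take {diag_1, diag_2, diag_3} combined to do a single NLP (extraction mode 0), or False if we run an NLP on each diag individually (extraction modes 1, 2 and 3) and then concatenate the three score matrices 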

"min_word_length" = Filters out (stemmed) words whose length is not strictly greater than "min_word_length"

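"max_ngram_size" = Maximum size of the n-grams used in the advanced (TF-IDF) Bag of Words 

"is_advanced_vocabulary" = True to build the vocabulary with the advanced extraction (based on the readmitted class of the documents containing each word), False to use the simple extraction (based on occurrence counts only)

"is_advanced_bag_of_words" = True to compute TF-IDF scores, False to compute binary presence scores

"threshold_proba_0", "threshold_proba_1" = Only used by the advanced vocabulary. For each word, the probability is the share of the documents containing it that are not in the class readmitted = 0. A word with probability below 0.5 is kept only if its probability is below "threshold_proba_0"; any other word is kept only if its probability is above "threshold_proba_1"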

In [17]:
def nlp(train_data, test_data, min_freq_occurence, max_freq_occurence, train_data_filename,
        test_data_filename, output_vocabulary_filename,
        is_corpus_combined, is_advanced_vocabulary, is_advanced_bag_of_words, min_word_length, max_ngram_size,
        threshold_proba_0, threshold_proba_1):
    """Run the NLP pipeline and write the vocabulary and the train/test scores to files.

    Parameters
    ----------
    train_data, test_data : tables holding the diagnostic description columns
        Inputs forwarded to ``core_nlp``.
    min_freq_occurence, max_freq_occurence : float
        Frequency bounds used when building the vocabulary.
    train_data_filename, test_data_filename : str
        Destination files of the train and test score matrices (numeric CSV).
    output_vocabulary_filename : str
        Destination file of the vocabulary (one quoted CSV row).
    is_corpus_combined : bool
        True: one NLP pass on the three diagnostics combined (mode 0).
        False: one pass per diagnostic (modes 1, 2, 3); the three score
        matrices are concatenated column-wise, and the three vocabularies are
        concatenated in the same order so they describe the score columns.
    is_advanced_vocabulary, is_advanced_bag_of_words : bool
        Select the advanced or simple variant of each step in ``core_nlp``.
    min_word_length : int
        Forwarded to the cleaning steps.
    max_ngram_size : int
        Forwarded to the advanced Bag of Words.
    threshold_proba_0, threshold_proba_1 : float
        Forwarded to the advanced vocabulary construction.

    Returns
    -------
    None
        Results are written to the three files and the two score shapes are printed.
    """
    def run_core_nlp(diag_number):
        # One NLP pass for the given extraction mode; all other settings are shared
        return core_nlp(train_data, test_data, is_advanced_vocabulary, is_advanced_bag_of_words,
                        min_freq_occurence, max_freq_occurence, min_word_length, max_ngram_size,
                        diag_number, threshold_proba_0, threshold_proba_1)

    # Run NLP in 2 different ways: the diagnostics combined or not
    if is_corpus_combined:
        (vocabulary, score_train_data, score_test_data) = run_core_nlp(0)
    else:
        # One pass per diagnostic, in the order 1, 2, 3
        results = [run_core_nlp(diag_number) for diag_number in (1, 2, 3)]

        # Concatenate the vocabularies in the same order as the score columns.
        # Previously the vocabulary was not written at all in this branch, so
        # output_vocabulary_filename was silently ignored.
        vocabulary = [word for (diag_vocabulary, _, _) in results for word in diag_vocabulary]

        # Combine the obtained scores
        score_train_data = combine_score(*[train_scores for (_, train_scores, _) in results])
        score_test_data = combine_score(*[test_scores for (_, _, test_scores) in results])

    # Write the vocabulary to file
    write_list_to_file(vocabulary, output_vocabulary_filename, False)

    # Write the train/test scores to file
    write_list_to_file(score_train_data, train_data_filename, True)
    write_list_to_file(score_test_data, test_data_filename, True)

    # Print the shapes so the number of columns of both sets can be compared
    print("Shape of the train and test datasets")
    print(score_train_data.shape)
    print(score_test_data.shape)