In [1]:
import numpy as np
import pandas as pd
from ipynb.fs.full.nlp import clean_document
from ipynb.fs.full.nlp import extract_and_combine 
from ipynb.fs.full.nlp import compute_readmitted_0
from ipynb.fs.full.nlp import compute_word_to_doc_class

# Tool functions for visualization

In [2]:
import math

def compute_scatter_points_position(word_to_doc_class, frequency_scale, largest_vocabulary, nb_documents):
    """Compute the scatter-plot coordinates (probability, frequency) of every word.

    Every value of ``word_to_doc_class`` is a pair ``(doc_ids_0, doc_ids_1)``;
    only the lengths of the two collections are used:

    * probability = len(doc_ids_1) / (len(doc_ids_0) + len(doc_ids_1))
    * frequency   = log(1 + frequency_scale * total / nb_documents), afterwards
      divided by the largest frequency of the whole column.

    Args:
        word_to_doc_class: mapping whose values are ``(doc_ids_0, doc_ids_1)`` pairs.
        frequency_scale: multiplier applied to the document ratio before the log.
        largest_vocabulary: unused here; kept so existing callers keep working.
        nb_documents: divisor of the per-word document count.

    Returns:
        ``(probability, frequency)``: two float NumPy arrays of shape
        ``(n_words, 1)``, ordered like ``word_to_doc_class.values()``.
        Both have shape ``(0, 1)`` when the mapping is empty.

    Raises:
        ZeroDivisionError: if a word has no document id in either class, or if
            ``nb_documents`` is 0.
    """
    probability = []
    frequency = []
    for doc_ids_0, doc_ids_1 in word_to_doc_class.values():
        nb_class_1 = len(doc_ids_1)
        total = len(doc_ids_0) + nb_class_1
        # Use log scale for better display
        frequency.append(math.log(1 + frequency_scale * total / nb_documents))
        probability.append(nb_class_1 / total)

    # Turn both lists into column vectors
    nb_words = len(probability)
    probability = np.asarray(probability, dtype=float).reshape(nb_words, 1)
    frequency = np.asarray(frequency, dtype=float).reshape(nb_words, 1)

    # Normalize by the largest frequency. Nothing to normalize for an empty
    # vocabulary, and a largest value of 0 would only produce NaN (0/0).
    if nb_words:
        highest = frequency.max()
        if highest != 0:
            frequency = frequency / highest

    return probability, frequency

In [3]:
def compute_scatter_points_colors(largest_vocabulary, bow_vocabulary):
    """Choose the fill and border color of every scatter point.

    A word found in ``bow_vocabulary`` is drawn fully green; any other word
    gets a light grey fill with a darker grey border.

    Args:
        largest_vocabulary: iterable of words, one per scatter point.
        bow_vocabulary: container of the words to highlight.

    Returns:
        ``(color, border_color)``: two lists of hex color strings, in the
        same order as ``largest_vocabulary``.
    """
    in_vocabulary_color = '#%02x%02x%02x' % (0, 200, 0)
    missing_fill_color = '#efefef'
    missing_border_color = '#%02x%02x%02x' % (128, 128, 128)

    # A list/tuple would be scanned once per word; hash it once instead.
    # Anything else (set, dict, ...) is used untouched, and so is a sequence
    # holding unhashable items.
    known_words = bow_vocabulary
    if isinstance(bow_vocabulary, (list, tuple)):
        try:
            known_words = frozenset(bow_vocabulary)
        except TypeError:
            known_words = bow_vocabulary

    color = []
    border_color = []
    for word in largest_vocabulary:
        if word in known_words:
            color.append(in_vocabulary_color)
            border_color.append(in_vocabulary_color)
        else:
            color.append(missing_fill_color)
            border_color.append(missing_border_color)

    return color, border_color

In [4]:
# Bokeh Libraries
from bokeh.plotting import figure, show
from bokeh.io import output_file
from bokeh.models import ColumnDataSource, NumeralTickFormatter, Label, LabelSet

def animation_plot(frequency, probability, labels, color, border_color, per_readmitted_0):
    """Build the word scatter plot, write it to 'visualization.html' and show it.

    Args:
        frequency: y coordinate of every point.
        probability: x coordinate of every point.
        labels: per-point text referenced by the '@labels' tooltip.
        color: fill color of every point.
        border_color: outline color of every point.
        per_readmitted_0: height of the class-0 reference line and marker;
            the class-1 ones are drawn at ``1 - per_readmitted_0``.
    """
    # The figure is written to a standalone HTML file
    output_file('visualization.html', title='visualization')

    # One column per visual property of the points
    columns = {
        'frequency': frequency,
        'probability': probability,
        'labels': labels,
        'color': color,
        'border_color': border_color,
    }
    source = ColumnDataSource(data=columns)

    # Tools offered in the toolbar
    select_tools = [
        "hover", "crosshair", "pan", "wheel_zoom", "zoom_in", "zoom_out", "box_zoom",
        "undo", "redo", "reset", "tap,save", "box_select", "poly_select", "lasso_select",
    ]

    fig = figure(
        plot_height=800,
        plot_width=1500,
        x_axis_label="Number of documents classed 1 against all the documents in which the word appears",
        y_axis_label="Percentage of documents in which word appears",
        title='Animation for Word to Bag output with a display of the goof features',
        toolbar_location='below',
        tools=select_tools,
        tooltips="@labels",
        background_fill_color='#efefef',
    )

    # Two stand-alone circles that carry the legend entries
    legend_green = '#%02x%02x%02x' % (0, 150, 0)
    legend_grey = '#%02x%02x%02x' % (128, 128, 128)
    fig.circle(0.7, 0.8, legend="In NLP vocabulary", color=legend_green,
               line_color=legend_green, line_width=2, fill_alpha=0.4)
    fig.circle(0.7, 0.6, legend="Not in NLP vocabulary", color='#efefef',
               line_color=legend_grey, line_width=2, fill_alpha=0.4)

    # Style shared by every reference line and by both reference markers
    dashed = dict(line_width=3, line_dash='dashed')
    marker = dict(size=20, color="red", alpha=0.3, line_color="red", line_width=3)
    per_readmitted_1 = 1 - per_readmitted_0

    # Vertical line at probability 0.5
    fig.line([0.5, 0.5], [0, 1], color="red", **dashed)

    # Horizontal reference lines: class 1 on the right half, class 0 on the left half
    fig.line([0.5, 1], [per_readmitted_1, per_readmitted_1], color="red", **dashed)
    fig.line([0, 0.5], [per_readmitted_0, per_readmitted_0], color="red", **dashed)

    # Markers at the outer end of each horizontal reference line
    fig.circle(1, per_readmitted_1, **marker)
    fig.circle(0, per_readmitted_0, **marker)

    # Second vertical line at probability 0.5, same coordinates as the red one, in blue
    fig.line([0.5, 0.5], [0, 1], color="blue", **dashed)

    # y-axis ticks are formatted as percentages
    fig.yaxis[0].formatter = NumeralTickFormatter(format='00.0%')

    # One circle per word, styled from the columns of the data source
    fig.scatter(
        x='probability',
        y='frequency',
        radius=0.01,
        source=source,
        fill_color='color',
        line_color='border_color',
        selection_color='deepskyblue',
        line_width=2,
        fill_alpha=0.4,
    )

    # Visualize
    show(fig)

# Other tool function

In [5]:
from csv import reader

def load_csv_file(filename, is_numpy):
    """Load a comma-separated file, either as numbers or as one row of strings.

    Args:
        filename: path of the file to read.
        is_numpy: when true, the whole file is parsed with ``np.genfromtxt``
            using ',' as delimiter; otherwise only the first CSV row is read.

    Returns:
        A NumPy array when ``is_numpy`` is true, else the list of fields of
        the first row (an empty list when the file has no row at all).
    """
    if is_numpy:
        return np.genfromtxt(filename, delimiter=',')

    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for files handed to csv.reader.
    with open(filename, 'r', newline='') as read_obj:
        # Only the first row is needed, so the rest of the file is not parsed.
        return next(reader(read_obj), [])

# Main visualization function 

In [6]:
def visualization(bow_vocabulary_filename, bow_score_filename, data, min_word_length, frequency_scale,
                 radius_scale, display_threshold_0, display_threshold_1):
    """Plot every candidate word of the corpus and highlight the bag-of-words vocabulary.

    Args:
        bow_vocabulary_filename: CSV file whose first row lists the vocabulary.
        bow_score_filename: numeric CSV file of scores; it is loaded but not
            used by the rest of this function.
        data: input handed to ``compute_readmitted_0`` and ``extract_and_combine``.
        min_word_length: forwarded to ``compute_word_to_doc_class``.
        frequency_scale: forwarded to ``compute_scatter_points_position``.
        radius_scale: not used by this function.
        display_threshold_0: not used by this function.
        display_threshold_1: not used by this function.
    """
    # Vocabulary and training scores produced by the bag-of-words step
    vocabulary_in_use = load_csv_file(bow_vocabulary_filename, False)
    bow_scores = load_csv_file(bow_score_filename, True)

    # Class-0 information and the corpus, both derived from `data`
    # NOTE(review): presumably the ids of the class-0 documents; confirm in nlp
    class_0_documents = compute_readmitted_0(data)
    documents = extract_and_combine(data, 0)
    document_count = len(documents)

    # Every word that could be part of a vocabulary, with the documents it occurs in
    doc_classes_by_word = compute_word_to_doc_class(documents, min_word_length, class_0_documents)
    candidate_words = list(doc_classes_by_word.keys())

    # Position of each candidate word in the scatter plot
    probability, frequency = compute_scatter_points_position(
        doc_classes_by_word, frequency_scale, candidate_words, document_count)

    # Colors tell apart the words kept in the bag-of-words vocabulary
    fill_colors, outline_colors = compute_scatter_points_colors(candidate_words, vocabulary_in_use)

    # Share of class 0 among the documents, used for the reference lines
    share_readmitted_0 = 1.0 * len(class_0_documents) / document_count
    animation_plot(frequency, probability, candidate_words, fill_colors, outline_colors, share_readmitted_0)