In [2]:
from pathlib import Path
import requests
import pandas as pd
import numpy as np
import pandas_profiling
from pandas_profiling.model.base import Variable
import base64
from urllib.parse import unquote
from IPython.core.display import SVG, HTML, Markdown
from IPython.display import Markdown as md
import datetime

In [3]:
# Report date in long form (month name, zero-padded day, four-digit year).
todays_date = f"{datetime.datetime.now():%B %d, %Y}"

In [4]:
# Load the pickled profile. The local file 'config' holds one comma-separated
# line with two fields: <original data source>,<path to the pickled profile>.
import pickle

with open('config','r') as f:
    # Strip each field so that "src, path" (a space after the comma) still yields
    # a usable path; a wrong number of fields raises ValueError as before.
    data_src, input_filename = (field.strip() for field in f.read().strip().split(','))

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only point 'config' at pickles produced by a trusted pipeline.
with open(input_filename, 'rb') as f:
    profile = pickle.load(f)

# The document reference is the name of the directory that contains the pickle.
docref = input_filename.split('/')[-2]

rejected_variables = profile.get_rejected_variables(threshold=0.9)
description = profile.get_description()

In [5]:


# Title banner for the report: customer, report date, analysed file and document ID.
# Only the last path component of data_src is shown as the file name.
# NOTE(review): customer and author are hard-coded in this cell — presumably they
# should be supplied per report; confirm.
CUSTOMER_NAME = "Onabora.org"
AUTH_NAME = "stephen@onabora.com"
HTML(f"""
<div class="jumbotron">
    <div class="container">
  <h1 class="display-3">Analysis Report</h1>
    <p class="lead">For {CUSTOMER_NAME} on {todays_date}.</p>
  <hr class="my-4">
  
<p class='title'>Analysis of the file [{data_src.split('/')[-1]}] by {AUTH_NAME}</p>
<p>Document ID: {docref}</p>

</div>
</div>

""")


In [6]:
# Static introduction: explains to the reader how the report is organised.
HTML("""
<div class="container">
<h2>Introduction</h2>
<p>Hello and thank you for sending your data for analysis. This report will help you prepare your dataset for machine learning algorithms and bring to your attention issues that could affect performance. </p>

<p>In the first part of the report, I will describe the data and the results of the analysis using a range of summary statistics. If you have been unable to examine the data yourself this will help you to understand what you are working with and whether any special computing resources will be needed.</p>

<p>After that, I step through any issues that have been identified during the analysis. For each one, I will explain why it may be important and what actions you should take next.</p>

<p>Finally, the technical appendix provides a detailed examination of each variable in the data in turn.</p>

<p>Solving some of the issues raised in this report will require detailed knowledge of how the data was generated or collected. I would be happy to help you dive deeper into this dataset with our tailored analysis service.</p>
</div>
""")


In [7]:
# Human-readable labels for the profiler's variable type codes.
TL = {
    'NUM':'Numeric',
    'CAT':'Categorical',
    'BOOL':'Boolean',
    'DATE':'Date',
    'URL':'url',
    'UNIQUE':'Text (Unique)',
    'REJECTED':'Rejected',
    'UNSUPPORTED':'Unsupported'
}

# One record per variable: its name, display type and missing-value count.
# TL.get() yields None for a type code that has no label.
summary_records = [
    {
        'name': v,
        'type': TL.get(description['variables'][v]['type'].value),
        'missing': description['variables'][v]['n_missing'],
    }
    for v in description['variables']
]

# The column order is fixed in the constructor and the headings are attached by
# name, not by position. This way a heading cannot land on the wrong data if the
# dict key order is not preserved, and a profile with no variables still has a
# 'type' column to sort on.
summary_table = (
    pd.DataFrame(summary_records, columns=['name', 'type', 'missing'])
    .sort_values(by='type')
    .rename(columns={
        'name': 'Column Heading',
        'type': 'Variable Type',
        'missing': 'Number of Missing Observations',
    })
)
def format_bytes(size):
    """Scale a byte count to the largest binary unit that keeps the value >= 1.

    Args:
        size: number of bytes (int or float).

    Returns:
        tuple: ``(value, unit)`` where ``unit`` is one of 'B', 'KB', 'MB', 'GB'
        or 'TB'. Values below 1024 are returned unchanged; scaled values are
        floats. Sizes beyond the terabyte range stay expressed in 'TB'.
    """
    power = 2**10  # 1024 bytes per step
    power_labels = ['', 'K', 'M', 'G', 'T']
    n = 0
    # ">=" so that exactly 1024 bytes reads as 1.0 KB; the bound on n stops the
    # loop at the largest known unit instead of running off the label table.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return size, power_labels[n]+'B'

# Table-level figures quoted in the overview paragraph.
IN_MEM_SIZE=description['table']['memsize']
IN_MEM_SIZE_FMT = format_bytes(IN_MEM_SIZE)
N_OBS = description['table']['n']
N_VAR = description['table']['nvar']

if (IN_MEM_SIZE > 1e9):
    # HTML collapses line breaks, so the strategies need a real <ul> to render as
    # a list. This fragment is dropped into an open <p> below: it closes that
    # paragraph before the list and reopens one after it.
    MEMORY_ADVICE = """
You may wish to consider strategies for handling large data such as:</p>
<ul>
<li>working first with a sample of the data</li>
<li>deploy cloud servers with large memory allocations</li>
<li>streaming or chunking the data</li>
<li>compress the data by relabelling categorical classes</li>
<li>using a specific big data platform.</li>
</ul>
<p>
"""
else:
    MEMORY_ADVICE = "In most instances you should be able to work with this data without specialist big data computing resources."

# Pre-formatted total of missing cells, with thousands separators.
missing_cells = f"{description['table']['n_cells_missing']:,}"

HTML(f"""
<div class="container">
<h2>Overview</h2>

<p>The data you provided contained {N_OBS:,} observations (rows) of {N_VAR:,} variables (columns).</p>
<p>When loaded into memory the data takes around {IN_MEM_SIZE_FMT[0]:0.1f} {IN_MEM_SIZE_FMT[1]}. {MEMORY_ADVICE}
However, the computing resources required can depend on which Machine Learning approach you intend to adopt. I would be happy to advise further on the resources that would be required on request.</p>
</div>
""")

In [8]:
# Table 1: name and detected type of every variable, rendered from summary_table
# as a Bootstrap-styled HTML table without the index column.
HTML(f'''
<div class="container">
    <p>The following table gives a summary of the variable types that were detected in the data. The detected variable types are important for two reasons. </p>

    <p>Firstly, it may highlight errors in data collection or extraction. For example, if you know that observations of a particular variable should be numerical but the detected variable type is not numerical then it is likely that some part of the data collection or extraction process is writing spurious information. It's important to understand how this is happening before proceeding. </p>

    <p>Secondly, many Machine Learning algorithms only work with variables of a certain type. For example, Neural Networks require numerical variables as input. If you have, say, Categorical variables in your data you will have to decide how best to encode these into numerical variables.</p>
    
    <p>Go through the list below and check everything is as you would expect.</p>
    
    <div class='col-lg-6'>{summary_table[['Column Heading','Variable Type']].to_html(index=False,classes='table table-hover table-striped')} 
    <p class='caption'><em>Table 1: summary of variable types.</em></p>
    </div>
</div>
''')


Column Heading,Variable Type
education,Categorical
marital-status,Categorical
native-country,Categorical
occupation,Categorical
race,Categorical
relationship,Categorical
sex,Categorical
workclass,Categorical
age,Numeric
capital-gain,Numeric


In [9]:
# Static guidance shown above the data extract (Table 2).
# The question list sits between paragraphs (a <ul> may not be nested in a <p>)
# and the container div is closed at the end of the cell.
HTML("""
<div class="container">

<p>It is also a useful sanity check to have a glance at the first few rows of the data which are shown in the table below. This exercise can catch mislabelling of columns or more serious errors in the data generation or collection operations.</p>

<p>Have a look at the names and values for each variable. Good questions to ask when looking at this extract are:</p>

<ul>
<li>Are the columns those that you expected?</li>
<li>Do the values in those columns seem reasonable given the headings?</li>
<li>Are numerical values about the right order-of-magnitude for the quantity they are supposed to be describing?</li>
</ul>

<p>An important question to ask at this stage is whether all of the variables are relevant to the problem you are trying to solve? Although Machine Learning and deeper analysis can be used to determine which features in the dataset are most relevant, a lot of time and effort can be saved by using your domain knowledge to remove irrelevant observations at this stage. </p>

</div>
""")

In [10]:
# Table 2: the 'head' sample stored on the profile, rendered as a Bootstrap-styled
# HTML table without the index column.
HTML(f'''
<div class="container">
 {profile.sample['head'].to_html(index=False, classes='table table-hover table-striped')}
 <p class='caption'><em>Table 2: First few rows of data at a glance.</em></p>
</div>
''')

age,capital-gain,capital-loss,education,education-num,fnlwgt,hours-per-week,marital-status,native-country,occupation,race,relationship,sex,workclass
39,2174,0,Bachelors,13,77516,40,Never-married,United-States,Adm-clerical,White,Not-in-family,Male,State-gov
50,0,0,Bachelors,13,83311,13,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Self-emp-not-inc
38,0,0,HS-grad,9,215646,40,Divorced,United-States,Handlers-cleaners,White,Not-in-family,Male,Private
53,0,0,11th,7,234721,40,Married-civ-spouse,United-States,Handlers-cleaners,Black,Husband,Male,Private
28,0,0,Bachelors,13,338409,40,Married-civ-spouse,Cuba,Prof-specialty,Black,Wife,Female,Private
37,0,0,Masters,14,284582,40,Married-civ-spouse,United-States,Exec-managerial,White,Wife,Female,Private
49,0,0,9th,5,160187,16,Married-spouse-absent,Jamaica,Other-service,Black,Not-in-family,Female,Private
52,0,0,HS-grad,9,209642,45,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Self-emp-not-inc
31,14084,0,Masters,14,45781,50,Never-married,United-States,Prof-specialty,White,Not-in-family,Female,Private
42,5178,0,Bachelors,13,159449,40,Married-civ-spouse,United-States,Exec-managerial,White,Husband,Male,Private


In [11]:
# Closing note of the overview: quotes missing_cells, the pre-formatted total of
# missing cells, then forces a page break before the next section.
HTML(f"""
<div class="container">
<p>Missing values can cause problems for Machine Learning algorithms and it is important to identify them. In total {missing_cells } cells contain missing data. We look at missing values in more detail in the next section of the report.</p>
</div>
<div class="page-break"></div>
""")

In [12]:
from pandas_profiling.model.messages import MessageType

In [13]:
def message_header(mt):
    """Return the display heading for a message type, or '' when none is defined."""
    headings = {
        MessageType.CONST: "Constant Values",
        MessageType.CORR: "Highly Correlated Variables",
        MessageType.RECODED: "Recoded Variables",
        MessageType.HIGH_CARDINALITY: "High Cardinality Variable",
        MessageType.UNSUPPORTED: "Unknown Data Types",
        MessageType.DUPLICATES: "Duplicate Observations",
        MessageType.SKEWED: "Skewed Variables",
        MessageType.MISSING: "Missing values",
        MessageType.INFINITE: "Infinite values",
        MessageType.ZEROS: "Zeros",
    }
    try:
        return headings[mt]
    except KeyError:
        # No heading registered for this type.
        return ''

def generate_message(message,cl=''):
    """Render one profiler message as an HTML paragraph.

    Args:
        message: object exposing ``message_type`` (compared against
            ``MessageType`` members), ``values`` (dict of figures for that
            message) and ``column_name``.
        cl: CSS class(es) placed on the ``<p>`` of column-level messages.

    Returns:
        str: ``<p>`` markup. Duplicate-row messages describe the whole dataset,
        so they carry no "The variable ..." prefix and ignore ``cl``.

    Column names and other text taken from the data are HTML-escaped, because
    they come from the analysed file and may contain ``<`` or ``&``.
    """
    from html import escape  # function-scope import keeps this cell self-contained

    kind = message.message_type
    values = message.values

    if kind == MessageType.DUPLICATES:
        m = f"The dataset has <b>{ values['n_duplicates']:,} ({ 100.*values['p_duplicates']:.1f}%)</b> duplicate rows."
        return f"<p><span>{m}</span></p>"

    if kind == MessageType.CONST:
        m = f"has constant value <b>{ escape(str(values['mode'])) }</b>."
    elif kind == MessageType.CORR:
        m = f"is highly correlated with <code>{ escape(str(values['correlation_var'])) }</code> (ρ = {values['correlation']:0.2f})"
    elif kind == MessageType.RECODED:
        m = f"is a recoding of <code>{ escape(str(values['correlation_var'])) }</code>"
    elif kind == MessageType.HIGH_CARDINALITY:
        m = f"has a high cardinality: { values['distinct_count']:,} distinct values."
    elif kind == MessageType.UNSUPPORTED:
        m = "is an unsupported type, check if it needs cleaning or further analysis."
    elif kind == MessageType.SKEWED:
        m = f"is highly skewed (γ1 = { values['skewness']:.1f})."
    elif kind == MessageType.MISSING:
        m = f"has { values['n_missing']:,} ({100.*values['p_missing']:.1f}%) missing values."
    elif kind == MessageType.INFINITE:
        m = f"has { values['n_infinite']:,} ({ 100.*values['p_infinite']:.1f}%) infinite values."
    elif kind == MessageType.ZEROS:
        m = f"has { values['n_zeros']:,} ({ 100.*values['p_zeros']:.1f}%) zeros."
    else:
        # A message type this report has no wording for: still render something
        # rather than failing on an unassigned variable.
        m = f"raised a warning of type <code>{ escape(str(getattr(kind, 'name', kind))) }</code>."

    # A space separates the two spans so the name does not run into the message,
    # and the second span is closed before the paragraph ends.
    column = escape(str(message.column_name))
    return f"<p class='{cl}'><span>The variable <code>{column}</code></span> <span>{m}</span></p>"

# Table of profiler messages: column 0 holds the message object, column 1 the
# value of its message type.
messages_df = pd.DataFrame(
    [(msg, msg.message_type.value) for msg in description.get('messages')]
)

def prep_messages(mt):
    """Render every profiler message of type ``mt`` inside one warning alert.

    Args:
        mt: the message type to select; compared with each message's
            ``message_type``.

    Returns:
        str or None: HTML for a Bootstrap ``alert-warning`` box holding one
        paragraph per matching message (via ``generate_message``), or ``None``
        when no message of that type exists.
    """
    # DataFrame.get returns the fallback when messages_df was built from an empty
    # message list and so has no column 0; indexing it directly raised KeyError.
    relevant_messages = [m for m in messages_df.get(0, []) if m.message_type == mt]

    if not relevant_messages:
        return None
    html_string=f"""
    <div class='alert alert-warning'>
                <p class='mb-0'>{''.join([generate_message(m) for m in relevant_messages])}</p>
              </div>
              
    """
    return html_string
    

In [14]:
    
# Summary list: one row per issue type, with a badge showing how many messages of
# that type the profiler raised (primary badge = issues found, success = none).
# The rows are <li> elements, so they are wrapped in a <ul>, and every div opened
# here is closed again at the end.
html_string='''<div class="container"><h2>Observations</h2>
<p>The issues raised by this analysis are summarised in the following table. For each issue type we then provide further details and recommendations. </p>
<div class="row"><div class="col-lg-12"><div class="bs-component"><ul class="list-group">
'''

for mt in [MessageType.DUPLICATES, MessageType.MISSING, MessageType.INFINITE, MessageType.ZEROS, MessageType.HIGH_CARDINALITY, MessageType.CONST, MessageType.UNSUPPORTED, MessageType.SKEWED, MessageType.RECODED, MessageType.CORR]:
    # Count with a plain loop: DataFrame.get returns the fallback when the message
    # table is empty and has no column 0, so the count is 0 rather than a KeyError.
    n_messages = sum(1 for m in messages_df.get(0, []) if m.message_type == mt)
    badge_string = 'badge-primary' if n_messages > 0 else 'badge-success'

    html_string+=f'''
                <li class="list-group-item list-group-item-action d-flex justify-content-between align-items-center">
                                  {message_header(mt)}

                  <span class="badge {badge_string} badge-pill">{n_messages}</span>
                </li>
                '''

html_string += '</ul></div></div></div></div>'
HTML(html_string)

In [15]:
# Section 3.1: duplicate observations.
# Shows the warning alert when duplicate-row messages exist, otherwise a success
# alert, followed by general advice. The section's own div is closed at the end.

s=prep_messages(MessageType.DUPLICATES)

html_string='<div id="MessageType.DUPLICATES"><h3>3.1 Duplicate Observations</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No duplicate observations.</p>
                  </div>
                  """

html_string+="""
    <p>Duplicates can occur when you have combined data sets from multiple places, or have scraped data from the web or received data from clients/other departments. These can easily be removed but you should understand why they are being generated as it may highlight a data join or scraping process is not working as intended.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')



In [16]:
# Section 3.2: missing values.
# Shows the warning alert when missing-value messages exist, otherwise a success
# alert, followed by general advice. The section's own div is closed at the end.
s=prep_messages(MessageType.MISSING)

html_string='<div id="MessageType.MISSING"><h3>3.2 Missing Values</h3>'

if s:
    html_string+=f"""{s}
                    <p>Some of the variables in the dataset contain a high percentage of missing values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with high numbers of missing values.</p>
                  </div>
                  """

html_string+="""

<p>Missing values can cause problems for Machine Learning algorithms and it is important to carefully consider what to do with them. The two most common techniques for dealing with missing data are either to delete the observations that contain missing data or to fill in the missing values based on the other observations (e.g. filling in missing numerical values with the mean of all other observations). However, both of these approaches lead to loss of information.</p>

<p>Firstly, it is important to understand how these missing values came to be. In some circumstances, if you can deduce exactly why the missing values exist, it may be possible to fill in the missing values with a correct value. For example, imagine a process for recording sales in a store. It turns out that the process records the number of sales of a product each day but does not make a record when there are no sales. Perhaps this sales record has been joined with other data that do contain observations for each and every day into a final table. In the column for the number of sales, we would see missing values corresponding to the occasions where no sales were recorded. In this case, because we understand the process by which missing values are produced we could correctly fill in the missing values with zeros.</p>

<p>Often it is not possible to deduce the correct replacements for missing values. In these cases, we still need a way to cope with missing values so that our Machine Learning models can still make predictions.</p>

<p>For categorical variables, I recommend adding an additional class/category value to indicate that this value was missing. For missing numerical data I recommend the technique of 'flag and fill'. In this approach, an additional column is created that can be used to indicate whether the numerical variable was present or missing. Once flagged in this way, we can fill in the missing values. This could be with zeros or with some other derived value such as the mean of the other data.</p>

<p>I prefer these approaches to either removing observations (rows) that contain missing values or, simply imputing the missing values without flagging that the information was missing. This is because in both of these cases information is lost.</p>

</div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [17]:
# Section 3.3: infinite values.
# Shows the warning alert when infinite-value messages exist, otherwise a success
# alert, followed by general advice.
s=prep_messages(MessageType.INFINITE)

html_string='<div id="MessageType.INFINITE"><h3>3.3 Infinite Values</h3>'

if s:
    html_string+=f"""{s}
                    <p>The above variables contain a high percentage of infinite values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with high numbers of infinite values.</p>
                  </div>
                  """

# A single opening <p> (the original doubled it, leaving an empty paragraph).
html_string+="""
<p>Infinite or not-a-number values may indicate missing or corrupt data. Check the values of this variable and understand how infinite values are generated. 
The guidelines for filling in missing data are also relevant here. </p></div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [18]:
# Section 3.4: zeros.
# Shows the warning alert when zero-count messages exist, otherwise a success
# alert, followed by general advice. The section's own div is closed at the end.
s=prep_messages(MessageType.ZEROS)

html_string='<div id="MessageType.ZEROS"><h3>3.4 Zeros</h3>'

if s:
    html_string+=f"""{s}
                    <p>The observations of the above variables contain a high percentage of zero values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with unusually high numbers of zeros.</p>
                  </div>
                  """

html_string+="""
<p>A high number of zeros may indicate a structural error in data collection or an imbalanced training set. They can cause problems for machine learning models because either there won't be enough observations to sufficiently influence the model, or they can cause a model to be overfitted to a small number of extreme cases. Appropriate strategies often involve combining or reassigning these classes as part of feature engineering.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [27]:
# Section 3.5: high cardinality.
# Shows the warning alert plus specific guidance when high-cardinality messages
# exist, otherwise a success alert; general advice follows in both cases.
# NOTE(review): this cell's execution count (27) is out of sequence with its
# neighbours — re-run the notebook top to bottom to confirm it works from a
# fresh kernel.
s=prep_messages(MessageType.HIGH_CARDINALITY)
html_string='<div id="MessageType.HIGH_CARDINALITY"><h3>3.5 High Cardinality</h3>'
if s:
    html_string+=f"""{s}
                    <p>We detected that the above variable(s) show high cardinality, i.e. we think they are categorical variables with a large number of possible categories. 
                    It could be that this is instead a text input such as the name or description of a product. It could also indicate that this variable is an ID variable that should be removed before using many ML algorithms.</p>
                    <p>If after ruling out the above the cardinality is still high you could look towards techniques such as grouping or mean-encoding this variable during 
                    feature generation. Your knowledge of the domain will be needed here.</p>
                    """
else:
    html_string+=f"""
                  <div class='alert alert-success'>
                    <p class='mb-0'>None of the variables exhibited high cardinality.</p>
                  </div>
                  """
    
html_string+=f"""
<p>Categorical variables that exhibit high cardinality - i.e. they have a high number of different values or categories - can pose problems for Machine Learning Algorithms. </p>
<p>They rarely make good features and so it may be prudent to consider dropping these columns from your data.</p>
<p>One question to ask before doing that is whether the number of categories can be reduced by sensible combining of categories. For example, you should always check the category values for common errors such as capitalisations and misspellings which can expand the number of 
apparent categories.</p></div>
""" 

HTML('<div class="container">'+html_string+'</div>')



In [20]:
# Section 3.6: highly correlated variables.
# Shows the warning alert when correlation messages exist, otherwise a success
# alert, followed by a general explanation of correlation.
s=prep_messages(MessageType.CORR)
html_string='<div id="MessageType.CORR"><h3>3.6 Highly Correlated Variables</h3>'
if s:
    html_string+=f'{s}<p>We detected that the variables above show a high degree of correlation. </p>'
else:
    html_string+=f"""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No examples of highly correlated variables were found.</p>
                  </div>
                  """
    
html_string+=f"""
<p>Correlations show relationships between numeric variables in the data. In other words: how much one variable 'follows' another. A positive correlation means that as one feature increases, the other increases. A negative correlation means that as one feature increases, the other decreases or vice-versa. Smaller values of correlation indicate a weaker relationship between the variables with a value of zero indicating no relationship.</p>

<p>Although in Machine Learning we often look for correlations between variables and the quantity we want to predict, correlations between input variables may highlight redundancy or other problems with the data. For example, extremely high values of correlation between two variables may indicate duplication of information. Conversely, a low correlation between two variables that - from your knowledge of the problem domain - you would have expected to show some correlation, might indicate an error in data production or extraction.</p>
</div>
""" 

HTML('<div class="container">'+html_string+'</div>')

In [21]:
# Displays the UNSUPPORTED alert on its own, with no heading or container.
# NOTE(review): the "3.9 Unknown Variable Type" cell renders the same alert
# again, so this looks like a leftover debugging cell that duplicates output in
# the report — confirm before removing.
s=prep_messages(MessageType.UNSUPPORTED)
if s: display(HTML(f"""{s}"""))

In [22]:
# Section 3.7: skewed variables.
# Shows the warning alert when skewness messages exist, otherwise a success
# alert, followed by general advice.
s=prep_messages(MessageType.SKEWED)
html_string='<div id="MessageType.SKEWED"><h3>3.7 Skewed Variables</h3>'
if s:
    html_string+=f'{s}<p>The variables above seem skewed. </p>'
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No examples of skewed variables were found.</p>
                  </div>
                  """

# The first paragraph is now closed, and the spelling slips in the customer-facing
# text are corrected.
html_string+="""
<p>Skewed variables can cause problems for machine learning algorithms. The concept is similar to the problem of training a model on imbalanced categorical classes.</p>
<p>Skew is a measure of the degree of distortion from a normal distribution. A 'right-skewed' dataset characteristically has a 'long-tail' of a few very large values.</p>
<p>Skewness may invalidate modelling assumptions, or bias the training of machine learning models. An appropriate action may be to transform this variable in order to make its distribution more normal-like. 
The transformation will depend on the problem you are trying to model.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [23]:
# Section 3.8: constant variables.
# Shows the warning alert when constant-value messages exist, otherwise a success
# alert, followed by general advice. The section's own div is closed at the end.

s=prep_messages(MessageType.CONST)

html_string='<div id="MessageType.CONST"><h3>3.8 Constant Variable</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No constant variables.</p>
                  </div>
                  """

html_string+="""
    <p>Constant valued variables will not bring useful information to a Machine Learning Algorithm, consider discarding this column from the data.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [24]:

# Section 3.9: variables whose type could not be determined.
# Shows the warning alert when unsupported-type messages exist, otherwise a
# success alert. The section's own div is closed at the end.
s=prep_messages(MessageType.UNSUPPORTED)

html_string='<div id="MessageType.UNSUPPORTED"><h3>3.9 Unknown Variable Type</h3>'

if s:
    html_string+=f"""{s} <p>The type of the above variables could not be determined. Check the integrity of the values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>Able to parse all variable types.</p>
                  </div>
                  """

html_string+="""
    <p></p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [25]:
# Section 3.10: recoded categorical variables.
# Shows the warning alert when recoding messages exist, otherwise a success
# alert, followed by general advice. The heading is numbered 3.10 because 3.9 is
# already used by the "Unknown Variable Type" section; the section's own div is
# closed at the end.
s=prep_messages(MessageType.RECODED)

html_string='<div id="MessageType.RECODED"><h3>3.10 Recoded Categorical Variable</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No recoded variables.</p>
                  </div>
                  """

html_string+="""
    <p>Recoded variables are categorical variables that are perfectly correlated with another variable. Consider discarding any recoded variables as they duplicate information.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [26]:

# Section 4: render one Bootstrap card per variable. Each card holds a summary
# table (layout depends on the variable type) followed by a tab strip with the
# detailed sections stored under description['variables'][<name>]['sections'].
#
# Names used from earlier cells: description, messages_df, generate_message, TL,
# Variable, HTML.
from pandas_profiling.view.formatters import fmt, fmt_percent, fmt_numeric
import random

html_string="""<h2>4. Individual Variable Analysis</h2>
            <p> This section presents detailed statistics of each variable in the data in turn. </p>
            <p> These statistics can be used to look for:
                <ul> 
                    <li>Potentially unwanted observations.</li>
                    <li>Structural errors, distributions that don't make sense, boundaries that don't make sense.</li>
                    <li>Outliers and possible measurement errors</li>
                    <li>The nature of missing data</li>
                </ul>
            </p>
        
            """

# Human-readable labels for statistic keys. Not referenced in this cell; kept
# because notebook globals may be read by other cells.
display_names = {
        'distinct_count': 'No. distinct values',
        'p_missing': '% missing',
        'p_infinite': '% infinite',
        'mean': 'Mean value',
        'min': 'Minimum value',
        'max': 'Maximum value',
        'n_zeros': 'Number of zeros'
    }

# Alphabetical order gives the appendix a stable, predictable layout.
vs = sorted(description['variables'])

for v in vs:
    values = description['variables'][v]

    # Warnings raised against this variable.
    # NOTE(review): the boolean mask is built from description['messages'] and
    # applied to messages_df, so the two are assumed to be row-aligned, with the
    # message objects in column 0 - confirm against the cell that builds messages_df.
    is_about_v = [m.column_name == v for m in description['messages']]
    messages_html_string = ""
    for m in messages_df[is_about_v][0]:
        messages_html_string += generate_message(m, cl='text-danger')

    if values['type'] == Variable.TYPE_NUM:
        SUMMARY_HTML = f"""  
        <div class="row">
            <div class="col-sm-8">
                <h4 class="card-title">{v}</h4>
                <h6 class="card-subtitle mb-2 text-muted">{TL[values['type'].value]}</h6>
                {messages_html_string}
            </div>
            
            <div class='col-sm-4' id='minihistogram{ values['varid'] }' style='text-align:center;'>
                <img src='{ values['mini_histogram'] }' alt='Mini histogram' class='img-stats'>
            </div>
        </div>
        
       
       
        <div class="row">
            <table class='table stats'>
                <tr>
                    <th>Distinct count</th>
                    <td>{ values['distinct_count'] }</td>
                </tr>
                <tr>
                    <th>Unique (%)</th>
                    <td>{ fmt_percent(values['p_unique'])  }</td>
                </tr>
                <tr class='{ values['row_classes'].get('missing')}'>
                    <th>Missing (%)</th>
                    <td>{ fmt_percent(values['p_missing']) }</td>
                </tr>
                <tr class='{ values['row_classes'].get('missing') }'>
                    <th>Missing (n)</th>
                    <td>{ values['n_missing'] }</td>
                </tr>
                <tr class='{ values['row_classes'].get('p_infinite') }'>
                    <th>Infinite (%)</th>
                    <td>{ fmt_percent(values['p_infinite'])  }</td>
                </tr>
                <tr class='{ values['row_classes'].get('p_infinite') }'>
                    <th>Infinite (n)</th>
                    <td>{ values['n_infinite'] }</td>
                </tr>
                <tr>
                    <th>Mean</th>
                    <td>{  fmt_numeric(values['mean'],precision=4) }</td>
                </tr>
                <tr>
                    <th>Minimum</th>
                    <td>{  fmt_numeric(values['min'],precision=4) }</td>
                </tr>
                <tr>
                    <th>Maximum</th>
                    <td>{  fmt_numeric(values['max'],precision=4) }</td>
                </tr>
                <tr class='{ values['row_classes'].get('zeros') }'>
                    <th>Zeros (%)</th>
                    <td>{ fmt_percent (values['p_zeros']) }</td>
                </tr>
            </table>
        </div> 
        """

    elif values['type'] == Variable.TYPE_CAT:
        # Force the mini frequency table open by adding the `show` class and
        # aria-expanded to its collapse wrapper. Work on a local copy so the
        # shared `description` object is not modified by rendering.
        minifreqtable = values['minifreqtable'].replace(
            '<div class="col-sm-6 collapse in"',
            '<div class="col-sm-6 collapse in show" aria-expanded="true"',
        )

        SUMMARY_HTML = f"""    
        <div class='row'>
            <div class='col-sm-4'>
                <h4 class='card-title'><code>{v}</code></h4>
                <h6 class='card-subtitle mb-2 text-muted'>{TL[values['type'].value]}</h6>
                {messages_html_string}
            </div>
            { minifreqtable }
                
        
        </div>
        
        <div class='row'>
           
                <table class='table table-hover'>
                    <tr class='{ values['row_classes'].get('distinct_count') }'>
                        <th>Distinct count</th>
                        <td>{ fmt_numeric(values['distinct_count']) }</td>
                    </tr>
                    <tr>
                        <th>Unique (%)</th>
                        <td>{ fmt_percent (values['p_unique'])  }</td>
                    </tr>
                    <tr class='{values['row_classes'].get('missing') }'>
                        <th>Missing (%)</th>
                        <td>{ fmt_percent (values['p_missing']) }</td>
                    </tr>
                    <tr class='{ values['row_classes'].get('missing') }'>
                        <th>Missing (n)</th>
                        <td>{ fmt_numeric (values['n_missing']) }</td>
                    </tr>
                </table>
            
        </div>
        
        """

    else:
        # No dedicated summary layout exists for other variable types. Emit a
        # minimal header rather than silently reusing the previous variable's
        # summary (or failing with NameError on the first iteration).
        SUMMARY_HTML = f"""
        <div class='row'>
            <div class='col-sm-12'>
                <h4 class='card-title'>{v}</h4>
                <h6 class='card-subtitle mb-2 text-muted'>{values['type'].value}</h6>
                {messages_html_string}
            </div>
        </div>
        """

    # Build the tab strip (nav items) and the matching panes in a single pass.
    # The first section is the one shown when the card is first rendered.
    anchor_id = v
    nav_items_html = ""
    panes_html = ""
    for i, (key, value) in enumerate(values['sections'].items()):
        is_first = (i == 0)
        nav_state = 'active' if is_first else ''
        pane_state = 'show active' if is_first else ''

        nav_items_html += f"""
                    <li role='presentation' class='nav-item'>
                        <a href='#{anchor_id}-{key}'
                           class='nav-link {nav_state}'
                           aria-controls='{anchor_id}-{key}'
                           role='tab'
                           data-toggle='tab'>{key}</a>
                    </li>
                    """

        # A section carries exactly one of: an image source ('matrix'),
        # ready-made HTML ('value'), or column-based HTML ('content').
        panel_content = ""
        if 'matrix' in value:
            panel_content = f'<img src="{value["matrix"]}" class="img-responsive center-img" alt="{key}">'
        elif 'value' in value:
            panel_content = f'{value["value"]}'
        elif 'content' in value:
            # Widen the two offset columns to half-width each so they fill the pane.
            content = value['content']
            content = content.replace('<div class="col-sm-4 col-sm-offset-1">', '<div class="col-sm-6">')
            content = content.replace('<div class="col-sm-4 col-sm-offset-2">', '<div class="col-sm-6">')
            panel_content = f'<div class="row" style="margin-top:1em;">{content}</div>'

        panes_html += f"""  
            <div role="tabpanel" class="tab-pane fade {pane_state}" id="{anchor_id}-{key}">
                <div class='row inpanel'>
                    <div class="col-sm-12">
                        {panel_content}
                    </div>
                </div>
                
            </div>
            """

    # Close the tab-content wrapper here, where it is opened, so the card
    # template below only has to close the elements it opens itself.
    tabs_html = (
        '<ul class="nav nav-tabs" role="tablist">' + nav_items_html + '</ul>'
        + '<div class="tab-content">' + panes_html + '</div>'
    )

    html_string += f"""
    <div class="card border-secondary mb-3" style="margin-bottom:2em">
        <div class="card-header">Variable Analysis Results: <code>{v}</code></div>
        <div class="card-body">
            <p>{SUMMARY_HTML}</p>
            
                {tabs_html}
             
        </div>
    </div>
    """

HTML('<div class="container">'+html_string+'</div>')

0,1
Distinct count,73
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,38.58
Minimum,17
Maximum,90
Zeros (%),0.0%

0,1
Minimum,17
5-th percentile,19
Q1,28
Median,37
Q3,48
95-th percentile,63
Maximum,90
Range,73
Interquartile range,20

0,1
Standard deviation,13.64043255
Coef of variation,0.3535471837
Kurtosis,-0.1661274596
Mean,38.58164676
MAD,11.18918162
Skewness,0.5587433694
Sum,1256257
Variance,186.0614002
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
36,898,2.8%,
31,888,2.7%,
34,886,2.7%,
23,877,2.7%,
35,876,2.7%,
33,875,2.7%,
28,867,2.7%,
30,861,2.6%,
37,858,2.6%,
25,841,2.6%,

Value,Count,Frequency (%),Unnamed: 3
17,395,1.2%,
18,550,1.7%,
19,712,2.2%,
20,753,2.3%,
21,720,2.2%,

Value,Count,Frequency (%),Unnamed: 3
90,43,0.1%,
88,3,< 0.1%,
87,1,< 0.1%,
86,1,< 0.1%,
85,3,< 0.1%,

0,1
Distinct count,119
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,1078
Minimum,0
Maximum,1e+05
Zeros (%),91.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,5013
Maximum,99999
Range,99999
Interquartile range,0

0,1
Standard deviation,7385.292085
Coef of variation,6.853152702
Kurtosis,154.7994379
Mean,1077.648844
MAD,1977.373437
Skewness,11.95384769
Sum,35089324
Variance,54542539.18
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
0,29849,91.7%,
15024,347,1.1%,
7688,284,0.9%,
7298,246,0.8%,
99999,159,0.5%,
5178,97,0.3%,
3103,97,0.3%,
4386,70,0.2%,
5013,69,0.2%,
8614,55,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0,29849,91.7%,
114,6,< 0.1%,
401,2,< 0.1%,
594,34,0.1%,
914,8,< 0.1%,

Value,Count,Frequency (%),Unnamed: 3
99999,159,0.5%,
41310,2,< 0.1%,
34095,5,< 0.1%,
27828,34,0.1%,
25236,11,< 0.1%,

0,1
Distinct count,92
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,87.3
Minimum,0
Maximum,4356
Zeros (%),95.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,4356
Range,4356
Interquartile range,0

0,1
Standard deviation,402.9602186
Coef of variation,4.615607584
Kurtosis,20.37680171
Mean,87.30382973
MAD,166.4620548
Skewness,4.594629122
Sum,2842700
Variance,162376.9378
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
0,31042,95.3%,
1902,202,0.6%,
1977,168,0.5%,
1887,159,0.5%,
1848,51,0.2%,
1485,51,0.2%,
2415,49,0.2%,
1602,47,0.1%,
1740,42,0.1%,
1590,40,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,31042,95.3%,
155,1,< 0.1%,
213,4,< 0.1%,
323,3,< 0.1%,
419,3,< 0.1%,

Value,Count,Frequency (%),Unnamed: 3
4356,3,< 0.1%,
3900,2,< 0.1%,
3770,2,< 0.1%,
3683,2,< 0.1%,
3004,2,< 0.1%,

0,1
HS-grad,10501
Some-college,7291
Bachelors,5355
Other values (13),9414

0,1
Distinct count,16
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0

Value,Count,Frequency (%),Unnamed: 3
HS-grad,10501,32.3%,
Some-college,7291,22.4%,
Bachelors,5355,16.4%,
Masters,1723,5.3%,
Assoc-voc,1382,4.2%,
11th,1175,3.6%,
Assoc-acdm,1067,3.3%,
10th,933,2.9%,
7th-8th,646,2.0%,
Prof-school,576,1.8%,

0,1
Max length,13
Mean length,9.433709038
Min length,4
Contains chars,True
Contains digits,True
Contains spaces,True
Contains non-words,True

0,1
Distinct count,16
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,10.08
Minimum,1
Maximum,16
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,5
Q1,9
Median,10
Q3,12
95-th percentile,14
Maximum,16
Range,15
Interquartile range,3

0,1
Standard deviation,2.572720332
Coef of variation,0.2552129916
Kurtosis,0.6234440748
Mean,10.08067934
MAD,1.90304819
Skewness,-0.3116758679
Sum,328237
Variance,6.618889907
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
9,10501,32.3%,
10,7291,22.4%,
13,5355,16.4%,
14,1723,5.3%,
11,1382,4.2%,
7,1175,3.6%,
12,1067,3.3%,
6,933,2.9%,
4,646,2.0%,
15,576,1.8%,

Value,Count,Frequency (%),Unnamed: 3
1,51,0.2%,
2,168,0.5%,
3,333,1.0%,
4,646,2.0%,
5,514,1.6%,

Value,Count,Frequency (%),Unnamed: 3
16,413,1.3%,
15,576,1.8%,
14,1723,5.3%,
13,5355,16.4%,
12,1067,3.3%,

0,1
Distinct count,21648
Unique (%),66.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,1.898e+05
Minimum,1.228e+04
Maximum,1.485e+06
Zeros (%),0.0%

0,1
Minimum,12285
5-th percentile,39460
Q1,117827
Median,178356
Q3,237051
95-th percentile,379682
Maximum,1484705
Range,1472420
Interquartile range,119224

0,1
Standard deviation,105549.9777
Coef of variation,0.5561749721
Kurtosis,6.218810978
Mean,189778.3665
MAD,77608.21854
Skewness,1.446980095
Sum,6179373392
Variance,1.114079779e+10
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
164190,13,< 0.1%,
203488,13,< 0.1%,
123011,13,< 0.1%,
113364,12,< 0.1%,
121124,12,< 0.1%,
126675,12,< 0.1%,
148995,12,< 0.1%,
123983,11,< 0.1%,
190290,11,< 0.1%,
126569,11,< 0.1%,

Value,Count,Frequency (%),Unnamed: 3
12285,1,< 0.1%,
13769,1,< 0.1%,
14878,1,< 0.1%,
18827,1,< 0.1%,
19214,1,< 0.1%,

Value,Count,Frequency (%),Unnamed: 3
1484705,1,< 0.1%,
1455435,1,< 0.1%,
1366120,1,< 0.1%,
1268339,1,< 0.1%,
1226583,1,< 0.1%,

0,1
Distinct count,94
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0
Mean,40.44
Minimum,1
Maximum,99
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,18
Q1,40
Median,40
Q3,45
95-th percentile,60
Maximum,99
Range,98
Interquartile range,5

0,1
Standard deviation,12.34742868
Coef of variation,0.3053463286
Kurtosis,2.916686796
Mean,40.43745585
MAD,7.58322751
Skewness,0.2276425368
Sum,1316684
Variance,152.4589951
Memory size,254.5 KiB

Value,Count,Frequency (%),Unnamed: 3
40,15217,46.7%,
50,2819,8.7%,
45,1824,5.6%,
60,1475,4.5%,
35,1297,4.0%,
20,1224,3.8%,
30,1149,3.5%,
55,694,2.1%,
25,674,2.1%,
48,517,1.6%,

Value,Count,Frequency (%),Unnamed: 3
1,20,0.1%,
2,32,0.1%,
3,39,0.1%,
4,54,0.2%,
5,60,0.2%,

Value,Count,Frequency (%),Unnamed: 3
99,85,0.3%,
98,11,< 0.1%,
97,2,< 0.1%,
96,5,< 0.1%,
95,2,< 0.1%,

0,1
Married-civ-spouse,14976
Never-married,10683
Divorced,4443
Other values (4),2459

0,1
Distinct count,7
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0

Value,Count,Frequency (%),Unnamed: 3
Married-civ-spouse,14976,46.0%,
Never-married,10683,32.8%,
Divorced,4443,13.6%,
Separated,1025,3.1%,
Widowed,993,3.0%,
Married-spouse-absent,418,1.3%,
Married-AF-spouse,23,0.1%,

0,1
Max length,22
Mean length,15.41405362
Min length,8
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
United-States,29170
Mexico,643
Philippines,198
Other values (38),1967
(Missing),583

0,1
Distinct count,42
Unique (%),0.1%
Missing (%),1.8%
Missing (n),583

Value,Count,Frequency (%),Unnamed: 3
United-States,29170,89.6%,
Mexico,643,2.0%,
Philippines,198,0.6%,
Germany,137,0.4%,
Canada,121,0.4%,
Puerto-Rico,114,0.4%,
El-Salvador,106,0.3%,
India,100,0.3%,
Cuba,95,0.3%,
England,90,0.3%,

0,1
Max length,27
Mean length,13.31175332
Min length,3
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
Prof-specialty,4140
Craft-repair,4099
Exec-managerial,4066
Other values (11),18413

0,1
Distinct count,15
Unique (%),< 0.1%
Missing (%),5.7%
Missing (n),1843

Value,Count,Frequency (%),Unnamed: 3
Prof-specialty,4140,12.7%,
Craft-repair,4099,12.6%,
Exec-managerial,4066,12.5%,
Adm-clerical,3770,11.6%,
Sales,3650,11.2%,
Other-service,3295,10.1%,
Machine-op-inspct,2002,6.1%,
Transport-moving,1597,4.9%,
Handlers-cleaners,1370,4.2%,
Farming-fishing,994,3.1%,

0,1
Max length,18
Mean length,13.25849943
Min length,3
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
White,27816
Black,3124
Asian-Pac-Islander,1039
Other values (2),582

0,1
Distinct count,5
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0

Value,Count,Frequency (%),Unnamed: 3
White,27816,85.4%,
Black,3124,9.6%,
Asian-Pac-Islander,1039,3.2%,
Amer-Indian-Eskimo,311,1.0%,
Other,271,0.8%,

0,1
Max length,19
Mean length,6.53898836
Min length,6
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
Husband,13193
Not-in-family,8305
Own-child,5068
Other values (3),5995

0,1
Distinct count,6
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0

Value,Count,Frequency (%),Unnamed: 3
Husband,13193,40.5%,
Not-in-family,8305,25.5%,
Own-child,5068,15.6%,
Unmarried,3446,10.6%,
Wife,1568,4.8%,
Other-relative,981,3.0%,

0,1
Max length,15
Mean length,10.11974448
Min length,5
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
Male,21790
Female,10771

0,1
Distinct count,2
Unique (%),< 0.1%
Missing (%),0.0%
Missing (n),0

Value,Count,Frequency (%),Unnamed: 3
Male,21790,66.9%,
Female,10771,33.1%,

0,1
Max length,7
Mean length,5.661589018
Min length,5
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True

0,1
Private,22696
Self-emp-not-inc,2541
Local-gov,2093
Other values (5),3395
(Missing),1836

0,1
Distinct count,9
Unique (%),< 0.1%
Missing (%),5.6%
Missing (n),1836

Value,Count,Frequency (%),Unnamed: 3
Private,22696,69.7%,
Self-emp-not-inc,2541,7.8%,
Local-gov,2093,6.4%,
State-gov,1298,4.0%,
Self-emp-inc,1116,3.4%,
Federal-gov,960,2.9%,
Without-pay,14,< 0.1%,
Never-worked,7,< 0.1%,
(Missing),1836,5.6%,

0,1
Max length,17
Mean length,8.920794816
Min length,3
Contains chars,True
Contains digits,False
Contains spaces,True
Contains non-words,True


In [None]:
# Closing section of the report: static text only, nothing data-dependent.
HTML("""
<div class="container">
<h2>Summary</h2>
<p>This report presented the results of an automated analysis performed on the data supplied.</p>
<p>Hopefully this report has sped up your exploratory data analysis and helped you to understand what issues may need to be addressed in order to clean and prepare this data for machine learning tasks.</p>
<p>A range of expert tools were used to generate this report. However, there is no substitute for knowledge of the domain and the processes by which the data was generated or manipulated.</p>
<p>If you still have questions about your dataset we would be happy to dig further into the specifics of your data set through our Standard or Premium Analysis packages.</p>
<div class='report-footer'></div>
</div>

""")
