In [28]:
from pathlib import Path
import requests
import pandas as pd
import numpy as np
import pandas_profiling
from pandas_profiling.model.base import Variable
import base64
from urllib.parse import unquote
from IPython.core.display import SVG, HTML
from IPython.display import Markdown as md
import datetime

In [29]:
# Report date shown on the title banner, e.g. "March 05, 2021" (local time).
todays_date = f"{datetime.datetime.now():%B %d, %Y}"

In [30]:
# Load the pre-computed profile.
# The file 'config' (read from the current working directory) holds a single
# comma-separated line: "<data_src>,<input_filename>".

import pickle

with open('config','r') as f:
    # Exactly two fields are expected; surrounding whitespace is ignored.
    data_src, input_filename = (field.strip() for field in f.read().strip().split(','))

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only point the config at profile files produced by a trusted pipeline.
with open(input_filename, 'rb') as f:
    profile = pickle.load(f)

# The document reference is the name of the directory containing the pickle.
# Path(...).parent.name gives '' for a bare filename instead of raising IndexError.
docref = Path(input_filename).parent.name

rejected_variables = profile.get_rejected_variables(threshold=0.9)
description = profile.get_description()

In [31]:


# Title banner: customer name, report date, analysed file name and document reference.
CUSTOMER_NAME = "Onabora.org"

# Last path component of the data source (text after the final '/').
source_file_name = data_src.rsplit('/', 1)[-1]

title_html = f"""
<div class="jumbotron">
    <div class="container">
  <h1 class="display-3">Analysis Report</h1>
    <p class="lead">For {CUSTOMER_NAME} on {todays_date}.</p>
  <hr class="my-4">
  
<p class='title'>Analysis of the file [{source_file_name}]</p>
<p>Document Reference: {docref}</p>
<p>stephen@onabora.com</p>
</div>
</div>

"""

HTML(title_html)


In [32]:
# Static introduction section of the report.
HTML("""
<div class="container">
<h2>Introduction</h2>
<p>This report presents the results of an analysis of your provided data. The aim of the report is to help you understand what issues may need to be addressed in order to clean and prepare this data for machine learning tasks, to speed up your exploratory data analysis and to provide input to feature generation activities.</p>
<p>In the first section, key statistics are described to give an overview of the data provided. This will help you to get an overall feel for the data and any special computing resources that may be needed to work with this data. </p>
<p>Subsequent sections take a more in-depth look at each of the variables detected within the data. Explainers have been included to help you to understand any potential issues found in the analysis. The report highlights steps that should be considered as part of data cleaning and the preparation of data for machine learning tasks.</p>
<p>We have used a range of expert tools to automatically generate this basic report. While the results presented here are an accurate description of the data, they are not a substitute for knowledge of the domain and the processes by which the data was generated or manipulated. We would be happy to dig further into the specifics of your data set through our Standard or Premium Analysis packages.</p>
</div>
""")


In [None]:
# Human-readable labels for the variable type codes used by the profile.
TL = {
    'NUM':'Numeric',
    'CAT':'Categorical',
    'BOOL':'Boolean',
    'DATE':'Date',
    'URL':'url',
    'UNIQUE':'Text (Unique)',
    'REJECTED':'Rejected',
    'UNSUPPORTED':'Unsupported'

}

# One row per variable: name, readable type, number of missing observations.
# A type code without an entry in TL falls back to the raw code instead of None.
summary_rows = [
    {
        'name': v,
        'type': TL.get(info['type'].value, info['type'].value),
        'missing': info['n_missing'],
    }
    for v, info in description['variables'].items()
]

# Passing `columns` pins the column order (the renaming below relies on it) and
# keeps the 'type' column present when there are no variables, so sort_values
# cannot raise KeyError on an empty profile.
summary_table = pd.DataFrame(summary_rows, columns=['name', 'type', 'missing']).sort_values(by='type')
summary_table.columns=['Column Heading','Variable Type','Number of Missing Observations']
def format_bytes(size):
    """Scale a byte count to the largest fitting binary unit.

    Parameters
    ----------
    size : int or float
        Size in bytes.

    Returns
    -------
    tuple
        ``(scaled_size, unit)`` where ``unit`` is one of 'B', 'KB', 'MB', 'GB', 'TB'.
        Sizes below 1024 are returned unchanged with unit 'B'. Sizes beyond the
        terabyte range stay expressed in 'TB' rather than raising KeyError.
    """
    # 2**10 = 1024
    power = 2**10
    power_labels = ('', 'K', 'M', 'G', 'T')
    n = 0
    # `>=` so that exactly 1024 bytes reads as 1 KB; the bound on n stops at the
    # largest known label.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return size, power_labels[n]+'B'

# Headline figures taken from the table-level statistics of the profile.
table_stats = description['table']
IN_MEM_SIZE = table_stats['memsize']
IN_MEM_SIZE_FMT = format_bytes(IN_MEM_SIZE)
N_OBS = table_stats['n']
N_VAR = table_stats['nvar']

# Above 1e9 bytes in memory the report suggests large-data strategies.
if IN_MEM_SIZE > 1e9:
    MEMORY_ADVICE = """
You may wish to consider strategies for handling large data such as: 
 - working first with a sample of the data
 - deploy cloud servers with large memory allocations
 - streaming or chunking the data
 - compress the data by relabelling categorical classes
 - using a specific big data platform.
"""
else:
    MEMORY_ADVICE = "In most instances you should be able to work with this data without specialist big data computing resources"

# Thousands-separated count of missing cells, used by a later cell.
missing_cells = format(table_stats['n_cells_missing'], ',')

overview_html = f"""
<div class="container">
<h2>1. Overview</h2>

<p>The data provided contained {N_OBS:,} observations (rows) of {N_VAR:,} variables (columns). The variable types and missing observations are summarized in the following table:</p>
</div>
"""

HTML(overview_html)

In [None]:
# Table 1: variable types and missing-value counts, rendered as a Bootstrap table.
summary_columns = ['Column Heading','Variable Type','Number of Missing Observations']
summary_table_html = summary_table[summary_columns].to_html(index=False,classes='table table-hover table-striped')

HTML(
    """<div class="container"><div class='col-lg-6'>"""
    + summary_table_html
    + """ <p class='caption'><em>Table 1: summary of variable types and missing values</em></p></div></div>"""
)


In [None]:
# Commentary on missing cells and the in-memory footprint of the data.
# IN_MEM_SIZE_FMT is the (scaled size, unit) pair returned by format_bytes.
mem_amount, mem_unit = IN_MEM_SIZE_FMT

HTML(f"""
<div class="container">
<p>In total {missing_cells} cells contain missing data. Missing values can cause problems for Machine Learning algorithms and it is important to carefully consider what to do with them. Please see the recommendations at the end of the report for useful approaches to repairing missing values.</p>

<p>When loaded into memory the data takes around {mem_amount:0.1f} {mem_unit}. {MEMORY_ADVICE}</p>
<div class="page-break"></div>
<p>The resources required can depend on the Machine Learning approach that you intend to adopt. We would be happy to advise further on the resources that would be required on request.</p>
</div>
""")

In [None]:
# Static explainer for the "Data at a glance" section.
# The container div is closed at the end (the original opened a second <div>
# instead), and the list is no longer nested inside a <p> element.
HTML("""
<div class="container">
<h2>2. Data at a glance</h2>

<p>An extract of the first few rows of the data is shown in the table below. </p>

<p>A quick-look at this fragment can tell us a lot about whether the data makes sense. Have a look at the names and values for each variable. Are these all relevant to the problem you are trying to solve? It can save a lot of time to remove irrelevant observations before beginning feature engineering. Domain knowledge is needed here.</p>
 
<p>Good questions to ask yourself when looking at this extract:</p>
<ul>
<li> Are the columns those that you expected?</li>
<li> Do the values in those columns seem reasonable given the headings?</li>
<li> Are the values of the right order of magnitude?</li>
</ul>
<p>This exercise can catch mislabelling of columns or more serious errors in the data generation or collection operations. This can save a lot of wasted time and effort that would otherwise occur if we proceeded to feature modeling with incorrect data. We are happy to work with you to assess these questions as part of the Standard or Premium packages.</p>
</div>
""")

In [26]:
# Show the 'head' sample stored on the profile as a striped Bootstrap table.
head_table_html = profile.sample['head'].to_html(index=False, classes='table table-hover table-striped')
HTML(f'<div class="container">{head_table_html}</div>')

39,_0,_13,_2174,_40,_77516,_<=50K,_Adm-clerical,_Bachelors,_Male,_Never-married,_Not-in-family,_State-gov,_United-States,_White
50,0,13,0,13,83311,<=50K,Exec-managerial,Bachelors,Male,Married-civ-spouse,Husband,Self-emp-not-inc,United-States,White
38,0,9,0,40,215646,<=50K,Handlers-cleaners,HS-grad,Male,Divorced,Not-in-family,Private,United-States,White
53,0,7,0,40,234721,<=50K,Handlers-cleaners,11th,Male,Married-civ-spouse,Husband,Private,United-States,Black
28,0,13,0,40,338409,<=50K,Prof-specialty,Bachelors,Female,Married-civ-spouse,Wife,Private,Cuba,Black
37,0,14,0,40,284582,<=50K,Exec-managerial,Masters,Female,Married-civ-spouse,Wife,Private,United-States,White
49,0,5,0,16,160187,<=50K,Other-service,9th,Female,Married-spouse-absent,Not-in-family,Private,Jamaica,Black
52,0,9,0,45,209642,>50K,Exec-managerial,HS-grad,Male,Married-civ-spouse,Husband,Self-emp-not-inc,United-States,White
31,0,14,14084,50,45781,>50K,Prof-specialty,Masters,Female,Never-married,Not-in-family,Private,United-States,White
42,0,13,5178,40,159449,>50K,Exec-managerial,Bachelors,Male,Married-civ-spouse,Husband,Private,United-States,White
37,0,10,0,80,280464,>50K,Exec-managerial,Some-college,Male,Married-civ-spouse,Husband,Private,United-States,Black


In [None]:
from pandas_profiling.model.messages import MessageType

In [None]:
def message_header(mt):
    """Return the report heading for a warning type.

    `mt` is a MessageType member; types without a heading give an empty string.
    """
    headers = {
        MessageType.CONST: "Constant Values",
        MessageType.CORR: "Highly Correlated Variables",
        MessageType.RECODED: "Recoded Variables",
        MessageType.HIGH_CARDINALITY: "High Cardinality Variable",
        MessageType.UNSUPPORTED: "Unknown Data Types",
        MessageType.DUPLICATES: "Duplicate Observations",
        MessageType.SKEWED: "Skewed Variables",
        MessageType.MISSING: "Missing values",
        MessageType.INFINITE: "Infinite values",
        MessageType.ZEROS: "Zeros",
    }
    try:
        return headers[mt]
    except KeyError:
        return ''

def generate_message(message,cl=''):
    """Render one profiling warning as an HTML paragraph.

    Parameters
    ----------
    message : object exposing ``message_type``, ``values`` and ``column_name``
        The notebook passes the entries of ``description['messages']``.
    cl : str, optional
        CSS class(es) placed on the ``<p>`` element of variable-level messages.

    Returns
    -------
    str
        A ``<p>`` fragment. Duplicate-row warnings describe the whole dataset,
        so they carry neither the "The variable ..." prefix nor the ``cl`` class.
        Warning types without a dedicated sentence get a generic one instead of
        raising UnboundLocalError.
    """
    from html import escape  # local import keeps this cell self-contained

    mt = message.message_type
    vals = message.values

    if mt == MessageType.DUPLICATES:
        m = f"The dataset has <b>{ vals['n_duplicates']:,} ({ 100.*vals['p_duplicates']:.1f}%)</b> duplicate rows."
        return f"<p><span>{m}</span></p>"

    if mt == MessageType.CONST:
        # Data values such as "<=50K" must be escaped to keep the markup valid.
        m = f"has constant value <b>{ escape(str(vals['mode'])) }</b>."

    elif mt == MessageType.CORR:
        m = f"is highly correlated with <code>{ escape(str(vals['correlation_var'])) }</code> (ρ = {vals['correlation']:0.2f})"

    elif mt == MessageType.RECODED:
        m = f"is a recoding of <code>{ escape(str(vals['correlation_var'])) }</code>"

    elif mt == MessageType.HIGH_CARDINALITY:
        m = f"has a high cardinality: { vals['distinct_count']:,} distinct values."

    elif mt == MessageType.UNSUPPORTED:
        m = "is an unsupported type, check if it needs cleaning or further analysis."

    elif mt == MessageType.SKEWED:
        m = f"is highly skewed (γ1 = { vals['skewness']:.1f})."

    elif mt == MessageType.MISSING:
        m = f"has { vals['n_missing']:,} ({100.*vals['p_missing']:.1f}%) missing values."

    elif mt == MessageType.INFINITE:
        m = f"has { vals['n_infinite']:,} ({ 100.*vals['p_infinite']:.1f}%) infinite values."

    elif mt == MessageType.ZEROS:
        m = f"has { vals['n_zeros']:,} ({ 100.*vals['p_zeros']:.1f}%) zeros."

    else:
        # Any warning type not listed above still produces a readable sentence.
        label = escape(str(getattr(mt, 'name', mt)))
        m = f"raised a <b>{label}</b> warning."

    column = escape(str(message.column_name))
    # Both spans are closed, and a space separates the variable name from the sentence.
    return f"<p class='{cl}'><span>The variable <code>{column}</code></span> <span>{m}</span></p>"

# One row per profiling warning: column 0 holds the message object, column 1 the
# value of its message_type. `or []` covers a profile without a 'messages' entry,
# and the explicit `columns` keeps labels 0 and 1 present even with no warnings.
messages_df = pd.DataFrame(
    [(m, m.message_type.value) for m in (description.get('messages') or [])],
    columns=[0, 1],
)

def prep_messages(mt):
    """Build the warning alert box for one message type.

    Parameters
    ----------
    mt : MessageType member selecting which warnings to render.

    Returns
    -------
    str or None
        HTML for a Bootstrap warning alert containing every message of type
        `mt`, or None when no such message exists (including when the profile
        produced no messages at all).
    """
    # An empty frame may have no column 0, so check before indexing into it.
    if messages_df.empty:
        return None

    relevant_messages = [m for m in messages_df[0] if m.message_type == mt]
    if not relevant_messages:
        return None

    html_string=f"""
    <div class='alert alert-warning'>
                <p class='mb-0'>{''.join([generate_message(m) for m in relevant_messages])}</p>
              </div>
              
    """
    return html_string
    
# fillers = {
#     MessageType.MISSING: 
# 
# html_string=""
# for mt in [MessageType.DUPLICATES, MessageType.MISSING, MessageType.INFINITE, MessageType.ZEROS, MessageType.HIGH_CARDINALITY, MessageType.CONST, MessageType.UNSUPPORTED, MessageType.SKEWED, MessageType.RECODED, MessageType.CORR]:
#     relevant_messages = messages_df[messages_df[0].apply(lambda m: m.message_type==mt)][0]
#     if len(relevant_messages) > 0:
#         html_string+=f"<h2>{message_header(mt)}</h2><tr>{''.join([generate_message(m) for m in relevant_messages])}</tr>"

# HTML(html_string)

In [None]:
    
# Section 3 overview: one list entry per warning type with a count badge.
html_string='''<div class="container"><h2>3. Observations</h2>
<p>The analysis has highlighted the following issues with this dataset. The details follow.</p>
<div class="row"><div class="col-lg-12"><div class="bs-component"><div class="list-group">
'''

# An empty frame may have no column 0, so fall back to an empty list.
all_messages = list(messages_df[0]) if not messages_df.empty else []

for mt in [MessageType.DUPLICATES, MessageType.MISSING, MessageType.INFINITE, MessageType.ZEROS, MessageType.HIGH_CARDINALITY, MessageType.CONST, MessageType.UNSUPPORTED, MessageType.SKEWED, MessageType.RECODED, MessageType.CORR]:
    relevant_messages = [m for m in all_messages if m.message_type == mt]
    # Primary badge when warnings exist, success badge otherwise.
    if relevant_messages:
        badge_string='badge-primary'
    else:
        badge_string='badge-success'

    html_string+=f'''
                <li class="list-group-item list-group-item-action d-flex justify-content-between align-items-center">
                                  {message_header(mt)}

                  <span class="badge {badge_string} badge-pill">{len(relevant_messages)}</span>
                </li>
                '''

# Five divs were opened above (container, row, col, bs-component, list-group).
html_string += '</div></div></div></div></div>'
HTML(html_string)

In [None]:
# 3.1 Duplicate observations: warning box (or an all-clear box) plus explainer.

s=prep_messages(MessageType.DUPLICATES)

html_string='<div id="MessageType.DUPLICATES"><h3>3.1 Duplicate Observations</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No duplicate observations.</p>
                  </div>
                  """

# The trailing </div> closes the section div opened above.
html_string+="""
    <p>Duplicates can occur when you have combined data sets from multiple places, or have scraped data from the web or received data from clients/other departments. These can easily be removed but you should understand why they are being generated as it may highlight that a data join or scraping process is not working as intended.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')



In [None]:
# 3.2 Missing values: warning box (or an all-clear box) plus explainer.
s=prep_messages(MessageType.MISSING)

html_string='<div id="MessageType.MISSING"><h3>3.2 Missing Values</h3>'

if s:
    html_string+=f"""{s}
                    <p>The above variables contain a high percentage of missing values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with high numbers of missing values.</p>
                  </div>
                  """

html_string+="""
    <p>Missing values can cause problems for Machine Learning algorithms and it is important to carefully consider what to do with them. 
    The two most common techniques for dealing with missing data are either to delete the observations that contain missing data or to fill 
    in the missing values based on the other observations (e.g. filling in missing numerical values with the mean of all other observations).
    However, both of these approaches lead to loss of information.</p> 
    <p>Firstly, it is important to understand how these missing values came to be. If you are sure of the process by which the missing values are being created it may be possible to fill in the missing values with a correct 
    value. For example, imagine a process that records the number of sales of a product each day but does not make a record when there are no sales.
    This variable may have been joined with other data sources that do have a value for each and every day leading to a missing value in the data. 
    In this case, because the process by which missing values are produced is understood, it may be possible to fill in the missing values
    with the known value of zero.</p>
    <p>If the process in the real world produces missing values you need a strategy for coping with this and still making predictions.</p>
    <p>We recommend two strategies for handling this missing data. For categorical variables we recommend adding an additional class / category value to indicate
    that this value was missing. For missing numerical data we recommend a strategy of flagging and filling the values. If required we can create an 
    additional column in the data which acts as a flag of whether this numerical variable was present or missing. Once we have calculated these 
    labels we can fill in missing values to zero.</p>
    <p>We prefer this approach to either removing observations that contain missing values or simply imputing the missing values (for example 
    replacing missing values with the mean value of that variable) because in both of these cases information is lost.</p></div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [None]:
# 3.3 Infinite values: warning box (or an all-clear box) plus explainer.
s=prep_messages(MessageType.INFINITE)

html_string='<div id="MessageType.INFINITE"><h3>3.3 Infinite Values</h3>'

if s:
    html_string+=f"""{s}
                    <p>The above variables contain a high percentage of infinite values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with high numbers of infinite values.</p>
                  </div>
                  """

# A single <p> opens the explainer (the original emitted "<p><p>").
html_string+="""
<p>Infinite or not-a-number values may indicate missing or corrupt data. Check the values of this variable and understand how infinite values are generated. 
The guidelines for filling in missing data are also relevant here. </p></div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [None]:
# 3.4 Zeros: warning box (or an all-clear box) plus explainer.
s=prep_messages(MessageType.ZEROS)

html_string='<div id="MessageType.ZEROS"><h3>3.4 Zeros</h3>'

if s:
    html_string+=f"""{s}
                    <p>The observations of the above variables contain a high percentage of zero values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>There were no variables with unusually high numbers of zeros.</p>
                  </div>
                  """

# The explainer paragraph is closed before the section div.
html_string+="""
<p>A high number of zeros may indicate a structural error in data collection or an imbalanced training set. They can cause problems for machine learning models because either there won't be enough observations to sufficiently influence the model, or they can cause a model to be overfit to a small number of extreme cases. Appropriate strategies often involve combining or reassigning these classes as part of feature engineering.</p></div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [None]:
# 3.5 High cardinality: warning box (or an all-clear box) plus explainer.
s=prep_messages(MessageType.HIGH_CARDINALITY)
html_string='<div id="MessageType.HIGH_CARDINALITY"><h3>3.5 High Cardinality</h3>'
if s:
    html_string+=f"""{s}
                    <p>We detected that the above variables show high cardinality, i.e. we think they are categorical variables with a large number of possible categories. 
                    It could be that this is instead a text input such as the name or description of a product. It could also indicate that this variable is an ID variable that should be removed before using many ML algorithms.</p>
                    <p>If after ruling out the above the cardinality is still high you could look towards techniques such as grouping or mean-encoding this variable during 
                    feature generation. Your knowledge of the domain will be needed here.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>None of the variables exhibited high cardinality.</p>
                  </div>
                  """

# The explainer paragraph is closed before the section div.
html_string+="""
<p>Some Machine Learning Algorithms struggle with high cardinality categorical variables. You should always check the category values for common errors such as capitalisations and misspellings which can expand the number of 
apparent categories.</p></div>
"""

HTML('<div class="container">'+html_string+'</div>')



In [None]:
# 3.6 Highly correlated variables: findings box followed by a fixed explainer.
s = prep_messages(MessageType.CORR)

# Findings: the warning box when prep_messages returned one, else an all-clear box.
if s:
    corr_findings_html = s + '<p>We detected that the variables above show a high degree of correlation. </p>'
else:
    corr_findings_html = """
                  <div class='alert alert-success'>
                    <p class='mb-0'>No examples of highly correlated variables were found.</p>
                  </div>
                  """

corr_explainer_html = """
<p>Correlations show relationships between numeric variables in the data: how much one variable 'follows' another. A positive correlation means 
that as one feature increases, the other increases, while a negative correlation means that as one feature increases, the other decreases. Smaller 
values of correlation indicate a weaker relationship between the variables with 0 indicating no relationship.</p>
<p>It is important to understand correlations at this early stage because they may highlight problems with the data. For example extremely high values of correlation between two variables may indicate duplication of information. Similarly, a low correlation between two variables that would be expected to show some correlation might indicate an error in data production or extraction. 
</p></div>
"""

html_string = (
    '<div id="MessageType.CORR"><h3>3.6 Highly Correlated Variables</h3>'
    + corr_findings_html
    + corr_explainer_html
)

HTML('<div class="container">' + html_string + '</div>')

In [None]:
# Display the bare "unsupported type" alert (no heading) when one exists.
# NOTE(review): the "3.9 Unknown Variable Type" cell renders the same alert with
# a heading, so this cell looks like a leftover - confirm before removing it.
s = prep_messages(MessageType.UNSUPPORTED)
if s:
    display(HTML(s))

In [None]:
# 3.7 Skewed variables: warning box (or an all-clear box) plus explainer.
s=prep_messages(MessageType.SKEWED)
html_string='<div id="MessageType.SKEWED"><h3>3.7 Skewed Variables</h3>'
if s:
    html_string+=f'{s}<p>The variables above seem skewed. </p>'
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No examples of skewed variables were found.</p>
                  </div>
                  """

# Every paragraph is closed (the first one was left open in the original).
html_string+="""
<p>Skewed variables can cause problems for machine learning algorithms. The concept is similar to the problem of training a model on imbalanced categorical classes.</p>
<p>Skew is a measure of the degree of distortion from a normal distribution. A 'right-skewed' dataset characteristically has a 'long-tail' of a few very large values.</p>
<p>Skewness may invalidate modelling assumptions, or bias the training of machine learning models. An appropriate action may be to transform this variable in order to make its distribution more normal-like. 
The transformation will depend on the problem you are trying to model.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [None]:
# 3.8 Constant variables: warning box (or an all-clear box) plus explainer.

s=prep_messages(MessageType.CONST)

html_string='<div id="MessageType.CONST"><h3>3.8 Constant Variable</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No constant variables.</p>
                  </div>
                  """

# The trailing </div> closes the section div opened above.
html_string+="""
    <p>Constant valued variables will not bring useful information to a Machine Learning Algorithm, consider discarding this column from the data.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')


In [None]:

# 3.9 Unknown variable types: warning box (or an all-clear box).
s=prep_messages(MessageType.UNSUPPORTED)

html_string='<div id="MessageType.UNSUPPORTED"><h3>3.9 Unknown Variable Type</h3>'

if s:
    html_string+=f"""{s} <p>The type of the above variables could not be determined. Check the integrity of the values.</p>
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>Able to parse all variable types.</p>
                  </div>
                  """

# The trailing </div> closes the section div opened above.
html_string+="""
    <p></p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [None]:
# 3.10 Recoded categorical variables: warning box (or an all-clear box) plus explainer.
# Numbered 3.10 because 3.9 is already used by "Unknown Variable Type".
s=prep_messages(MessageType.RECODED)

html_string='<div id="MessageType.RECODED"><h3>3.10 Recoded Categorical Variable</h3>'

if s:
    html_string+=f"""{s}
                    """
else:
    html_string+="""
                  <div class='alert alert-success'>
                    <p class='mb-0'>No recoded variables.</p>
                  </div>
                  """

# The trailing </div> closes the section div opened above.
html_string+="""
    <p>Recoded variables are categoricals that are perfectly correlated. Consider discarding any recoded variables as they are duplicating information.</p>
</div>
"""

HTML('<div class="container">'+html_string+'</div>')

In [None]:

# Section 4 set-up: formatter helpers, intro markup, and the variable names in
# alphabetical order. The per-variable cards are appended to html_string below.
import random
from pandas_profiling.view.formatters import fmt, fmt_percent, fmt_numeric

html_string = """<h2>4. Individual Variable Analysis</h2>
            <p> This section presents detailed statistics of each variable in the data in turn. </p>
            <p> These statistics can be used to look for:
                <ul> 
                    <li>Potentially unwanted observations.</li>
                    <li>Structural errors, distributions that don't make sense, boundaries that don't make sense.</li>
                    <li>Outliers and possible measurement errors</li>
                    <li>The nature of missing data</li>
                </ul>
            </p>
        
            """

# Readable labels for selected statistic keys.
display_names = dict(
    distinct_count='No. distinct values',
    p_missing='% missing',
    p_infinite='% infinite',
    mean='Mean value',
    min='Minimum value',
    max='Maximum value',
    n_zeros='Number of zeros',
)

values = description

# Iterating the mapping yields its keys, i.e. the variable names.
vs = sorted(description['variables'])

# Build one Bootstrap card per variable: a summary block (header, warnings and
# key statistics) followed by a tab strip with one panel per profile section.
for v in vs:

    # Warnings raised for this variable, rendered in the "danger" text colour.
    # A list comprehension is used so that a profile with no messages works too.
    relevant_messages = [m for m in description['messages'] if m.column_name == v]
    messages_html_string = "".join(generate_message(m, cl='text-danger') for m in relevant_messages)

    values=description['variables'][v]

    if  values['type'] == Variable.TYPE_NUM:
        SUMMARY_HTML = f"""  
        <div class="row">
            <div class="col-sm-8">
                <h4 class="card-title">{v}</h4>
                <h6 class="card-subtitle mb-2 text-muted">{TL[values['type'].value]}</h6>
                {messages_html_string}
            </div>
            
            <div class='col-sm-4' id='minihistogram{ values['varid'] }' style='text-align:center;'>
                <img src='{ values['mini_histogram'] }' alt='Mini histogram' class='img-stats'>
            </div>
        </div>
        
       
       
        <div class="row">
            <table class='table stats'>
                <tr>
                    <th>Distinct count</th>
                    <td>{ values['distinct_count'] }</td>
                </tr>
                <tr>
                    <th>Unique (%)</th>
                    <td>{ fmt_percent(values['p_unique'])  }</td>
                </tr>
                <tr class='{ values['row_classes'].get('missing')}'>
                    <th>Missing (%)</th>
                    <td>{ fmt_percent(values['p_missing']) }</td>
                </tr>
                <tr class='{ values['row_classes'].get('missing') }'>
                    <th>Missing (n)</th>
                    <td>{ values['n_missing'] }</td>
                </tr>
                <tr class='{ values['row_classes'].get('p_infinite') }'>
                    <th>Infinite (%)</th>
                    <td>{ fmt_percent(values['p_infinite'])  }</td>
                </tr>
                <tr class='{ values['row_classes'].get('p_infinite') }'>
                    <th>Infinite (n)</th>
                    <td>{ values['n_infinite'] }</td>
                </tr>
                <tr>
                    <th>Mean</th>
                    <td>{  fmt_numeric(values['mean'],precision=4) }</td>
                </tr>
                <tr>
                    <th>Minimum</th>
                    <td>{  fmt_numeric(values['min'],precision=4) }</td>
                </tr>
                <tr>
                    <th>Maximum</th>
                    <td>{  fmt_numeric(values['max'],precision=4) }</td>
                </tr>
                <tr class='{ values['row_classes'].get('zeros') }'>
                    <th>Zeros (%)</th>
                    <td>{ fmt_percent (values['p_zeros']) }</td>
                </tr>
            </table>
        </div> 
        """

    elif values['type'] == Variable.TYPE_CAT:
        # Force the mini frequency table open (adds 'show' and aria-expanded).
        values['minifreqtable'] = values['minifreqtable'].replace('<div class="col-sm-6 collapse in"','<div class="col-sm-6 collapse in show" aria-expanded="true"')

        # NOTE(review): unlike the numeric summary, this one does not include
        # messages_html_string - confirm whether that omission is intended.
        SUMMARY_HTML = f"""    
        <div class='row'>
            <div class='col-sm-4'>
                <h4 class='card-title'><code>{v}</code></h4>
                <h6 class='card-subtitle mb-2 text-muted'>{TL[values['type'].value]}</h6>
                
            </div>
            { values['minifreqtable'] }
                
        
        </div>
        
        <div class='row'>
           
                <table class='table table-hover'>
                    <tr class='{ values['row_classes'].get('distinct_count') }'>
                        <th>Distinct count</th>
                        <td>{ fmt_numeric(values['distinct_count']) }</td>
                    </tr>
                    <tr>
                        <th>Unique (%)</th>
                        <td>{ fmt_percent (values['p_unique'])  }</td>
                    </tr>
                    <tr class='{values['row_classes'].get('missing') }'>
                        <th>Missing (%)</th>
                        <td>{ fmt_percent (values['p_missing']) }</td>
                    </tr>
                    <tr class='{ values['row_classes'].get('missing') }'>
                        <th>Missing (n)</th>
                        <td>{ fmt_numeric (values['n_missing']) }</td>
                    </tr>
                </table>
            
        </div>
        
        """

    else:
        # Other variable types get a minimal header. Without this branch
        # SUMMARY_HTML would be undefined for the first variable or silently
        # reuse the previous variable's summary.
        type_code = values['type'].value
        SUMMARY_HTML = f"""
        <div class="row">
            <div class="col-sm-12">
                <h4 class="card-title">{v}</h4>
                <h6 class="card-subtitle mb-2 text-muted">{TL.get(type_code, type_code)}</h6>
                {messages_html_string}
            </div>
        </div>
        """

    anchor_id = v
    sections = description['variables'][v]['sections']

    # Tab strip: one tab per section, the first one active.
    tabs_html = '<ul class="nav nav-tabs" role="tablist">'

    for i, (key, value) in enumerate(sections.items()):
        l='active' if i == 0 else ''
        tabs_html+=f"""
                    <li role='presentation' class='nav-item'>
                        <a href='#{anchor_id}-{key}'
                           class='nav-link {l}'
                           aria-controls='{anchor_id}-{key}'
                           role='tab'
                           data-toggle='tab'>{key}</a>
                    </li>
                    """

    tabs_html+='</ul>'

    # Tab panels: an image, a raw value, or re-gridded HTML content per section.
    tabs_html+='<div class="tab-content">'

    for i, (key, value) in enumerate(sections.items()):
        l='show active' if i == 0 else ''
        panel_content = ""
        if 'matrix' in value:
            panel_content=f'<img src="{value["matrix"]}" class="img-responsive center-img" alt="{key}">'
        elif 'value' in value:
            panel_content=f'{value["value"]}'
        elif 'content' in value:
            s=value['content']
            # Widen the two-column layout of the embedded content to 6+6.
            s=s.replace('<div class="col-sm-4 col-sm-offset-1">','<div class="col-sm-6">')
            s=s.replace('<div class="col-sm-4 col-sm-offset-2">','<div class="col-sm-6">')

            panel_content=f'<div class="row" style="margin-top:1em;">{s}</div>'

        tabs_html+=f"""  
            <div role="tabpanel" class="tab-pane fade {l}" id="{anchor_id}-{key}">
                <div class='row inpanel'>
                    <div class="col-sm-12">
                        {panel_content}
                    </div>
                </div>
                
            </div>
            """

    # Close the tab-content div here. The original left it open and relied on a
    # third closing </div> in the card template below to balance it.
    tabs_html+='</div>'

    html_string+=f"""
    <div class="card border-secondary mb-3" style="margin-bottom:2em">
        <div class="card-header">Variable Analysis Results: <code>{v}</code></div>
        <div class="card-body">
            <p>{SUMMARY_HTML}</p>

            {tabs_html}

        </div>
    </div>
    """

HTML('<div class="container">'+html_string+'</div>')

In [None]:
# Static closing summary of the report.
HTML("""
<div class="container">
<h2>Summary</h2>
<p>This report presented the results of an automated analysis performed on the data supplied.</p>
<p>Hopefully this report has sped up your exploratory data analysis and helped you to understand what issues may need to be addressed in order to clean and prepare this data for machine learning tasks.</p>
<p>A range of expert tools were used to generate this report. However there is no substitute for knowledge of the domain and the processes by which the data was generated or manipulated.</p>
<p>If you still have questions about your dataset we would be happy to dig further into the specifics of your data set through our Standard or Premium Analysis packages.</p>
<div class='report-footer'></div>
</div>

""")
