In [1]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import re
import plotly.express as px

In [2]:
# In Jupyter Notebooks, one clean way of solving this problem is using markdown:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
pd.options.display.float_format = "{:.4f}".format
pd.options.display.max_rows = 500

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [3]:
df = pd.read_csv("dataset/titanic/train.csv")


In [4]:
def missing_explore(input_dataframe):
    """
        Show missing of dataframe columns
        Args:
            input_dataframe: pandas dataframe
        Returns:
            Table missing explore by columns
    """
    printmd("**Missing Explore**")
    naCount = input_dataframe.isnull().sum()
    total = len(input_dataframe)
    naPercent = (input_dataframe.isnull().sum()/len(input_dataframe)*100).round().map(lambda n: '{0:.1f} %'.format(n))
    return pd.DataFrame({'count_missing': naCount, 'count_total': len(input_dataframe),'percentile_missing':naPercent})

def hit_rate_explore(df, column):
    """
        Calculate percentage null and not null value of column in data frame
        Args:
            df: input dataframe
            column: name of column need calculate hit rate
        Returns:
            hitrate table of column
    """
    na_count = df[column].isnull().sum()
    not_na_count = df[column].notnull().sum()
    total = len(df)
    na_percentage = '{0:.1f} %'.format(na_count/total*100)
    not_na_percentage = '{0:.1f} %'.format(not_na_count/total*100)
    return pd.DataFrame({'Category': ['Have ' + column, 'Does not have ' + column],
                        'Count': [not_na_count, na_count],
                        'Total': [total, total],
                        'Percent': [not_na_percentage, na_percentage]})

def missing_explore_v2(input_dataframe):
    printmd("**Overral Stats**")
    # Train 
    total = len(input_dataframe)
    naCount = input_dataframe.isnull().sum()
    zeroCount = len(input_dataframe) - input_dataframe.fillna(1).astype(bool).sum()
    zeroPercent = (zeroCount/len(input_dataframe)*100).round().map(lambda n: '{0:.1f} %'.format(n))
    naPercent = (input_dataframe.isnull().sum()/len(input_dataframe)*100).round().map(lambda n: '{0:.1f} %'.format(n))
    uniqCount = input_dataframe.nunique()
    hitRate = (input_dataframe.notnull().sum()/len(input_dataframe)*100).round().map(lambda n: '{0:.1f} %'.format(n))
    return pd.DataFrame({'count_total': total, 'count_unique': uniqCount, 'count_zero':zeroCount,'percentile_zero':zeroPercent, 'count_missing': naCount,'percentile_missing':naPercent, 'hit_rate':hitRate})


In [5]:
missing_explore(df)

**Missing Explore**

Unnamed: 0,count_missing,count_total,percentile_missing
PassengerId,0,891,0.0 %
Survived,0,891,0.0 %
Pclass,0,891,0.0 %
Name,0,891,0.0 %
Sex,0,891,0.0 %
Age,177,891,20.0 %
SibSp,0,891,0.0 %
Parch,0,891,0.0 %
Ticket,0,891,0.0 %
Fare,0,891,0.0 %


In [6]:
missing_explore_v2(df)

**Overral Stats**

Unnamed: 0,count_total,count_unique,count_zero,percentile_zero,count_missing,percentile_missing,hit_rate
PassengerId,891,891,0,0.0 %,0,0.0 %,100.0 %
Survived,891,2,549,62.0 %,0,0.0 %,100.0 %
Pclass,891,3,0,0.0 %,0,0.0 %,100.0 %
Name,891,891,0,0.0 %,0,0.0 %,100.0 %
Sex,891,2,0,0.0 %,0,0.0 %,100.0 %
Age,891,88,0,0.0 %,177,20.0 %,80.0 %
SibSp,891,7,608,68.0 %,0,0.0 %,100.0 %
Parch,891,7,678,76.0 %,0,0.0 %,100.0 %
Ticket,891,681,0,0.0 %,0,0.0 %,100.0 %
Fare,891,248,15,2.0 %,0,0.0 %,100.0 %


In [7]:
hit_rate_explore(df, "Age")

Unnamed: 0,Category,Count,Total,Percent
0,Have Age,714,891,80.1 %
1,Does not have Age,177,891,19.9 %
