# Pandas Profiling

- Generates profile reports from a pandas DataFrame.
- The pandas `df.describe()` function is great, but a little basic for serious exploratory data analysis.
- `pandas_profiling` extends the pandas DataFrame with `df.profile_report()` for quick data analysis.
- [pandas-profiling-github](https://github.com/pandas-profiling/pandas-profiling)

In [None]:
# On JupyterHub, each user must run the installs below (uncomment the lines to execute them)
# ! pip3 install matplotlib --user
# ! pip3 install pandas==0.25.3 --user
# ! pip3 install numpy --user

In [2]:
# ! pip3 install pandas-profiling==2.4 --user

In [3]:
import numpy as np
import pandas as pd

# Read the height/weight CSV from the parent directory into a DataFrame
# (`df` stands for DataFrame); the bare `df` on the last line makes the
# notebook display it.
df = pd.read_csv('../height_weight.csv')
df

Unnamed: 0,sex,weight,height,repwt,repht
0,M,77,182,77.0,180.0
1,F,58,161,51.0,159.0
2,F,53,161,54.0,158.0
3,M,68,177,70.0,175.0
4,F,59,157,59.0,155.0
...,...,...,...,...,...
195,M,74,175,71.0,175.0
196,M,83,180,80.0,180.0
197,M,81,175,,
198,M,90,181,91.0,178.0


In [4]:
# Build a profile report for `df` and show it through notebook widgets.
# The trailing bare `profile` expression is what makes the notebook render it.
from pandas_profiling import ProfileReport

profile = ProfileReport(
    df,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)
profile

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [6]:
# Display the same report through HTML (rather than the widget view above)
# by calling the report's to_notebook_iframe() method.
profile.to_notebook_iframe()

In [None]:
# Save the report to the file my_report.html
profile.to_file(output_file="my_report.html")

In [None]:
# Clean up: remove the DataFrame and report names created by the previous
# example before moving on to the large-dataset example.
del df, profile

In [9]:
# For large datasets.
# Version 2.4 introduces minimal mode. This is a default configuration that
# disables expensive computations (such as correlations and dynamic binning).
# NOTE(review): numpy and pandas are already imported in an earlier cell, and
# ProfileReport is only available here because an earlier cell imported it
# from pandas_profiling -- run that cell first.
import numpy as np
import pandas as pd

# Read the real-estate sales CSV into a DataFrame named `large_dataset`,
# profile it with minimal=True, and display the resulting report.
large_dataset = pd.read_csv('../Real_Estate_Sales_2001-2017.csv')
profile_minimal = ProfileReport(large_dataset, minimal=True)
profile_minimal

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …



In [10]:
# Normal (non-minimal) profile of the same large dataset, for comparison
# with the minimal-mode report; the trailing `profile` displays it.
profile = ProfileReport(
    large_dataset,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)
profile

Tab(children=(HTML(value='<div id="overview-content" class="row variable spacing">\n    <div class="row">\n   …

