## pandas-profiling Meteorites example
Source of data: https://data.nasa.gov/Space-Science/Meteorite-Landings/gh4g-9sfh

### Import libraries

In [5]:
import pandas as pd
import pandas_profiling
import numpy as np
import os

### Load and prepare example dataset
We add some synthetic variables to illustrate the capabilities of pandas-profiling.

In [3]:
# Download the NASA Meteorite Landings dataset, asking pandas to parse the
# 'year' column as dates while reading.
df = pd.read_csv(
    "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    parse_dates=['year'],
    encoding='UTF-8',
)

# Note: pandas' default nanosecond-resolution timestamps only cover roughly
# 1677-09-21 through 2262-04-11, so values that cannot be converted are coerced
# to NaT. For this analysis we keep only landings after 1880; the comparison
# below is False for NaT, so the coerced rows are dropped as well.
df['year'] = pd.to_datetime(df['year'], errors='coerce')
# Take an explicit copy so the column assignments below modify an independent
# frame rather than a slice of the original (avoids SettingWithCopyWarning).
df = df[df['year'] > '1880-01-01'].copy()

# Example: Constant variable
df['source'] = "NASA"

# Example: Boolean variable
df['boolean'] = np.random.choice([True, False], df.shape[0])

# Example: Mixed with base types
# NOTE: NumPy converts the list [1, "A"] to a string array before sampling, so
# the sampled values are the strings "1" and "A".
df['mixed'] = np.random.choice([1, "A"], df.shape[0])

# Example: Highly correlated variables
df['reclat_city'] = df['reclat'] + np.random.normal(scale=5, size=len(df))

# Example: Duplicate observations
# Copy the first ten rows and tag their names with " copy" so they repeat every
# other column value of an existing observation.
duplicates_to_add = df.iloc[0:10].copy()
duplicates_to_add['name'] = duplicates_to_add['name'] + " copy"

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same re-indexed result.
df = pd.concat([df, duplicates_to_add], ignore_index=True)

### Inline report without saving object

In [4]:
# Build the report and leave it as the cell's final expression: it is not bound
# to a name, and the notebook displays it inline below.
pandas_profiling.ProfileReport(df)

0,1
Number of variables,14
Number of observations,44941
Total Missing (%),3.4%
Total size in memory,4.5 MiB
Average record size in memory,105.0 B

0,1
Numeric,4
Categorical,5
Boolean,1
Date,1
Text (Unique),1
Rejected,2
Unsupported,0

First 3 values
LaPaz Icefield 03891
Lewis Cliff 88496
Grosvenor Mountains 95545

Last 3 values
Yamato 791591
Dhofar 580
Yamato 980588

Value,Count,Frequency (%),Unnamed: 3
Aarhus,1,0.0%,
Aarhus copy,1,0.0%,
Abajo,1,0.0%,
Abar al' Uj 001,1,0.0%,
Abbott,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
Österplana 062,1,0.0%,
Österplana 063,1,0.0%,
Österplana 064,1,0.0%,
Łowicz,1,0.0%,
Święcany,1,0.0%,

0,1
Distinct count,44931
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,26936
Minimum,2
Maximum,57458
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,2349
Q1,12751
Median,24407
Q3,40753
95-th percentile,54819
Maximum,57458
Range,57456
Interquartile range,28002

0,1
Standard deviation,16859
Coef of variation,0.6259
Kurtosis,-1.1661
Mean,26936
MAD,14493
Skewness,0.25782
Sum,1210509904
Variance,284220000
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
10,2,0.0%,
423,2,0.0%,
424,2,0.0%,
379,2,0.0%,
390,2,0.0%,
370,2,0.0%,
6,2,0.0%,
398,2,0.0%,
2,2,0.0%,
417,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2,2,0.0%,
4,1,0.0%,
5,1,0.0%,
6,2,0.0%,
7,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
57454,1,0.0%,
57455,1,0.0%,
57456,1,0.0%,
57457,1,0.0%,
57458,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Valid,44867
Relict,74

Value,Count,Frequency (%),Unnamed: 3
Valid,44867,99.8%,
Relict,74,0.2%,

0,1
Distinct count,456
Unique (%),1.0%
Missing (%),0.0%
Missing (n),0

0,1
L6,8162
H5,7054
L5,4734
Other values (453),24991

Value,Count,Frequency (%),Unnamed: 3
L6,8162,18.2%,
H5,7054,15.7%,
L5,4734,10.5%,
H6,4484,10.0%,
H4,4171,9.3%,
LL5,2754,6.1%,
LL6,2027,4.5%,
L4,1209,2.7%,
H4/5,424,0.9%,
CM2,413,0.9%,

0,1
Distinct count,12455
Unique (%),27.7%
Missing (%),0.2%
Missing (n),98
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,7901.7
Minimum,0
Maximum,60000000
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,1.1
Q1,7.0
Median,31.237
Q3,189.83
95-th percentile,3379.0
Maximum,60000000.0
Range,60000000.0
Interquartile range,182.83

0,1
Standard deviation,398810
Coef of variation,50.471
Kurtosis,13328
Mean,7901.7
MAD,14764
Skewness,105.43
Sum,354340000
Variance,159050000000
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1.3,171,0.4%,
1.2,139,0.3%,
1.4,138,0.3%,
2.1,130,0.3%,
2.4,126,0.3%,
1.6,120,0.3%,
0.5,118,0.3%,
1.1,116,0.3%,
3.8,114,0.3%,
1.5,111,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,19,0.0%,
0.01,2,0.0%,
0.013,1,0.0%,
0.02,1,0.0%,
0.03,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
23000000.0,1,0.0%,
24000000.0,1,0.0%,
28000000.0,1,0.0%,
30000000.0,1,0.0%,
60000000.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Found,44140
Fell,801

Value,Count,Frequency (%),Unnamed: 3
Found,44140,98.2%,
Fell,801,1.8%,

0,1
Distinct count,135
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,1881-01-01 00:00:00
Maximum,2101-01-01 00:00:00

0,1
Distinct count,12384
Unique (%),27.6%
Missing (%),16.0%
Missing (n),7199
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-40.315
Minimum,-87.367
Maximum,81.167
Zeros (%),14.3%

0,1
Minimum,-87.367
5-th percentile,-84.372
Q1,-76.717
Median,-71.5
Q3,0.0
95-th percentile,33.681
Maximum,81.167
Range,168.53
Interquartile range,76.717

0,1
Standard deviation,45.793
Coef of variation,-1.1359
Kurtosis,-1.4492
Mean,-40.315
MAD,43.3
Skewness,0.52396
Sum,-1521600
Variance,2097
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,6410,14.3%,
-71.5,4761,10.6%,
-84.0,3040,6.8%,
-72.0,1506,3.4%,
-79.68333,1130,2.5%,
-76.71667,680,1.5%,
-76.18333,539,1.2%,
-84.21667,263,0.6%,
-86.36667,226,0.5%,
-86.71667,217,0.5%,

Value,Count,Frequency (%),Unnamed: 3
-87.36667,4,0.0%,
-87.03333,3,0.0%,
-86.93333,3,0.0%,
-86.71667,217,0.5%,
-86.56667,17,0.0%,

Value,Count,Frequency (%),Unnamed: 3
70.73333000000001,1,0.0%,
72.68333,1,0.0%,
72.88333,1,0.0%,
76.53333,1,0.0%,
81.16667,1,0.0%,

0,1
Distinct count,14168
Unique (%),31.5%
Missing (%),16.0%
Missing (n),7199
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,62.014
Minimum,-165.43
Maximum,354.47
Zeros (%),13.8%

0,1
Minimum,-165.43
5-th percentile,-88.993
Q1,0.0
Median,35.667
Q3,157.17
95-th percentile,168.0
Maximum,354.47
Range,519.91
Interquartile range,157.17

0,1
Standard deviation,80.573
Coef of variation,1.2993
Kurtosis,-0.73065
Mean,62.014
MAD,67.799
Skewness,-0.18355
Sum,2340500
Variance,6492.1
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,6186,13.8%,
35.66667,4985,11.1%,
168.0,3040,6.8%,
26.0,1505,3.3%,
159.75,657,1.5%,
159.66666999999998,637,1.4%,
157.16666999999998,542,1.2%,
155.75,473,1.1%,
160.5,263,0.6%,
-70.0,226,0.5%,

Value,Count,Frequency (%),Unnamed: 3
-165.43333,9,0.0%,
-165.11667,17,0.0%,
-163.16666999999998,1,0.0%,
-162.55,1,0.0%,
-157.78333,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
175.0,185,0.4%,
175.73028,1,0.0%,
178.08333000000002,1,0.0%,
178.2,1,0.0%,
354.47333,1,0.0%,

0,1
Distinct count,16481
Unique (%),36.7%
Missing (%),16.0%
Missing (n),7199

0,1
"(0.0, 0.0)",6186
"(-71.5, 35.66667)",4761
"(-84.0, 168.0)",3040
Other values (16477),23755
(Missing),7199

Value,Count,Frequency (%),Unnamed: 3
"(0.0, 0.0)",6186,13.8%,
"(-71.5, 35.66667)",4761,10.6%,
"(-84.0, 168.0)",3040,6.8%,
"(-72.0, 26.0)",1505,3.3%,
"(-79.68333, 159.75)",657,1.5%,
"(-76.71667, 159.66667)",637,1.4%,
"(-76.18333, 157.16667)",539,1.2%,
"(-79.68333, 155.75)",473,1.1%,
"(-84.21667, 160.5)",263,0.6%,
"(-86.36667, -70.0)",226,0.5%,

0,1
Constant value,NASA

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.50241

0,1
True,22579
(Missing),22362

Value,Count,Frequency (%),Unnamed: 3
True,22579,50.2%,
(Missing),22362,49.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
1,22504
A,22437

Value,Count,Frequency (%),Unnamed: 3
1,22504,50.1%,
A,22437,49.9%,

0,1
Correlation,0.99412

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,source,boolean,mixed,reclat_city
0,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"(56.18333, 10.23333)",NASA,False,1,58.944314
1,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"(54.21667, -113.0)",NASA,True,1,51.470219
2,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"(16.88333, -99.9)",NASA,False,1,15.711996
3,Achiras,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"(-33.16667, -64.95)",NASA,False,1,-30.487834
4,Adhi Kot,379,Valid,EH4,4239.0,Fell,1919-01-01,32.1,71.8,"(32.1, 71.8)",NASA,False,A,33.360687


### Save report to file

In [6]:
# Keep a handle on the report so it can be both saved and displayed later.
pfr = pandas_profiling.ProfileReport(df)

# Write the report as 'example.html' under the current directory.
output_path = os.path.join(os.path.curdir, 'example.html')
pfr.to_file(output_path)

#### Print existing ProfileReport object inline

In [7]:
# Evaluating the existing report object as the cell's final expression displays
# it inline; no new ProfileReport is constructed here.
pfr

0,1
Number of variables,14
Number of observations,44941
Total Missing (%),3.4%
Total size in memory,4.5 MiB
Average record size in memory,105.0 B

0,1
Numeric,4
Categorical,5
Boolean,1
Date,1
Text (Unique),1
Rejected,2
Unsupported,0

First 3 values
LaPaz Icefield 03891
Lewis Cliff 88496
Grosvenor Mountains 95545

Last 3 values
Yamato 791591
Dhofar 580
Yamato 980588

Value,Count,Frequency (%),Unnamed: 3
Aarhus,1,0.0%,
Aarhus copy,1,0.0%,
Abajo,1,0.0%,
Abar al' Uj 001,1,0.0%,
Abbott,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
Österplana 062,1,0.0%,
Österplana 063,1,0.0%,
Österplana 064,1,0.0%,
Łowicz,1,0.0%,
Święcany,1,0.0%,

0,1
Distinct count,44931
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,26936
Minimum,2
Maximum,57458
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,2349
Q1,12751
Median,24407
Q3,40753
95-th percentile,54819
Maximum,57458
Range,57456
Interquartile range,28002

0,1
Standard deviation,16859
Coef of variation,0.6259
Kurtosis,-1.1661
Mean,26936
MAD,14493
Skewness,0.25782
Sum,1210509904
Variance,284220000
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
10,2,0.0%,
423,2,0.0%,
424,2,0.0%,
379,2,0.0%,
390,2,0.0%,
370,2,0.0%,
6,2,0.0%,
398,2,0.0%,
2,2,0.0%,
417,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2,2,0.0%,
4,1,0.0%,
5,1,0.0%,
6,2,0.0%,
7,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
57454,1,0.0%,
57455,1,0.0%,
57456,1,0.0%,
57457,1,0.0%,
57458,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Valid,44867
Relict,74

Value,Count,Frequency (%),Unnamed: 3
Valid,44867,99.8%,
Relict,74,0.2%,

0,1
Distinct count,456
Unique (%),1.0%
Missing (%),0.0%
Missing (n),0

0,1
L6,8162
H5,7054
L5,4734
Other values (453),24991

Value,Count,Frequency (%),Unnamed: 3
L6,8162,18.2%,
H5,7054,15.7%,
L5,4734,10.5%,
H6,4484,10.0%,
H4,4171,9.3%,
LL5,2754,6.1%,
LL6,2027,4.5%,
L4,1209,2.7%,
H4/5,424,0.9%,
CM2,413,0.9%,

0,1
Distinct count,12455
Unique (%),27.7%
Missing (%),0.2%
Missing (n),98
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,7901.7
Minimum,0
Maximum,60000000
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,1.1
Q1,7.0
Median,31.237
Q3,189.83
95-th percentile,3379.0
Maximum,60000000.0
Range,60000000.0
Interquartile range,182.83

0,1
Standard deviation,398810
Coef of variation,50.471
Kurtosis,13328
Mean,7901.7
MAD,14764
Skewness,105.43
Sum,354340000
Variance,159050000000
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1.3,171,0.4%,
1.2,139,0.3%,
1.4,138,0.3%,
2.1,130,0.3%,
2.4,126,0.3%,
1.6,120,0.3%,
0.5,118,0.3%,
1.1,116,0.3%,
3.8,114,0.3%,
1.5,111,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,19,0.0%,
0.01,2,0.0%,
0.013,1,0.0%,
0.02,1,0.0%,
0.03,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
23000000.0,1,0.0%,
24000000.0,1,0.0%,
28000000.0,1,0.0%,
30000000.0,1,0.0%,
60000000.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Found,44140
Fell,801

Value,Count,Frequency (%),Unnamed: 3
Found,44140,98.2%,
Fell,801,1.8%,

0,1
Distinct count,135
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,1881-01-01 00:00:00
Maximum,2101-01-01 00:00:00

0,1
Distinct count,12384
Unique (%),27.6%
Missing (%),16.0%
Missing (n),7199
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,-40.315
Minimum,-87.367
Maximum,81.167
Zeros (%),14.3%

0,1
Minimum,-87.367
5-th percentile,-84.372
Q1,-76.717
Median,-71.5
Q3,0.0
95-th percentile,33.681
Maximum,81.167
Range,168.53
Interquartile range,76.717

0,1
Standard deviation,45.793
Coef of variation,-1.1359
Kurtosis,-1.4492
Mean,-40.315
MAD,43.3
Skewness,0.52396
Sum,-1521600
Variance,2097
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,6410,14.3%,
-71.5,4761,10.6%,
-84.0,3040,6.8%,
-72.0,1506,3.4%,
-79.68333,1130,2.5%,
-76.71667,680,1.5%,
-76.18333,539,1.2%,
-84.21667,263,0.6%,
-86.36667,226,0.5%,
-86.71667,217,0.5%,

Value,Count,Frequency (%),Unnamed: 3
-87.36667,4,0.0%,
-87.03333,3,0.0%,
-86.93333,3,0.0%,
-86.71667,217,0.5%,
-86.56667,17,0.0%,

Value,Count,Frequency (%),Unnamed: 3
70.73333000000001,1,0.0%,
72.68333,1,0.0%,
72.88333,1,0.0%,
76.53333,1,0.0%,
81.16667,1,0.0%,

0,1
Distinct count,14168
Unique (%),31.5%
Missing (%),16.0%
Missing (n),7199
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,62.014
Minimum,-165.43
Maximum,354.47
Zeros (%),13.8%

0,1
Minimum,-165.43
5-th percentile,-88.993
Q1,0.0
Median,35.667
Q3,157.17
95-th percentile,168.0
Maximum,354.47
Range,519.91
Interquartile range,157.17

0,1
Standard deviation,80.573
Coef of variation,1.2993
Kurtosis,-0.73065
Mean,62.014
MAD,67.799
Skewness,-0.18355
Sum,2340500
Variance,6492.1
Memory size,351.2 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,6186,13.8%,
35.66667,4985,11.1%,
168.0,3040,6.8%,
26.0,1505,3.3%,
159.75,657,1.5%,
159.66666999999998,637,1.4%,
157.16666999999998,542,1.2%,
155.75,473,1.1%,
160.5,263,0.6%,
-70.0,226,0.5%,

Value,Count,Frequency (%),Unnamed: 3
-165.43333,9,0.0%,
-165.11667,17,0.0%,
-163.16666999999998,1,0.0%,
-162.55,1,0.0%,
-157.78333,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
175.0,185,0.4%,
175.73028,1,0.0%,
178.08333000000002,1,0.0%,
178.2,1,0.0%,
354.47333,1,0.0%,

0,1
Distinct count,16481
Unique (%),36.7%
Missing (%),16.0%
Missing (n),7199

0,1
"(0.0, 0.0)",6186
"(-71.5, 35.66667)",4761
"(-84.0, 168.0)",3040
Other values (16477),23755
(Missing),7199

Value,Count,Frequency (%),Unnamed: 3
"(0.0, 0.0)",6186,13.8%,
"(-71.5, 35.66667)",4761,10.6%,
"(-84.0, 168.0)",3040,6.8%,
"(-72.0, 26.0)",1505,3.3%,
"(-79.68333, 159.75)",657,1.5%,
"(-76.71667, 159.66667)",637,1.4%,
"(-76.18333, 157.16667)",539,1.2%,
"(-79.68333, 155.75)",473,1.1%,
"(-84.21667, 160.5)",263,0.6%,
"(-86.36667, -70.0)",226,0.5%,

0,1
Constant value,NASA

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.50241

0,1
True,22579
(Missing),22362

Value,Count,Frequency (%),Unnamed: 3
True,22579,50.2%,
(Missing),22362,49.8%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
1,22504
A,22437

Value,Count,Frequency (%),Unnamed: 3
1,22504,50.1%,
A,22437,49.9%,

0,1
Correlation,0.99412

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,source,boolean,mixed,reclat_city
0,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"(56.18333, 10.23333)",NASA,False,1,58.944314
1,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"(54.21667, -113.0)",NASA,True,1,51.470219
2,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"(16.88333, -99.9)",NASA,False,1,15.711996
3,Achiras,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"(-33.16667, -64.95)",NASA,False,1,-30.487834
4,Adhi Kot,379,Valid,EH4,4239.0,Fell,1919-01-01,32.1,71.8,"(32.1, 71.8)",NASA,False,A,33.360687
