# Raw Data Quality Report

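In this notebook we build a data quality report piece by piece (column data types, present and missing value counts, unique value counts, and minimum and maximum values) and then combine the pieces into a single report at the end.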

In [32]:
# Import the libraries we need
import pandas as pd
import pyreadr

In [33]:
# Load the source CSV into a DataFrame, discard the columns that are not
# wanted in the report, and preview the first few rows
credit_card_default_raw = pd.read_csv(
    './Data/credit_card_default.csv'
).drop(
    columns=['u', 'train', 'test', 'validate', 'data.group']
)
credit_card_default_raw.head()

Unnamed: 0.1,Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,1,20000,2,2,1,24,2,2,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,2,120000,2,2,2,26,-1,2,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,3,90000,2,2,2,34,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,4,50000,2,2,1,37,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,5,50000,1,2,1,57,-1,0,-1,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## Available Columns

In [34]:
# List every column name of credit_card_default_raw as a one-column DataFrame
columns = pd.DataFrame(credit_card_default_raw.columns.tolist())
columns

Unnamed: 0,0
0,Unnamed: 0
1,ID
2,LIMIT_BAL
3,SEX
4,EDUCATION
5,MARRIAGE
6,AGE
7,PAY_0
8,PAY_2
9,PAY_3


## Data Types

In [35]:
# Record the dtype of every column, indexed by column name
data_types = credit_card_default_raw.dtypes.to_frame(name='Data Type')
data_types

Unnamed: 0,Data Type
Unnamed: 0,int64
ID,int64
LIMIT_BAL,int64
SEX,int64
EDUCATION,int64
MARRIAGE,int64
AGE,int64
PAY_0,int64
PAY_2,int64
PAY_3,int64


## Count of Missing Values in Each Column

In [36]:
# Tally the null entries per column: flag each cell as missing or not,
# then sum the flags down every column
missing_data_counts = (
    credit_card_default_raw
    .isna()
    .sum()
    .to_frame(name='Missing Values')
)
missing_data_counts

Unnamed: 0,Missing Values
Unnamed: 0,0
ID,0
LIMIT_BAL,0
SEX,0
EDUCATION,0
MARRIAGE,0
AGE,0
PAY_0,0
PAY_2,0
PAY_3,0


## Count of Present Values in Each Column

In [37]:
# Tally the non-null entries per column, indexed by column name
present_data_counts = credit_card_default_raw.count().to_frame(name='Present Values')
present_data_counts

Unnamed: 0,Present Values
Unnamed: 0,30000
ID,30000
LIMIT_BAL,30000
SEX,30000
EDUCATION,30000
MARRIAGE,30000
AGE,30000
PAY_0,30000
PAY_2,30000
PAY_3,30000


## Number of Unique Values Per-Column

In [38]:
# Create a DataFrame with the count of unique (non-null) values in each column.
# A single vectorized nunique() call replaces the per-column loop that grew the
# DataFrame one row at a time, and gives an integer-typed 'Unique Values'
# column indexed by column name.
unique_value_counts = credit_card_default_raw.nunique().to_frame(name='Unique Values')
unique_value_counts

Unnamed: 0,Unique Values
Unnamed: 0,30000
ID,30000
LIMIT_BAL,81
SEX,2
EDUCATION,7
MARRIAGE,4
AGE,56
PAY_0,11
PAY_2,11
PAY_3,11


## The Minimum Value In Each Column

In [39]:
# Create a DataFrame with the minimum value in each column.
# One vectorized column-wise reduction replaces the loop that appended a row
# per column; the result is indexed by column name.
# NOTE(review): assumes every remaining column is numeric (or otherwise
# orderable), as the previewed columns are -- confirm for the full dataset.
minimum_values = credit_card_default_raw.min().to_frame(name='Minimum Value')
minimum_values

Unnamed: 0,Minimum Value
Unnamed: 0,1
ID,1
LIMIT_BAL,10000
SEX,1
EDUCATION,0
MARRIAGE,0
AGE,21
PAY_0,-2
PAY_2,-2
PAY_3,-2


## The Maximum Value In Each Column

In [40]:
# Create a DataFrame with the maximum value in each column.
# One vectorized column-wise reduction replaces the loop that appended a row
# per column; the result is indexed by column name.
# NOTE(review): assumes every remaining column is numeric (or otherwise
# orderable), as the previewed columns are -- confirm for the full dataset.
maximum_values = credit_card_default_raw.max().to_frame(name='Maximum Value')
# Display a rounded view only; maximum_values itself is left unrounded
maximum_values.round()

Unnamed: 0,Maximum Value
Unnamed: 0,30000
ID,30000
LIMIT_BAL,1000000
SEX,2
EDUCATION,6
MARRIAGE,3
AGE,79
PAY_0,8
PAY_2,8
PAY_3,8


# Bring It All Together

In [41]:
# Combine the per-column summaries side by side, aligning rows on the index,
# then round the values in the combined report
data_quality_report = (
    data_types
    .join(present_data_counts)
    .join(missing_data_counts)
    .join(unique_value_counts)
    .join(minimum_values)
    .join(maximum_values)
    .round()
)

In [42]:
# Show a heading and the total row count, then render the report table
record_count = credit_card_default_raw.shape[0]
print("\nData Quality Report")
print("Total records: {}".format(record_count))
data_quality_report


Data Quality Report
Total records: 30000


Unnamed: 0,Data Type,Present Values,Missing Values,Unique Values,Minimum Value,Maximum Value
Unnamed: 0,int64,30000,0,30000,1,30000
ID,int64,30000,0,30000,1,30000
LIMIT_BAL,int64,30000,0,81,10000,1000000
SEX,int64,30000,0,2,1,2
EDUCATION,int64,30000,0,7,0,6
MARRIAGE,int64,30000,0,4,0,3
AGE,int64,30000,0,56,21,79
PAY_0,int64,30000,0,11,-2,8
PAY_2,int64,30000,0,11,-2,8
PAY_3,int64,30000,0,11,-2,8
