In [1]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

from tensorflow_metadata.proto.v0 import schema_pb2

# Record the library versions used for this run so the results can be reproduced.
tfdv_version = tfdv.__version__
tf_version = tf.__version__
print('TFDV Version: {}'.format(tfdv_version))
print('Tensorflow Version: {}'.format(tf_version))



TFDV Version: 1.14.0
Tensorflow Version: 2.16.1


In [2]:
import pickle

In [12]:
# Load the raw loan dataset; the path is relative to the notebook's directory.
raw_csv_path = '../data/loan_data.csv'
raw_df = pd.read_csv(raw_csv_path)

In [13]:
# Compute descriptive statistics over every column of the raw DataFrame.
raw_df_stats = tfdv.generate_statistics_from_dataframe(dataframe=raw_df)

In [14]:
# Render the interactive statistics view for the raw dataset.
tfdv.visualize_statistics(lhs_statistics=raw_df_stats)

In [15]:
# Derive a schema (types, presence, domains) from the raw-data statistics,
# then show it as a table.
schema1 = tfdv.infer_schema(statistics=raw_df_stats)
tfdv.display_schema(schema=schema1)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'loan_amnt',FLOAT,required,,-
'term',STRING,required,,'term'
'int_rate',FLOAT,required,,-
'installment',FLOAT,required,,-
'grade',STRING,required,,'grade'
'sub_grade',STRING,required,,'sub_grade'
'emp_title',BYTES,optional,single,-
'emp_length',STRING,optional,single,'emp_length'
'home_ownership',STRING,required,,'home_ownership'
'annual_inc',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'term',"' 36 months', ' 60 months'"
'grade',"'A', 'B', 'C', 'D', 'E', 'F', 'G'"
'sub_grade',"'A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5'"
'emp_length',"'1 year', '10+ years', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', '8 years', '9 years', '< 1 year'"
'home_ownership',"'ANY', 'MORTGAGE', 'NONE', 'OTHER', 'OWN', 'RENT'"
'verification_status',"'Not Verified', 'Source Verified', 'Verified'"
'loan_status',"'Charged Off', 'Fully Paid'"
'purpose',"'car', 'credit_card', 'debt_consolidation', 'educational', 'home_improvement', 'house', 'major_purchase', 'medical', 'moving', 'other', 'renewable_energy', 'small_business', 'vacation', 'wedding'"
'initial_list_status',"'f', 'w'"
'application_type',"'DIRECT_PAY', 'INDIVIDUAL', 'JOINT'"


In [17]:
# Check the raw-data statistics against the inferred schema.
# NOTE(review): schema1 was inferred from these same statistics, so this is a
# self-consistency check and is expected to surface few or no anomalies.
anomalies1 = tfdv.validate_statistics(
    statistics=raw_df_stats,
    schema=schema1,
)

# Show any anomalies that were detected.
tfdv.display_anomalies(anomalies=anomalies1)

#### Conclusion: Our analysis of the raw dataset found missing values in two of the 15 categorical features, 'emp_title' and 'emp_length', and in one of the 12 numerical features, 'mort_acc'. In response, we introduced a preprocessing phase to handle these missing values and to improve the overall consistency and reliability of our data processing pipeline. We also reviewed the dataset's schema to distinguish its numerical features from its categorical ones. This schema analysis gave us a comprehensive understanding of the data's structure and characteristics, which informs our subsequent data processing strategies and supports a robust analytical approach.

# Analysis of Processed Data

In [18]:
# Load the pipeline's post-outlier-removal artifact.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load artifacts produced by our own pipeline.
with open('../dags/data/processed/after_outlier.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [19]:
# Wrap the unpickled object in a DataFrame for TFDV.
# NOTE(review): the schema inferred from this frame lists an 'Unnamed: 0'
# column, which looks like a CSV index artifact from upstream -- confirm.
processed_df = pd.DataFrame(data=data)

In [20]:
# Compute descriptive statistics over every column of the processed DataFrame.
processed_df_stats = tfdv.generate_statistics_from_dataframe(dataframe=processed_df)

In [21]:
# Render the interactive statistics view for the processed dataset.
tfdv.visualize_statistics(lhs_statistics=processed_df_stats)

In [9]:
# Derive a schema (types, presence, domains) from the processed-data statistics.
schema2 = tfdv.infer_schema(statistics=processed_df_stats)

# Show the inferred schema as a table.
tfdv.display_schema(schema=schema2)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Unnamed: 0',INT,required,,-
'loan_amnt',FLOAT,required,,-
'term',INT,required,,-
'int_rate',FLOAT,required,,-
'installment',FLOAT,required,,-
...,...,...,...,...
'zipcode_4',INT,required,,-
'zipcode_7',INT,required,,-
'zipcode_8',INT,required,,-
'zipcode_9',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'loan_status',"'Charged Off', 'Fully Paid'"


In [10]:
# Check the processed-data statistics against the inferred schema.
# NOTE(review): schema2 was inferred from these same statistics, so this is a
# self-consistency check and is expected to surface few or no anomalies.
anomalies2 = tfdv.validate_statistics(
    statistics=processed_df_stats,
    schema=schema2,
)

# Show any anomalies that were detected.
tfdv.display_anomalies(anomalies=anomalies2)

#### Conclusion: After analyzing the preprocessed dataset, we observed that the missing values have been successfully addressed. Additionally, our feature engineering has resulted in the creation of new columns. Notably, the dataset now contains a single categorical column, the target 'loan_status', while every other column is numerical, with categorical features converted using techniques such as one-hot encoding (for example, the 'zipcode_*' columns). This transformation has enhanced the dataset's suitability for further analysis and modeling purposes.