# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import gc
import os

import sys
sys.path.insert(0, '../')
import scoring
#import importlib
#importlib.reload(scoring.data_exploration)

# Import Datasets

## Static Datasets

In [None]:
# Load DemoVct_base.csv. With keep_default_na=False and na_values=[''],
# only empty fields become NaN; strings such as 'NA' are kept as text.
df_main = pd.read_csv(
    'demo_data/DemoVct_base.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    low_memory=False,
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Load DemoVct_static.csv; only empty fields are parsed as NaN
# (the default NA markers are disabled via keep_default_na=False).
df_static = pd.read_csv(
    'demo_data/DemoVct_static.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    low_memory=False,
    keep_default_na=False,
    na_values=[''],
)

## Dynamic Datasets

In [None]:
# Load DemoVct_persons.csv; only empty fields are parsed as NaN
# (the default NA markers are disabled via keep_default_na=False).
df_persons_level1 = pd.read_csv(
    'demo_data/DemoVct_persons.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    low_memory=False,
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Load DemoVct_prevAppl.csv; only empty fields are parsed as NaN
# (the default NA markers are disabled via keep_default_na=False).
df_prevappl_level1 = pd.read_csv(
    'demo_data/DemoVct_prevAppl.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    low_memory=False,
    keep_default_na=False,
    na_values=[''],
)

### Example of Loading Datasets with Multiple Parts

In [None]:
# Read the three document part files with identical parser settings and bind
# each result to its own name (the parts are combined in a later cell).
(df_documents_level1_part1,
 df_documents_level1_part2,
 df_documents_level1_part3) = (
    pd.read_csv(part_path, sep=',', decimal='.', encoding='utf-8',
                low_memory=False, keep_default_na=False, na_values=[''])
    for part_path in ('demo_data/DemoVct_documents_p1.csv',
                      'demo_data/DemoVct_documents_p2.csv',
                      'demo_data/DemoVct_documents_p3.csv')
)

In [None]:
# Part frames in the order they are to be stacked by pd.concat.
dataframes_concat = [
    df_documents_level1_part1,
    df_documents_level1_part2,
    df_documents_level1_part3,
]

In [None]:
# Stack the part frames into a single table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each part contributes its own 0-based labels
# and the combined frame ends up with duplicate index values.
df_documents_level1 = pd.concat(dataframes_concat, ignore_index=True)

In [None]:
# Release the per-part frames now that they have been combined.
# Deleting a loop variable only unbinds that temporary name, so the names that
# actually keep the frames alive (the list and the three part variables) are
# removed explicitly before asking the collector to run.
del dataframes_concat
del df_documents_level1_part1, df_documents_level1_part2, df_documents_level1_part3
gc.collect()

## Dictionaries of All Datasets

In [None]:
# Name -> DataFrame lookups. The dictionaries hold references to the frames
# loaded above (no copies), so in-place changes made through them affect the
# original variables as well.
dataframes_static = dict(
    df_main=df_main,
    df_static=df_static,
)
dataframes_dynamic = dict(
    df_persons_level1=df_persons_level1,
    df_prevappl_level1=df_prevappl_level1,
    df_documents_level1=df_documents_level1,
)
# Union of both groups: static entries first, then dynamic ones.
dataframes_all = {**dataframes_static, **dataframes_dynamic}

# Missing and Distinct Values

In [None]:
from scoring.data_exploration import metadata_table

# Build one metadata table per dataset, tag each with the dataset's name, and
# concatenate them once at the end. Concatenating inside the loop re-copies all
# rows gathered so far on every iteration and needs an empty seed frame.
metadata_parts = []
for df_name, dataframe in dataframes_all.items():
    partial_result = metadata_table(dataframe)
    partial_result['dataframe'] = df_name
    metadata_parts.append(partial_result)

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
dq_missing_distinct = (
    pd.concat(metadata_parts, ignore_index=True) if metadata_parts else pd.DataFrame()
)

In [None]:
# Save the combined metadata summary to the working directory, then show it.
# to_csv is called without index=False, so the row index is written as the
# first CSV column. display is not imported in this file; it is expected to be
# provided by the IPython/Jupyter session.
dq_missing_distinct.to_csv('dq_missing_distinct.csv', encoding='utf-8')
display(dq_missing_distinct)

# Deleting Variables

## Set Variables not to Delete

In [None]:
# Columns that the deletion filters further down never drop, regardless of
# their fill rate or number of distinct values.
metadata_variables = [
    'SKP_APPLICATION',
    'SKP_CREDIT_CASE',
    'SKP_CLIENT',
    'DATE_DECISION',
    'FLAG_FPD_30',
    'FLAG_SPD_30',
    'NUM_GROUP_POSITION',
]

In [None]:
# Columns exempt from the "too many distinct categories" filter below.
date_variables = [
    'DATE_BIRTH',
    'APPL_DATE',
    'SELLERPLACEDATECREATE',
]

## Too Many Missing Values

In [None]:
# Threshold for the 'fill pct' value of a column: columns below it are dropped
# by the following cell unless they are listed in metadata_variables.
# NOTE(review): assumes 'fill pct' is expressed on a 0-100 scale - confirm
# against scoring.data_exploration.metadata_table.
min_fill_percentage = 5

In [None]:
# Drop every non-protected column whose fill percentage is below the threshold.
# The metadata table is not refreshed after a drop, so the column's presence
# is re-checked against the live frame before dropping.
for _, meta_row in dq_missing_distinct.iterrows():
    if meta_row['name'] in metadata_variables:
        continue
    if not meta_row['fill pct'] < min_fill_percentage:
        continue
    source_table = dataframes_all[meta_row['dataframe']]
    if meta_row['name'] in source_table.columns:
        source_table.drop(meta_row['name'], axis=1, inplace=True)
        print('Table '+meta_row['dataframe']+': column '+meta_row['name']+' dropped.')

## Not Enough Distinct Values

In [None]:
# Columns whose 'nunique' value is below this number are dropped by the
# following cell unless they are listed in metadata_variables.
min_distinct_values = 2

In [None]:
# Drop every non-protected column that has fewer distinct values than required.
# Columns already removed by an earlier filter are skipped via the membership
# check on the live frame.
for _, meta_row in dq_missing_distinct.iterrows():
    is_protected = meta_row['name'] in metadata_variables
    if is_protected or not meta_row['nunique'] < min_distinct_values:
        continue
    source_table = dataframes_all[meta_row['dataframe']]
    if meta_row['name'] in source_table.columns:
        source_table.drop(meta_row['name'], axis=1, inplace=True)
        print('Table '+meta_row['dataframe']+': column '+meta_row['name']+' dropped.')

## Too Many Distinct Values (Categorical)

In [None]:
# Columns of type 'object', 'str' or 'category' with more distinct values than
# this are dropped by the following cell; columns listed in metadata_variables
# or date_variables are exempt.
max_distinct_categories = 20

In [None]:
# Drop categorical columns with too many distinct values. Protected metadata
# columns and date columns are never dropped here; a column already removed by
# an earlier filter is skipped via the membership check on the live frame.
for _, meta_row in dq_missing_distinct.iterrows():
    if meta_row['name'] in metadata_variables:
        continue
    if meta_row['name'] in date_variables:
        continue
    if meta_row['type'] not in ('object', 'str', 'category'):
        continue
    if not meta_row['nunique'] > max_distinct_categories:
        continue
    source_table = dataframes_all[meta_row['dataframe']]
    if meta_row['name'] in source_table.columns:
        source_table.drop(meta_row['name'], axis=1, inplace=True)
        print('Table '+meta_row['dataframe']+': column '+meta_row['name']+' dropped.')

# NaN by Date

## Create Month Column From Date in Main Table

In [None]:
# Table that holds the date column (same object as df_main, not a copy).
date_source_table = dataframes_all['df_main']
# Source date column and the strptime format used to parse it.
date_column, date_column_format = 'DATE_DECISION', '%Y-%m-%d'
# Name of the month column derived from the date column.
month_column = 'MONTH_DECISION'

In [None]:
# Parse the date column, render each date as a 'YYYYMM' string and store its
# numeric form as the month column of the source table.
parsed_dates = pd.to_datetime(date_source_table[date_column], format=date_column_format)
month_labels = parsed_dates.dt.strftime('%Y%m')
date_source_table.loc[:, month_column] = pd.to_numeric(month_labels)

## NaN by Month Analysis

In [None]:
# Key column used to merge the month column from the date source table into
# each of the other tables.
join_on_column = 'SKP_CREDIT_CASE'

In [None]:
from scoring.data_exploration import nan_share_development

# Run the NaN-by-month analysis for every dataset and combine the results.
monthly_parts = []
for df_name, dataframe in dataframes_all.items():
    # Directory named after the dataset, passed to the analysis as output_path.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(df_name, exist_ok=True)
    # Every table except the date source itself needs the month column merged
    # in. An identity check is enough (date_source_table is taken straight from
    # dataframes_all) and avoids a full value-by-value comparison of the table.
    if dataframe is not date_source_table:
        dataframe = pd.merge(dataframe, date_source_table[[join_on_column, month_column]],
                             on=join_on_column, how='outer')
    partial_result = nan_share_development(dataframe, month_column, make_images=True, show_images=False,
                                           output_path=df_name+'/')
    partial_result['dataframe'] = df_name
    # Keep the result's index labels as a regular column, because the combined
    # table gets a fresh integer index.
    partial_result['column'] = partial_result.index
    monthly_parts.append(partial_result)

# Concatenate once instead of growing a frame inside the loop; pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
dq_nan_by_month = (
    pd.concat(monthly_parts, ignore_index=True) if monthly_parts else pd.DataFrame()
)

In [None]:
# Save the NaN-by-month summary to the working directory, then show it.
# to_csv is called without index=False, so the row index is written as the
# first CSV column. display is expected to come from the IPython/Jupyter session.
dq_nan_by_month.to_csv('dq_nan_by_month.csv', encoding='utf-8')
display(dq_nan_by_month)

# Unique Values per Application

In [None]:
# Identifier column handed to dynamic_diversity for each dynamic table.
id_column = 'SKP_CREDIT_CASE'

In [None]:
from scoring.data_exploration import dynamic_diversity

# Run dynamic_diversity on every dynamic dataset and combine the results.
diversity_parts = []
for df_name, dataframe in dataframes_dynamic.items():
    partial_result = dynamic_diversity(dataframe, id_column)
    partial_result['dataframe'] = df_name
    # Keep the result's index labels as a regular column, because the combined
    # table gets a fresh integer index.
    partial_result['column'] = partial_result.index
    diversity_parts.append(partial_result)

# Concatenate once instead of growing a frame inside the loop; pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
dq_dynamic_diversity = (
    pd.concat(diversity_parts, ignore_index=True) if diversity_parts else pd.DataFrame()
)

In [None]:
# Save the dynamic-diversity summary to the working directory, then show it.
# to_csv is called without index=False, so the row index is written as the
# first CSV column. display is expected to come from the IPython/Jupyter session.
dq_dynamic_diversity.to_csv('dq_dynamic_diversity.csv', encoding='utf-8')
display(dq_dynamic_diversity)

# Export Data

In [None]:
# Write each table, in its current state, to demo_data/ under a 'prep1_'
# prefix; the row index is left out of the files.
for df_name, dataframe in dataframes_all.items():
    export_path = 'demo_data/prep1_'+df_name+'.csv'
    dataframe.to_csv(export_path, encoding='utf-8', index=False)