# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import gc
import os

import sys
sys.path.insert(0, '../')
import scoring
#import importlib
#importlib.reload(scoring.data_exploration)

# Import Datasets

## Dictionary of Datasets to Import

In [None]:
# Logical dataframe name -> path of the CSV file it is loaded from.
datasources = dict(
    df_main='demo_data/prep1_df_main.csv',
    df_static='demo_data/prep1_df_static.csv',
    df_person_features='demo_data/DemoVct_features_persons.csv',
    df_prevappl_features='demo_data/DemoVct_features_prevappl.csv',
    df_features_documents='demo_data/DemoVct_features_documents.csv',
)

## Iterative Import

In [None]:
# Read every configured CSV into a dataframe, keyed by its logical name.
# With keep_default_na=False and na_values=[''], only empty fields are
# parsed as missing values.
dataframes = {
    df_name: pd.read_csv(
        df_source,
        sep=',',
        decimal='.',
        encoding='utf-8',
        low_memory=False,
        keep_default_na=False,
        na_values=[''],
    )
    for df_name, df_source in datasources.items()
}

## Join All Data

In [None]:
# Key column used as `on=` when the loaded dataframes are merged into df_all.
join_on_column = 'SKP_CREDIT_CASE'

In [None]:
# Combine all loaded tables into df_all: the first table is the starting
# point, every following one is outer-joined on the shared key, so rows that
# appear in only some of the tables are kept.
# Only the names are listed here (not the frames themselves) so that no extra
# references to the source tables outlive this cell.
table_names = list(dataframes)
if table_names:
    df_all = dataframes[table_names[0]]
for df_name in table_names[1:]:
    df_all = df_all.merge(dataframes[df_name], on=join_on_column, how='outer')

## Delete Original Dataframes

In [None]:
# The per-source frames are no longer needed once df_all exists; drop the
# dict that references them and trigger a garbage-collection pass.
del dataframes
gc.collect()

# Date Diffs

In [None]:
# Column holding the decision date; it is the first argument of every
# datetime_difference call below.
col_time = 'DATE_DECISION'

In [None]:
from scoring.date_tools import datetime_difference

# One entry per derived feature:
# (new column, source date column, strptime format of the source, unit).
datediff_specs = (
    ("AGE", "DATE_BIRTH", "%d-%b-%y", "years"),
    ("APPL_DATE_MAX_diff", "APPL_DATE_MAX", "%Y-%m-%d", "days"),
    ("SELLERPLACEDATECREATE_diff", "SELLERPLACEDATECREATE", "%d-%b-%y", "months"),
)

# Names of the derived date-difference columns, in creation order.
cols_datediff = [spec[0] for spec in datediff_specs]

# Each feature is the floored difference between the decision date and the
# source date, expressed in the unit given by the spec.
# NOTE(review): fix_y2k presumably corrects two-digit-year parsing -- confirm
# in scoring.date_tools.
for diff_col, source_col, source_format, diff_unit in datediff_specs:
    df_all[diff_col] = datetime_difference(
        pd.to_datetime(df_all[col_time], format="%Y-%m-%d"),
        pd.to_datetime(df_all[source_col], format=source_format),
        unit=diff_unit,
        rounding="floor",
        show_warnings=True,
        fix_y2k=True,
    )

# Missing and Distinct Values

In [None]:
from scoring.data_exploration import metadata_table

# Per-column data-quality table for df_all. The cells below read its
# 'name', 'fill pct', 'nunique' and 'type' columns.
dq_missing_distinct_features = metadata_table(df_all)

In [None]:
# Save the data-quality table next to the notebook and show it inline.
# `display` is not imported here; it is expected to come from the
# IPython/Jupyter runtime.
dq_missing_distinct_features.to_csv('dq_missing_distinct_features.csv', encoding='utf-8')
display(dq_missing_distinct_features)

# Deleting Variables

## Set Variables with Metadata

In [None]:
# Columns that the automatic drop rules below must never remove.
metadata_variables = ['SKP_APPLICATION',
                      'SKP_CREDIT_CASE',  # join key (join_on_column)
                      'SKP_CLIENT',
                      'DATE_DECISION',  # col_time
                      'MONTH_DECISION',  # month_column
                      'FLAG_FPD_30',  # read by the target-creation cell
                      'FLAG_SPD_30',  # read by the target-creation cell
                     ]

In [None]:
# Columns listed here are exempt from the "too many distinct values" drop
# below; the list is currently empty.
date_variables = [
]

## Too Many Missing Values

In [None]:
# Non-metadata columns whose 'fill pct' in the data-quality table is below
# this threshold are dropped.
# NOTE(review): assumes 'fill pct' is on a 0-100 scale -- confirm in metadata_table.
min_fill_percentage = 5

In [None]:
# Drop features that are filled too rarely; metadata columns are never touched.
# Candidates are gathered in one pass over the data-quality table and then
# removed from df_all in a single call.
sparse_features = []
for feature, fill_pct in zip(dq_missing_distinct_features['name'],
                             dq_missing_distinct_features['fill pct']):
    # Skip protected columns and names that are already scheduled for removal.
    if feature in metadata_variables or feature in sparse_features:
        continue
    if fill_pct < min_fill_percentage and feature in df_all.columns:
        sparse_features.append(feature)

df_all.drop(sparse_features, axis=1, inplace=True)
for feature in sparse_features:
    print(f'Column {feature} dropped.')

## Not Enough Distinct Values

In [None]:
# Non-metadata columns with fewer distinct values ('nunique') than this are
# dropped.
min_distinct_values = 2

In [None]:
# Drop features with too few distinct values; metadata columns are never touched.
# Candidates are gathered in one pass over the data-quality table and then
# removed from df_all in a single call.
constant_features = []
for feature, n_distinct in zip(dq_missing_distinct_features['name'],
                               dq_missing_distinct_features['nunique']):
    # Skip protected columns and names that are already scheduled for removal.
    if feature in metadata_variables or feature in constant_features:
        continue
    if n_distinct < min_distinct_values and feature in df_all.columns:
        constant_features.append(feature)

df_all.drop(constant_features, axis=1, inplace=True)
for feature in constant_features:
    print(f'Column {feature} dropped.')

## Too Many Distinct Values (Categorical)

In [None]:
# Categorical columns (type 'object', 'str' or 'category' in the data-quality
# table) with more distinct values than this are dropped.
max_distinct_categories = 20

In [None]:
# Drop categorical features with too many distinct values. Metadata columns
# and declared date columns are exempt. Candidates are gathered in one pass
# over the data-quality table and then removed from df_all in a single call.
categorical_types = ('object', 'str', 'category')
high_cardinality_features = []
for feature, feature_type, n_distinct in zip(dq_missing_distinct_features['name'],
                                             dq_missing_distinct_features['type'],
                                             dq_missing_distinct_features['nunique']):
    if feature in metadata_variables or feature in date_variables:
        continue
    if feature_type not in categorical_types:
        continue
    # Skip names that are already scheduled for removal.
    if feature in high_cardinality_features:
        continue
    if n_distinct > max_distinct_categories and feature in df_all.columns:
        high_cardinality_features.append(feature)

df_all.drop(high_cardinality_features, axis=1, inplace=True)
for feature in high_cardinality_features:
    print(f'Column {feature} dropped.')

## Manual Drop

In [None]:
# Raw date columns removed by hand; each one was the source date for one of
# the date-difference features computed above.
cols_to_drop = [
    'DATE_BIRTH',
    'APPL_DATE_MAX',
    'SELLERPLACEDATECREATE',
]

In [None]:
# Remove the hand-picked columns that are still present in df_all.
# dict.fromkeys keeps the listed order while ignoring repeated names.
manual_drops = [column for column in dict.fromkeys(cols_to_drop)
                if column in df_all.columns]
df_all.drop(manual_drops, axis=1, inplace=True)
for column in manual_drops:
    print(f'Column {column} dropped.')

# NaN by Date

In [None]:
# Column holding the decision month; passed to nan_share_development and
# used as the time column of the sample split below.
month_column = 'MONTH_DECISION'

In [None]:
from scoring.data_exploration import nan_share_development

# NOTE(review): dq_nan_by_month is not read anywhere in the visible notebook;
# it is kept in case a later cell relies on it.
dq_nan_by_month = pd.DataFrame()

# Make sure the output folder exists. exist_ok=True replaces the former
# "check, then create" pair, so the call succeeds when the folder is already
# there and cannot race with another process creating it in between.
os.makedirs('df_all', exist_ok=True)

# NaN statistics of df_all by month_column; with make_images=True and
# show_images=False the helper is asked to write images to output_path
# rather than show them inline.
dq_nan_by_month_features = nan_share_development(df_all, month_column, make_images=True, show_images=False,
                                           output_path = 'df_all/')

In [None]:
# Save the NaN-by-month table next to the notebook and show it inline.
# `display` is not imported here; it is expected to come from the
# IPython/Jupyter runtime.
dq_nan_by_month_features.to_csv('dq_nan_by_month_features.csv', encoding='utf-8')
display(dq_nan_by_month_features)

# Create Targets from DM_UWI Flags

In [None]:
# Building blocks of the flag column names FLAG_<instalment>PD_<length>
# that the target-creation cell looks up in df_all.
# NOTE(review): presumably lengths are days past due and F/S/T/Q stand for
# the first to fourth instalment -- confirm against the data dictionary.
target_lengths = ['30', '60', '90']
target_instalments = ['F', 'S', 'T', 'Q']

In [None]:
# Build cumulative BASE_*/TARGET_* indicator columns from the
# FLAG_<instalment>PD_<length> columns present in df_all.
#
# For every length, the flags are visited in instalment order. After each flag
# that exists, two 0/1 columns are written:
#   BASE_<letters>PD_<length>   = 1 where every flag seen so far is not null
#   TARGET_<letters>PD_<length> = 1 where BASE is 1 and any flag seen so far is > 0
# The masks are accumulated directly instead of building condition strings
# and passing them to eval().
for t_len in target_lengths:
    all_observed = None  # running AND of "flag is not null"
    any_positive = None  # running OR of "flag > 0"
    for inst_idx, t_inst in enumerate(target_instalments):
        flag_name = 'FLAG_' + t_inst + 'PD_' + t_len
        if flag_name not in df_all.columns:
            continue

        flag_values = df_all[flag_name]
        observed = flag_values.notnull()
        positive = flag_values > 0
        all_observed = observed if all_observed is None else all_observed & observed
        any_positive = positive if any_positive is None else any_positive | positive

        # The name always carries every instalment letter up to the current
        # one, even when an earlier flag is absent from df_all.
        instalment_letters = ''.join(target_instalments[:inst_idx + 1])
        base_name = 'BASE_' + instalment_letters + 'PD_' + t_len
        target_name = 'TARGET_' + instalment_letters + 'PD_' + t_len

        # int64 matches the dtype of a column initialised with the scalar 0.
        df_all[base_name] = all_observed.astype('int64')
        df_all[target_name] = (all_observed & any_positive).astype('int64')
        print(target_name+', '+base_name+' created.')

# Data Sample Split

In [None]:
# Target column used, together with month_column, to stratify the sample
# split below.
main_target = 'TARGET_FSPD_30'

In [None]:
from scoring.data_manipulation import data_sample_time_split

# Label every row with the sample it belongs to and store it in 'data_type'.
# NOTE(review): presumably the two splitting_points cut month_column into
# three consecutive periods, labelled 'hoot', then 'train'/'valid'/'test' in
# 40/30/30 shares, then 'oot' -- confirm against data_sample_time_split.
# A fixed random_seed is passed, presumably for reproducibility.
df_all['data_type'] = data_sample_time_split(df_all, 
                           time_column = month_column,
                           splitting_points = [201805, 201806],
                           sample_sizes = [[ 1    ],[ 0.4   , 0.3   , 0.3  ],[ 1   ]],
                           sample_names = [['hoot'],['train','valid','test'],['oot']],
                           stratify_by_columns = [month_column,main_target],
                           random_seed = 1234)

# Export Data

In [None]:
# Write the prepared dataset to disk; index=False leaves the row index out
# of the file.
df_all.to_csv('prep2_df_all.csv', encoding='utf-8', index=False)