In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow.parquet as pq
import os

In [2]:
import gc
from tqdm import tqdm

In [6]:
# Directory holding the training parquet files.
directory = '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/'

# Names of all regular files in `directory` (sub-directories are skipped).
# The files are only listed here; nothing is opened, since the parquet
# contents are read later with pandas. os.listdir returns entries in
# arbitrary order.
file_train_list = [
    filename
    for filename in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, filename))
]

print(file_train_list[:5])

['train_tax_registry_c_1.parquet', 'train_static_0_0.parquet', 'train_credit_bureau_a_1_3.parquet', 'train_credit_bureau_b_2.parquet', 'train_applprev_1_1.parquet']


In [4]:
import pandas as pd

def set_table_dtypes(df):
    """Cast columns of ``df`` in place based on their names and return ``df``.

    Rules, checked in this order for every column name:
      * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> nullable ``Int64``
      * ``date_decision`` or a name ending in ``D`` -> ``pd.to_datetime``
      * a name ending in ``P`` or ``A`` -> ``float``
    Any other column is left untouched.
    """
    integer_columns = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
    for name in df.columns:
        if name in integer_columns:
            df[name] = df[name].astype('Int64')
            continue
        if name == "date_decision" or name[-1] == "D":
            df[name] = pd.to_datetime(df[name])
        elif name[-1] in ("P", "A"):
            df[name] = df[name].astype(float)
    return df

def convert_strings(df):
    """Turn text columns of ``df`` into ordered categoricals with an "Unknown" level.

    Every column whose dtype name is ``object`` or ``string`` is cast to
    ``string`` and then to ``category``; the resulting categories are extended
    with an ``"Unknown"`` category and marked as ordered. Other columns are left
    untouched. ``df`` is modified in place and also returned.

    If a column already contains the literal value ``"Unknown"`` the category
    is not appended a second time, because ``pd.CategoricalDtype`` raises
    ``ValueError`` when given duplicate categories.
    """
    for col in df.columns:
        if df[col].dtype.name not in ('object', 'string'):
            continue
        as_category = df[col].astype("string").astype('category')
        categories = as_category.cat.categories.to_list()
        # Only add the placeholder level when the data does not already have it.
        if "Unknown" not in categories:
            categories.append("Unknown")
        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df


In [5]:
%%time
# Collect the names of columns to exclude. A column is flagged when, in any
# training file, it
#   * is null in more than half of the rows, or
#   * has a name ending in "M" or "D", or
#   * is a text column (object/string dtype) with more than 200 distinct values.
# Each name is recorded once, in first-seen order.
lower_50 = []
flagged_names = set()  # fast membership test that keeps lower_50 duplicate-free
for file_name in tqdm(file_train_list):
    df_ = pd.read_parquet(os.path.join(directory, file_name))
    df_ = set_table_dtypes(df_)
    half_rows = len(df_) / 2
    for column_label in df_.columns:
        if column_label in flagged_names:
            # Already flagged by an earlier file; nothing more to decide.
            continue
        column = df_[column_label]
        if column.isnull().sum() > half_rows:
            flagged = True
        elif column_label[-1] in ("M", "D"):
            flagged = True
        elif column.dtype.name in ('object', 'string'):
            # Missing values count as one distinct value.
            flagged = column.nunique(dropna=False) > 200
        else:
            flagged = False
        if flagged:
            flagged_names.add(column_label)
            lower_50.append(column_label)
    
#     print(len(df_.columns))

100%|██████████| 32/32 [04:05<00:00,  7.68s/it]

CPU times: user 4min 50s, sys: 2min, total: 6min 50s
Wall time: 4min 5s





In [6]:
# Take every column whose name ends in "P" back out of the exclusion list.
lower_50 = list(filter(lambda name: name[-1] != "P", lower_50))

# Select Feature

In [20]:
# Training files that are left out of the feature selection below.
remo = [
    'train_credit_bureau_b_2.parquet',
    'train_credit_bureau_a_2_6.parquet',
    'train_credit_bureau_a_2_1.parquet',
    'train_credit_bureau_a_2_0.parquet',
    'train_credit_bureau_a_2_7.parquet',
    'train_credit_bureau_a_2_5.parquet',
    'train_credit_bureau_a_2_2.parquet',
    'train_credit_bureau_a_2_4.parquet',
    'train_credit_bureau_a_2_9.parquet',
    'train_credit_bureau_a_2_3.parquet',
    'train_applprev_2.parquet',
    'train_credit_bureau_a_2_10.parquet',
    'train_credit_bureau_a_2_8.parquet',
    'train_person_2.parquet',
    'train_credit_bureau_a_1_3.parquet',
    'train_credit_bureau_a_1_2.parquet',
    'train_credit_bureau_a_1_1.parquet',
    'train_other_1.parquet',
    'train_person_1.parquet',
    'train_base.parquet',
    'train_debitcard_1.parquet',
    'train_tax_registry_c_1.parquet',
    'train_credit_bureau_b_1.parquet',
    'train_credit_bureau_a_1_0.parquet',
]

In [21]:
# Files that are actually used: everything in file_train_list not named in `remo`.
file_train_list_clear = [
    name
    for name in file_train_list
    if name not in remo
]
file_train_list_clear

['train_static_0_0.parquet',
 'train_applprev_1_1.parquet',
 'train_static_cb_0.parquet',
 'train_tax_registry_a_1.parquet',
 'train_tax_registry_b_1.parquet',
 'train_deposit_1.parquet',
 'train_static_0_1.parquet',
 'train_applprev_1_0.parquet']

In [25]:
# Collect candidate feature columns from the selected files: the name must end
# in "A", "P", "L" or "T" and the dtype must be int64 or float64.
# A column name can occur in more than one file (presumably the split files
# such as train_static_0_0 / train_static_0_1 share a schema; confirm), so each
# name is recorded only once, in first-seen order.
fe_feature_list = []
seen_features = set()
for file_name in tqdm(file_train_list_clear):
    df_ = pd.read_parquet(os.path.join(directory, file_name))
    for feature in df_.columns:
        if feature in seen_features:
            continue
        has_feature_suffix = feature[-1] in ('A', 'P', 'L', 'T')
        if has_feature_suffix and df_[feature].dtype.name in ('int64', 'float64'):
            seen_features.add(feature)
            fe_feature_list.append(feature)


100%|██████████| 8/8 [00:11<00:00,  1.47s/it]


In [26]:
# Number of distinct feature names in fe_feature_list.
print(len(set(fe_feature_list)))

185


In [None]:
# Discard features whose name ends in "T", then display the remaining list.
fe_feature_list = list(filter(lambda name: name[-1] != "T", fe_feature_list))
fe_feature_list

In [29]:
# Print the full list of selected feature names as plain text.
print(fe_feature_list)

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'daysoverduetoleranced

In [65]:
# Save the selected feature names as a one-column table ("feature").
# The default integer index is written to the CSV as well.
csv_feature = pd.DataFrame({'feature': fe_feature_list})
csv_feature.to_csv('/kaggle/working/feature_select.csv')