In [1]:
!pip install dacon_submit_api-0.0.4-py3-none-any.whl

Processing ./dacon_submit_api-0.0.4-py3-none-any.whl
dacon-submit-api is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [2]:
#cd54fb1ab20010c1d742bd9789ca793fa6a83ebf47efb0818cbe576baf14f873

In [3]:
import pandas as pd
import numpy as np
import gc

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import psutil
import os

def get_memory_usage_mb():
    process = psutil.Process(os.getpid())
    mem_bytes = process.memory_info().rss  # 현재 프로세스의 실제 메모리 사용량
    return mem_bytes / 1024 ** 2  # MB 단위로 변환

In [4]:
import pandas as pd
import gc

class CARD_container():
    def __init__(self):
        self.data_splits = ["train", "test"]
        self.data_categories = {
            "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
            "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
            "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
            "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
            "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
            "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
            "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
            "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
        }
        self.months = ['07', '08', '09', '10', '11', '12']
        self.data = {}       
        self.train_data = {} 
        self.test_data = {}  
    @staticmethod
    def get_memory_usage_mb():
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 ** 2
        
    def get_data(self, path='/home/cksgh8511/키움/card', months=['07'], data_splits=["train", "test"]):
        for split in data_splits:
            for category, info in self.data_categories.items():
                folder = info["folder"]
                suffix = info["suffix"]
                var_prefix = info["var_prefix"]

                for month in months:
                    file_path = f"{path}/{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
                    variable_name = f"{var_prefix}_{split}_{month}"

                    try:
                        if split == 'train':
                            self.train_data[variable_name] = pd.read_parquet(file_path)
                            print(f"{variable_name} is loaded from {file_path}")
                        elif split == 'test':
                            self.test_data[variable_name] = pd.read_parquet(file_path)
                            print(f"{variable_name} is loaded from {file_path}")
                    except Exception as e:
                        print(f"Failed to load {file_path}: {e}")

        gc.collect()

    def merge_train_data(self, months=None):
        """여러 month에 대해 train 데이터 통합"""
        if months is None:
            months = self.months  # default: ['07', ..., '12']

        for month in months:
            print(f"\n🛠️ [Train] {month}월 데이터 통합 시작")

            self.get_memory_usage_mb()

            customer_key = f'customer_train_{month}'
            credit_key = f'credit_train_{month}'

            if customer_key not in self.train_data or credit_key not in self.train_data:
                print(f"❗ {customer_key} 또는 {credit_key}가 없습니다. 스킵합니다.")
                continue

            train_df = self.train_data[customer_key].merge(
                self.train_data[credit_key], on=['기준년월', 'ID'], how='left'
            )
            print(f"Step1 저장 완료: {customer_key} + {credit_key}, shape: {train_df.shape}")

            del self.train_data[customer_key]
            del self.train_data[credit_key]
            gc.collect()

            merge_sequence = [
                ("sales", "Step2"),
                ("billing", "Step3"),
                ("balance", "Step4"),
                ("channel", "Step5"),
                ("marketing", "Step6"),
                ("performance", "최종")
            ]

            for prefix, step_name in merge_sequence:
                df_key = f'{prefix}_train_{month}'

                if df_key not in self.train_data:
                    print(f"Warning: {df_key} 없음. {step_name} 단계 스킵.")
                    continue

                train_df = train_df.merge(
                    self.train_data[df_key], on=['기준년월', 'ID'], how='left'
                )
                print(f"{step_name} 저장 완료: merge {df_key}, shape: {train_df.shape}")

                del self.train_data[df_key]
                gc.collect()

            self.get_memory_usage_mb()

            self.train_data[f'train_merged_{month}'] = train_df
            print(f"✅ 최종 통합 완료: train_merged_{month}, shape: {train_df.shape}")

    def merge_test_data(self, months=None):
        """여러 month에 대해 test 데이터 통합"""
        if months is None:
            months = self.months

        for month in months:
            print(f"\n🛠️ [Test] {month}월 데이터 통합 시작")

            self.get_memory_usage_mb()

            customer_key = f'customer_test_{month}'
            credit_key = f'credit_test_{month}'

            if customer_key not in self.test_data or credit_key not in self.test_data:
                print(f"❗ {customer_key} 또는 {credit_key}가 없습니다. 스킵합니다.")
                continue

            test_df = self.test_data[customer_key].merge(
                self.test_data[credit_key], on=['기준년월', 'ID'], how='left'
            )
            print(f"Step1 저장 완료: {customer_key} + {credit_key}, shape: {test_df.shape}")

            del self.test_data[customer_key]
            del self.test_data[credit_key]
            gc.collect()

            merge_sequence = [
                ("sales", "Step2"),
                ("billing", "Step3"),
                ("balance", "Step4"),
                ("channel", "Step5"),
                ("marketing", "Step6"),
                ("performance", "최종")
            ]

            for prefix, step_name in merge_sequence:
                df_key = f'{prefix}_test_{month}'

                if df_key not in self.test_data:
                    print(f"Warning: {df_key} 없음. {step_name} 단계 스킵.")
                    continue

                test_df = test_df.merge(
                    self.test_data[df_key], on=['기준년월', 'ID'], how='left'
                )
                print(f"{step_name} 저장 완료: merge {df_key}, shape: {test_df.shape}")

                del self.test_data[df_key]
                gc.collect()

            self.get_memory_usage_mb()

            self.test_data[f'test_merged_{month}'] = test_df
            print(f"✅ 최종 통합 완료: test_merged_{month}, shape: {test_df.shape}")
    
    def concat_merged_train_data(self, months=None):
        """여러 월의 train_merged 데이터프레임을 하나로 통합"""
        if months is None:
            months = self.months

        dfs = []
        for month in months:
            key = f"train_merged_{month}"
            if key in self.train_data:
                dfs.append(self.train_data[key])
            else:
                print(f"⚠️ {key} 없음. 스킵합니다.")

        if not dfs:
            print("❌ 병합할 데이터가 없습니다.")
            return None

        full_df = pd.concat(dfs, axis=0, ignore_index=True)
        self.train_data['train_merged_all'] = full_df
        print(f"\n🎯 전체 통합 완료: train_merged_all, shape: {full_df.shape}")
        return full_df   
    
    def concat_merged_test_data(self, months=None):
        """여러 월의 test_merged 데이터프레임을 하나로 통합"""
        if months is None:
            months = self.months

        dfs = []
        for month in months:
            key = f"test_merged_{month}"
            if key in self.test_data:
                dfs.append(self.test_data[key])
            else:
                print(f"⚠️ {key} 없음. 스킵합니다.")

        if not dfs:
            print("❌ 병합할 test 데이터가 없습니다.")
            return None

        full_df = pd.concat(dfs, axis=0, ignore_index=True)
        self.test_data['test_merged_all'] = full_df
        print(f"\n🎯 전체 통합 완료: test_merged_all, shape: {full_df.shape}")
        return full_df
    

In [6]:
card_container = CARD_container()
card_container.get_data(path='/home/cksgh8511/키움/card', months=card_container.months, data_splits=["train", 'test'])



customer_train_07 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201807_train_회원정보.parquet
customer_train_08 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201808_train_회원정보.parquet
customer_train_09 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201809_train_회원정보.parquet
customer_train_10 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201810_train_회원정보.parquet
customer_train_11 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201811_train_회원정보.parquet
customer_train_12 is loaded from /home/cksgh8511/키움/card/train/1.회원정보/201812_train_회원정보.parquet
credit_train_07 is loaded from /home/cksgh8511/키움/card/train/2.신용정보/201807_train_신용정보.parquet
credit_train_08 is loaded from /home/cksgh8511/키움/card/train/2.신용정보/201808_train_신용정보.parquet
credit_train_09 is loaded from /home/cksgh8511/키움/card/train/2.신용정보/201809_train_신용정보.parquet
credit_train_10 is loaded from /home/cksgh8511/키움/card/train/2.신용정보/201810_train_신용정보.parquet
credit_train_11 is loaded from /home/cksgh8511/키

In [8]:
# train 데이터 통합
card_container.merge_train_data(months=card_container.months)
card_container.concat_merged_train_data(months=card_container.months)


🛠️ [Train] 07월 데이터 통합 시작
Step1 저장 완료: customer_train_07 + credit_train_07, shape: (400000, 118)
Step2 저장 완료: merge sales_train_07, shape: (400000, 522)
Step3 저장 완료: merge billing_train_07, shape: (400000, 566)
Step4 저장 완료: merge balance_train_07, shape: (400000, 646)
Step5 저장 완료: merge channel_train_07, shape: (400000, 749)
Step6 저장 완료: merge marketing_train_07, shape: (400000, 811)
최종 저장 완료: merge performance_train_07, shape: (400000, 858)
✅ 최종 통합 완료: train_merged_07, shape: (400000, 858)

🛠️ [Train] 08월 데이터 통합 시작
Step1 저장 완료: customer_train_08 + credit_train_08, shape: (400000, 118)
Step2 저장 완료: merge sales_train_08, shape: (400000, 522)
Step3 저장 완료: merge billing_train_08, shape: (400000, 566)
Step4 저장 완료: merge balance_train_08, shape: (400000, 646)
Step5 저장 완료: merge channel_train_08, shape: (400000, 749)
Step6 저장 완료: merge marketing_train_08, shape: (400000, 811)
최종 저장 완료: merge performance_train_08, shape: (400000, 858)
✅ 최종 통합 완료: train_merged_08, shape: (400000, 858)

🛠️ [Tra

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0.999998,1.042805,0.999700,0.999998,0.999998,0.261886,0.270752,0.000000,1.044401,1.280543
1,201807,TRAIN_000001,1,30대,E,1,1,1,1,1,...,1.092698,0.905663,0.999998,0.999998,0.999998,-0.563388,-0.670348,0.000000,0.000000,0.000000
2,201807,TRAIN_000002,1,30대,C,1,1,0,1,1,...,1.006124,1.993590,0.852567,0.999998,0.999998,-0.046516,0.058114,-0.014191,0.524159,1.208420
3,201807,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0.999998,1.050646,0.999877,0.999998,0.999998,0.023821,0.258943,0.000000,0.880925,1.657124
4,201807,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,2,70대이상,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
2399996,201812,TRAIN_399996,2,50대,D,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.921733,-0.203251,-0.159143,0.000000,1.377071,2.533815
2399997,201812,TRAIN_399997,1,30대,C,1,1,0,1,1,...,0.999998,0.345027,0.999998,0.999998,0.999998,0.027319,0.126581,0.000000,0.000000,0.000000
2399998,201812,TRAIN_399998,1,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,


In [9]:
# test 데이터 통합
card_container.merge_test_data(months=card_container.months)  # 월별 병합 먼저
card_container.concat_merged_test_data(months=card_container.months)  # 전체 통합



🛠️ [Test] 07월 데이터 통합 시작
Step1 저장 완료: customer_test_07 + credit_test_07, shape: (100000, 117)
Step2 저장 완료: merge sales_test_07, shape: (100000, 521)
Step3 저장 완료: merge billing_test_07, shape: (100000, 565)
Step4 저장 완료: merge balance_test_07, shape: (100000, 645)
Step5 저장 완료: merge channel_test_07, shape: (100000, 748)
Step6 저장 완료: merge marketing_test_07, shape: (100000, 810)
최종 저장 완료: merge performance_test_07, shape: (100000, 857)
✅ 최종 통합 완료: test_merged_07, shape: (100000, 857)

🛠️ [Test] 08월 데이터 통합 시작
Step1 저장 완료: customer_test_08 + credit_test_08, shape: (100000, 117)
Step2 저장 완료: merge sales_test_08, shape: (100000, 521)
Step3 저장 완료: merge billing_test_08, shape: (100000, 565)
Step4 저장 완료: merge balance_test_08, shape: (100000, 645)
Step5 저장 완료: merge channel_test_08, shape: (100000, 748)
Step6 저장 완료: merge marketing_test_08, shape: (100000, 810)
최종 저장 완료: merge performance_test_08, shape: (100000, 857)
✅ 최종 통합 완료: test_merged_08, shape: (100000, 857)

🛠️ [Test] 09월 데이터 통합 시작
Ste

Unnamed: 0,기준년월,ID,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
0,201807,TEST_00000,1,40대,1,1,0,1,2,2,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.209395,0.231043,0.0,1.332770,1.780392
1,201807,TEST_00001,1,60대,1,1,0,1,1,1,...,0.999998,1.044473,1.991974,0.999998,0.926569,-0.269161,-0.247241,0.0,0.000000,0.000000
2,201807,TEST_00002,1,40대,1,1,1,1,2,2,...,0.999998,1.053083,0.999998,0.999998,0.999998,-0.120290,0.029270,0.0,4.123738,5.115589
3,201807,TEST_00003,2,40대,1,1,1,1,1,1,...,0.999998,1.991630,0.999998,0.999998,0.999998,0.035807,-0.013359,0.0,0.093615,0.349994
4,201807,TEST_00004,2,40대,1,0,1,1,1,1,...,0.999998,1.053743,0.999998,0.999998,0.999998,-0.538740,-0.449378,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,201812,TEST_99995,2,60대,0,0,0,0,0,0,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.0,,
599996,201812,TEST_99996,1,30대,1,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.143554,0.233616,0.0,8.564683,11.379632
599997,201812,TEST_99997,2,30대,1,1,1,1,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,,
599998,201812,TEST_99998,1,30대,1,1,1,1,3,3,...,0.999998,0.333183,0.999998,0.999998,0.999998,-0.038153,-0.106142,0.0,-0.241530,0.499079


In [10]:
card_container.train_data['train_merged_all'].to_parquet("train_final.parquet")
card_container.test_data['test_merged_all'].to_parquet("test_final.parquet")

In [47]:
card_container.train_data

{'customer_train_07':           기준년월            ID  남녀구분코드     연령 Segment  회원여부_이용가능  회원여부_이용가능_CA  \
 0       201807  TRAIN_000000       2    40대       D          1             1   
 1       201807  TRAIN_000001       1    30대       E          1             1   
 2       201807  TRAIN_000002       1    30대       C          1             1   
 3       201807  TRAIN_000003       2    40대       D          1             1   
 4       201807  TRAIN_000004       2    40대       E          1             1   
 ...        ...           ...     ...    ...     ...        ...           ...   
 399995  201807  TRAIN_399995       2  70대이상       E          1             1   
 399996  201807  TRAIN_399996       2    50대       D          1             1   
 399997  201807  TRAIN_399997       1    30대       C          1             1   
 399998  201807  TRAIN_399998       1    40대       E          1             1   
 399999  201807  TRAIN_399999       2    40대       E          1             1   
 
     

In [48]:
for key, v in card_container.train_data.items():
    print(key)



customer_train_07
customer_train_08
customer_train_09
customer_train_10
customer_train_11
customer_train_12
credit_train_07
credit_train_08
credit_train_09
credit_train_10
credit_train_11
credit_train_12
sales_train_07
sales_train_08
sales_train_09
sales_train_10
sales_train_11
sales_train_12
billing_train_07
billing_train_08
billing_train_09
billing_train_10
billing_train_11
billing_train_12
balance_train_07
balance_train_08
balance_train_09
balance_train_10
balance_train_11
balance_train_12
channel_train_07
channel_train_08
channel_train_09
channel_train_10
channel_train_11
channel_train_12
marketing_train_07
marketing_train_08
marketing_train_09
marketing_train_10
marketing_train_11
marketing_train_12
performance_train_07
performance_train_08
performance_train_09
performance_train_10
performance_train_11
performance_train_12


In [5]:
card_container.test_data

{'customer_test_07':          기준년월          ID  남녀구분코드   연령  회원여부_이용가능  회원여부_이용가능_CA  \
 0      201807  TEST_00000       1  40대          1             1   
 1      201807  TEST_00001       1  60대          1             1   
 2      201807  TEST_00002       1  40대          1             1   
 3      201807  TEST_00003       2  40대          1             1   
 4      201807  TEST_00004       2  40대          1             0   
 ...       ...         ...     ...  ...        ...           ...   
 99995  201807  TEST_99995       2  60대          0             0   
 99996  201807  TEST_99996       1  30대          1             1   
 99997  201807  TEST_99997       2  30대          1             1   
 99998  201807  TEST_99998       1  30대          1             1   
 99999  201807  TEST_99999       1  30대          1             1   
 
        회원여부_이용가능_카드론  소지여부_신용  소지카드수_유효_신용  소지카드수_이용가능_신용  ...  \
 0                  0        1            2              2  ...   
 1                  0       

In [49]:
import pandas as pd

class CARD_EDA():
    def __init__(self, card_container):
        self.card_container = card_container
        self.data_types = ['customer','credit', 'sales', 'billing', 'balance', 'channel', 'marketing', 'performance']

    def get_columns(self, dfs='train'):

        if dfs == 'train':
            dfs = self.card_container.train_data
        elif dfs == 'test':
            dfs = self.card_container.test_data

        for name, df in dfs.items():
            print(f"\n[{name} 컬럼 목록] ({df.shape[1]} columns)")
            print(df.columns.tolist())

    def print_unique(self, dfs='train', drop=True):
        def get_unique_info_for_df(series):
            value_counts = series.value_counts(dropna=False)
            return pd.Series({
                'n_unique': series.nunique(dropna=False),
                'value_counts': value_counts.to_dict()
            })

        if dfs == 'train':
            dfs = self.card_container.train_data
        elif dfs == 'test':
            dfs = self.card_container.test_data

        for key, value in self.constant_features_dict.items():
            
            result_df = dfs[key][value].apply(get_unique_info_for_df).T
            print(key)
            if result_df.empty:
                print("Empty ")
            else:
                print(result_df)

    def get_constant_feature(self, dfs='train', drop=True):
        if dfs == 'train':
            dfs = self.card_container.train_data
        elif dfs == 'test':
            dfs = self.card_container.test_data

        constant_features_dict = {}

        for name, df in dfs.items():
            constant_features = [
                col for col in df.columns
                if df[col].nunique(dropna=False) <= 1 and col != '기준년월'
            ]
            constant_features_dict[name] = constant_features
            if drop:
                df.drop(columns=constant_features, inplace=True)

            print(f"[{name}] 상수 feature 개수 (기준년월 제외): {len(constant_features)}개")
            if constant_features:
                print(" -", constant_features)
            else:
                print(" - 없음")

        self.constant_features_dict = constant_features_dict
    
    def get_date_features(self, dfs='train', drop=False):
        """ 날짜형 feature를 찾아 제거하는 메서드 """
        if dfs == 'train':
            dfs = self.card_container.train_data
        elif dfs == 'test':
            dfs = self.card_container.test_data

        date_features_dict = {}

        for name, df in dfs.items():
            date_features = [
                col for col in df.columns
                if (pd.api.types.is_datetime64_any_dtype(df[col]) or 'date' in col.lower() or '일자' in col)
            ]
            date_features_dict[name] = date_features
            if drop:
                df.drop(columns=date_features, inplace=True)

            print(f"[{name}] 날짜형 feature 개수: {len(date_features)}개")
            if date_features:
                print(" -", date_features)
            else:
                print(" - 없음")

        self.date_features_dict = date_features_dict

    def get_high_single_value_features(self, dfs='train', threshold=0.95, drop=True):
        """ 결측치 또는 단일값이 95% 이상인 feature 제거하는 메서드 """
        if dfs == 'train':
            dfs = self.card_container.train_data
        elif dfs == 'test':
            dfs = self.card_container.test_data

        high_single_value_features_dict = {}

        for name, df in dfs.items():
            high_single_features = []

            for col in df.columns:
                if df[col].isna().mean() >= threshold:
                    high_single_features.append(col)
                else:
                    top_freq = df[col].value_counts(normalize=True, dropna=False).values[0]
                    if top_freq >= threshold:
                        high_single_features.append(col)

            high_single_value_features_dict[name] = high_single_features

            if drop:
                df.drop(columns=high_single_features, inplace=True)

            print(f"[{name}] 95% 이상 결측/단일값 feature 개수: {len(high_single_features)}개")
            if high_single_features:
                print(" -", high_single_features)
            else:
                print(" - 없음")

        self.high_single_value_features_dict = high_single_value_features_dict


In [50]:
card_eda  = CARD_EDA(card_container)
card_eda.card_container

<__main__.CARD_container at 0x7fa050e27ce0>

In [51]:
card_eda.get_columns(dfs = 'train')


[customer_train_07 컬럼 목록] (78 columns)
['기준년월', 'ID', '남녀구분코드', '연령', 'Segment', '회원여부_이용가능', '회원여부_이용가능_CA', '회원여부_이용가능_카드론', '소지여부_신용', '소지카드수_유효_신용', '소지카드수_이용가능_신용', '입회일자_신용', '입회경과개월수_신용', '회원여부_연체', '이용거절여부_카드론', '동의여부_한도증액안내', '수신거부여부_TM', '수신거부여부_DM', '수신거부여부_메일', '수신거부여부_SMS', '가입통신회사코드', '탈회횟수_누적', '최종탈회후경과월', '탈회횟수_발급6개월이내', '탈회횟수_발급1년이내', '거주시도명', '직장시도명', '마케팅동의여부', '유효카드수_신용체크', '유효카드수_신용', '유효카드수_신용_가족', '유효카드수_체크', '유효카드수_체크_가족', '이용가능카드수_신용체크', '이용가능카드수_신용', '이용가능카드수_신용_가족', '이용가능카드수_체크', '이용가능카드수_체크_가족', '이용카드수_신용체크', '이용카드수_신용', '이용카드수_신용_가족', '이용카드수_체크', '이용카드수_체크_가족', '이용금액_R3M_신용체크', '이용금액_R3M_신용', '이용금액_R3M_신용_가족', '이용금액_R3M_체크', '이용금액_R3M_체크_가족', '_1순위카드이용금액', '_1순위카드이용건수', '_1순위신용체크구분', '_2순위카드이용금액', '_2순위카드이용건수', '_2순위신용체크구분', '최종유효년월_신용_이용가능', '최종유효년월_신용_이용', '최종카드발급일자', '보유여부_해외겸용_본인', '이용가능여부_해외겸용_본인', '이용여부_3M_해외겸용_본인', '보유여부_해외겸용_신용_본인', '이용가능여부_해외겸용_신용_본인', '이용여부_3M_해외겸용_신용_본인', '연회비발생카드수_B0M', '연회비할인카드수_B0M', '기본연회비_B0M', '제휴연회비_B0M', '할인금액_기본연회비_B0M'

In [52]:
card_eda.get_constant_feature(drop = False)

[customer_train_07] 상수 feature 개수 (기준년월 제외): 9개
 - ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M']
[customer_train_08] 상수 feature 개수 (기준년월 제외): 9개
 - ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M']
[customer_train_09] 상수 feature 개수 (기준년월 제외): 9개
 - ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M']
[customer_train_10] 상수 feature 개수 (기준년월 제외): 9개
 - ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M']
[customer_train_11] 상수 feature 개수 (기준년월 제외): 9개
 - ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제

In [53]:
# balance, channel, marketing, performance
card_eda.print_unique()

customer_train_07
               n_unique    value_counts
이용카드수_체크_가족           1     {0: 400000}
이용금액_R3M_체크_가족        1     {0: 400000}
연회비할인카드수_B0M          1     {0: 400000}
할인금액_기본연회비_B0M        1     {0: 400000}
할인금액_제휴연회비_B0M        1     {0: 400000}
상품관련면제카드수_B0M         1  {'0개': 400000}
임직원면제카드수_B0M          1  {'0개': 400000}
우수회원면제카드수_B0M         1  {'0개': 400000}
기타면제카드수_B0M           1  {'0개': 400000}
customer_train_08
               n_unique    value_counts
이용카드수_체크_가족           1     {0: 400000}
이용금액_R3M_체크_가족        1     {0: 400000}
연회비할인카드수_B0M          1     {0: 400000}
할인금액_기본연회비_B0M        1     {0: 400000}
할인금액_제휴연회비_B0M        1     {0: 400000}
상품관련면제카드수_B0M         1  {'0개': 400000}
임직원면제카드수_B0M          1  {'0개': 400000}
우수회원면제카드수_B0M         1  {'0개': 400000}
기타면제카드수_B0M           1  {'0개': 400000}
customer_train_09
               n_unique    value_counts
이용카드수_체크_가족           1     {0: 400000}
이용금액_R3M_체크_가족        1     {0: 400000}
연회비할인카드수_B0M          1   

In [13]:
def get_unique_info_for_df(series):
    value_counts = series.value_counts(dropna=False)
    return pd.Series({
        'n_unique': series.nunique(dropna=False),
        'value_counts': value_counts.to_dict()
    })

for df_name in card_eda.
    result_df = card_eda.card_container.train_data['customer_train_07'][
        ['이용카드수_체크_가족', '이용금액_R3M_체크_가족', '연회비할인카드수_B0M', '할인금액_기본연회비_B0M', '할인금액_제휴연회비_B0M', '상품관련면제카드수_B0M', '임직원면제카드수_B0M', '우수회원면제카드수_B0M', '기타면제카드수_B0M']].apply(get_unique_info_for_df).T
    print(result_df)

               n_unique    value_counts
이용카드수_체크_가족           1     {0: 400000}
이용금액_R3M_체크_가족        1     {0: 400000}
연회비할인카드수_B0M          1     {0: 400000}
할인금액_기본연회비_B0M        1     {0: 400000}
할인금액_제휴연회비_B0M        1     {0: 400000}
상품관련면제카드수_B0M         1  {'0개': 400000}
임직원면제카드수_B0M          1  {'0개': 400000}
우수회원면제카드수_B0M         1  {'0개': 400000}
기타면제카드수_B0M           1  {'0개': 400000}


In [69]:
card_eda.get_constant_feature(dfs='train', drop=False)

[customer_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[credit_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[sales_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[billing_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[balance_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[channel_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[marketing_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음
[performance_train_07] 상수 feature 개수 (기준년월 제외): 0개
 - 없음


In [73]:
card_eda.get_date_features(dfs='train', drop=False)

[customer_train_07] 날짜형 feature 개수: 2개
 - ['입회일자_신용', '최종카드발급일자']
[credit_train_07] 날짜형 feature 개수: 1개
 - ['RV신청일자']
[sales_train_07] 날짜형 feature 개수: 8개
 - ['최종이용일자_기본', '최종이용일자_신판', '최종이용일자_CA', '최종이용일자_카드론', '최종이용일자_체크', '최종이용일자_일시불', '최종이용일자_할부', '최종카드론_대출일자']
[billing_train_07] 날짜형 feature 개수: 0개
 - 없음
[balance_train_07] 날짜형 feature 개수: 1개
 - ['연체일자_B0M']
[channel_train_07] 날짜형 feature 개수: 0개
 - 없음
[marketing_train_07] 날짜형 feature 개수: 0개
 - 없음
[performance_train_07] 날짜형 feature 개수: 0개
 - 없음


In [74]:
card_eda.get_high_single_value_features(dfs='train', threshold=0.95, drop=False)

[customer_train_07] 95% 이상 결측/단일값 feature 개수: 17개
 - ['기준년월', '회원여부_이용가능', '소지여부_신용', '회원여부_연체', '탈회횟수_발급6개월이내', '유효카드수_신용_가족', '유효카드수_체크_가족', '이용가능카드수_신용_가족', '이용가능카드수_체크_가족', '이용카드수_신용_가족', '이용금액_R3M_신용_가족', '_1순위신용체크구분', '연회비발생카드수_B0M', '기본연회비_B0M', '제휴연회비_B0M', '청구금액_기본연회비_B0M', '청구금액_제휴연회비_B0M']
[credit_train_07] 95% 이상 결측/단일값 feature 개수: 13개
 - ['기준년월', '자발한도감액횟수_R12M', '자발한도감액금액_R12M', '자발한도감액후경과월', '특별한도보유여부_R3M', '연체감액여부_R3M', '한도심사요청건수', '한도요청거절건수', '한도심사요청후경과월', '한도심사거절후경과월', '시장단기연체여부_R6M', '시장단기연체여부_R3M', '시장연체상환여부_R6M']
[sales_train_07] 95% 이상 결측/단일값 feature 개수: 106개
 - ['기준년월', '이용건수_할부_유이자_B0M', '이용건수_CA_B0M', '이용건수_카드론_B0M', '이용금액_할부_유이자_B0M', '이용금액_CA_B0M', '이용금액_카드론_B0M', '이용후경과월_부분무이자', '이용건수_부분무이자_R12M', '이용금액_부분무이자_R12M', '최대이용금액_부분무이자_R12M', '이용개월수_부분무이자_R12M', '이용건수_부분무이자_R6M', '이용건수_카드론_R6M', '이용금액_부분무이자_R6M', '이용금액_카드론_R6M', '이용개월수_부분무이자_R6M', '이용개월수_카드론_R6M', '이용건수_부분무이자_R3M', '이용건수_카드론_R3M', '이용금액_카드론_R3M', '이용개월수_부분무이자_R3M', '이용개월수_카드론_R3M', '교통_통행료이용금액', '

In [12]:
result_df

Unnamed: 0,n_unique,value_counts
이용카드수_체크_가족,1,{0: 400000}
이용금액_R3M_체크_가족,1,{0: 400000}
연회비할인카드수_B0M,1,{0: 400000}
할인금액_기본연회비_B0M,1,{0: 400000}
할인금액_제휴연회비_B0M,1,{0: 400000}
상품관련면제카드수_B0M,1,{'0개': 400000}
임직원면제카드수_B0M,1,{'0개': 400000}
우수회원면제카드수_B0M,1,{'0개': 400000}
기타면제카드수_B0M,1,{'0개': 400000}
