In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

dir = "/gdrive/MyDrive/"

In [3]:
def optimize_memory_usage(df):
    """칼럼별로 데이터 타입을 변환시켜 메모리 사용량을 줄여주기 위한 함수
    """
    before_mem_usage = df.memory_usage().sum() / 1024 ** 2 # byte 단위에서 mb 단위로 변환
    print(f"Memory Usage - {before_mem_usage:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            col_min = df[col].min()
            col_max = df[col].max()
            if str(col_type)[:3] == "int": # type이 int인 경우
                if col_min > np.iinfo(np.int8).min and col_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)

                elif col_min > np.iinfo(np.int16).min and col_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)

                elif col_min > np.iinfo(np.int32).min and col_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)

                elif col_min > np.iinfo(np.int64).min and col_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  

            else: # type이 float인 경우
                if col_min > np.finfo(np.float16).min and col_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                    # 이 경우는 parquet을 사용하기 위해 float32 포멧으로 지정해줌 (parquet은 float32이상을 지원)

                elif col_min > np.finfo(np.float32).min and col_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)

                else:
                    df[col] = df[col].astype(np.float64)

        else: # type이 object인 경우
            try:
                df[col] = pd.to_datetime(df[col])
            except:
                df[col] = df[col].astype('category')

    after_mem_usage = df.memory_usage().sum() / 1024 ** 2
    print(f'Memory Usage Optimized - {after_mem_usage:.2f} MB')
    print(f'Reduced about {(before_mem_usage - after_mem_usage) / before_mem_usage * 100:.2f} %')

    return df

In [4]:
%%time

data1 = pd.read_csv(dir + "loan_result.csv")
data1.head()

CPU times: user 12.6 s, sys: 1.98 s, total: 14.6 s
Wall time: 22.9 s


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,


In [5]:
data1 = optimize_memory_usage(data1)

Memory Usage - 722.44 MB
Memory Usage Optimized - 348.32 MB
Reduced about 51.79 %


In [6]:
data1.head()

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,


In [7]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13527363 entries, 0 to 13527362
Data columns (total 7 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   application_id         int32         
 1   loanapply_insert_time  datetime64[ns]
 2   bank_id                int8          
 3   product_id             int16         
 4   loan_limit             float32       
 5   loan_rate              float32       
 6   is_applied             float32       
dtypes: datetime64[ns](1), float32(3), int16(1), int32(1), int8(1)
memory usage: 348.3 MB


In [51]:
#data.to_parquet(dir + 'loan_result.parquet', index=False)

In [8]:
%%time

data2 = pd.read_csv(dir + 'user_spec.csv')
data2.head()

CPU times: user 3.67 s, sys: 814 ms, total: 4.49 s
Wall time: 4.9 s


Unnamed: 0,application_id,user_id,birth_year,gender,insert_time,credit_score,yearly_income,income_type,company_enter_month,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt
0,1249046,118218,1985.0,1.0,2022-06-07 06:28:18,660.0,108000000.0,PRIVATEBUSINESS,20151101.0,기타,자가,1000000.0,기타,0.0,,4.0,162000000.0
1,954900,553686,1968.0,1.0,2022-06-07 14:29:03,870.0,30000000.0,PRIVATEBUSINESS,20070201.0,정규직,기타가족소유,30000000.0,대환대출,0.0,,1.0,27000000.0
2,137274,59516,1997.0,1.0,2022-06-07 21:40:22,710.0,30000000.0,FREELANCER,20210901.0,기타,기타가족소유,10000000.0,생활비,0.0,,5.0,15000000.0
3,1570936,167320,1989.0,1.0,2022-06-07 09:40:27,820.0,62000000.0,EARNEDINCOME,20170101.0,정규직,자가,2000000.0,생활비,0.0,,7.0,344000000.0
4,967833,33400,2000.0,1.0,2022-06-07 08:55:07,630.0,36000000.0,EARNEDINCOME,20210901.0,정규직,기타가족소유,5000000.0,생활비,0.0,0.0,1.0,16000000.0


In [9]:
data2 = optimize_memory_usage(data2)
data2.head()

Memory Usage - 180.83 MB
Memory Usage Optimized - 79.78 MB
Reduced about 55.88 %


Unnamed: 0,application_id,user_id,birth_year,gender,insert_time,credit_score,yearly_income,income_type,company_enter_month,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt
0,1249046,118218,1985.0,1.0,2022-06-07 06:28:18,660.0,108000000.0,PRIVATEBUSINESS,20151100.0,기타,자가,1000000.0,기타,0.0,,4.0,162000000.0
1,954900,553686,1968.0,1.0,2022-06-07 14:29:03,870.0,30000000.0,PRIVATEBUSINESS,20070200.0,정규직,기타가족소유,30000000.0,대환대출,0.0,,1.0,27000000.0
2,137274,59516,1997.0,1.0,2022-06-07 21:40:22,710.0,30000000.0,FREELANCER,20210900.0,기타,기타가족소유,10000000.0,생활비,0.0,,5.0,15000000.0
3,1570936,167320,1989.0,1.0,2022-06-07 09:40:27,820.0,62000000.0,EARNEDINCOME,20170100.0,정규직,자가,2000000.0,생활비,0.0,,7.0,344000000.0
4,967833,33400,2000.0,1.0,2022-06-07 08:55:07,630.0,36000000.0,EARNEDINCOME,20210900.0,정규직,기타가족소유,5000000.0,생활비,0.0,0.0,1.0,16000000.0


In [10]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1394216 entries, 0 to 1394215
Data columns (total 17 columns):
 #   Column                               Non-Null Count    Dtype         
---  ------                               --------------    -----         
 0   application_id                       1394216 non-null  int32         
 1   user_id                              1394216 non-null  int32         
 2   birth_year                           1381255 non-null  float32       
 3   gender                               1381255 non-null  float32       
 4   insert_time                          1394216 non-null  datetime64[ns]
 5   credit_score                         1289101 non-null  float32       
 6   yearly_income                        1394126 non-null  float32       
 7   income_type                          1394131 non-null  category      
 8   company_enter_month                  1222456 non-null  float32       
 9   employment_type                      1394131 non-null  ca

In [59]:
#data2.to_parquet(dir + 'user_spec.parquet', index=False)

In [11]:
%%time

data3 = pd.read_csv(dir + 'log_data.csv')
data3.head()

CPU times: user 20 s, sys: 3.57 s, total: 23.6 s
Wall time: 28.8 s


Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,Android,3.8.4,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,iOS,3.6.1,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,iOS,3.6.1,2022-03-25


In [12]:
data3 = optimize_memory_usage(data3)

Memory Usage - 816.83 MB
Memory Usage Optimized - 408.43 MB
Reduced about 50.00 %


In [13]:
data3.head()

Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,Android,3.8.4,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,iOS,3.6.1,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,iOS,3.6.1,2022-03-25


In [14]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17843993 entries, 0 to 17843992
Data columns (total 6 columns):
 #   Column          Dtype         
---  ------          -----         
 0   user_id         int32         
 1   event           category      
 2   timestamp       datetime64[ns]
 3   mp_os           category      
 4   mp_app_version  category      
 5   date_cd         datetime64[ns]
dtypes: category(3), datetime64[ns](2), int32(1)
memory usage: 408.4 MB


In [66]:
#data3.to_parquet(dir + 'log_data.parquet', index=False)

In [67]:
%%time

test = pd.read_parquet(dir + 'log_data.parquet')
test.head()

CPU times: user 2.08 s, sys: 1.59 s, total: 3.66 s
Wall time: 2.46 s


Unnamed: 0,user_id,event,timestamp,mp_os,mp_app_version,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,Android,3.8.2,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,Android,3.8.4,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,iOS,3.6.1,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,iOS,3.6.1,2022-03-25


In [69]:
%%time

test = pd.read_parquet(dir + 'loan_result.parquet')
test.head()

CPU times: user 903 ms, sys: 752 ms, total: 1.65 s
Wall time: 1.46 s


Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied
0,1748340,2022-06-07 13:05:41,7,191,42000000.0,13.6,
1,1748340,2022-06-07 13:05:41,25,169,24000000.0,17.9,
2,1748340,2022-06-07 13:05:41,2,7,24000000.0,18.5,
3,1748340,2022-06-07 13:05:41,4,268,29000000.0,10.8,
4,1748340,2022-06-07 13:05:41,11,118,5000000.0,16.4,


In [70]:
%%time

test = pd.read_parquet(dir + 'user_spec.parquet')
test.head()

CPU times: user 358 ms, sys: 200 ms, total: 558 ms
Wall time: 459 ms


Unnamed: 0,application_id,user_id,birth_year,gender,insert_time,credit_score,yearly_income,income_type,company_enter_month,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt
0,1249046,118218,1985.0,1.0,2022-06-07 06:28:18,660.0,108000000.0,PRIVATEBUSINESS,20151100.0,기타,자가,1000000.0,기타,0.0,,4.0,162000000.0
1,954900,553686,1968.0,1.0,2022-06-07 14:29:03,870.0,30000000.0,PRIVATEBUSINESS,20070200.0,정규직,기타가족소유,30000000.0,대환대출,0.0,,1.0,27000000.0
2,137274,59516,1997.0,1.0,2022-06-07 21:40:22,710.0,30000000.0,FREELANCER,20210900.0,기타,기타가족소유,10000000.0,생활비,0.0,,5.0,15000000.0
3,1570936,167320,1989.0,1.0,2022-06-07 09:40:27,820.0,62000000.0,EARNEDINCOME,20170100.0,정규직,자가,2000000.0,생활비,0.0,,7.0,344000000.0
4,967833,33400,2000.0,1.0,2022-06-07 08:55:07,630.0,36000000.0,EARNEDINCOME,20210900.0,정규직,기타가족소유,5000000.0,생활비,0.0,0.0,1.0,16000000.0
