# Pandas
> - 데이터 과학자를 위해 **테이블형태**로 데이터를 다룰 수 있게 해주는 패키지(python용 엑셀)
> - 기존 데이터처리 라이브러리인 numpy 대신 주로 사용
> - 일반인이 데이터분석을 접하기 쉽게 만들어준 결정적인 라이브러리
> - pandas만으로도 충분히 데이터 분석이 가능할 정도로 고수준의 함수들을 내장
> - 앞으로 진행하는 데이터분석 과정에서 주로 사용하게 될 데이터구조

## pandas 설치 및 import
    
> 콘솔창에서 실행 시  
**`pip install pandas`**  
**`conda install pandas`**
    
> 주피터 노트북으로 실행 시  
**`!pip install pandas`**
    
아나콘다 환경으로 python 환경설정 시 기본적으로 설치가 되어있음

In [None]:
# pandas 설치
# !pip install pandas

In [1]:
# numpy import
import numpy as np

# pandas import
import pandas as pd
# pd라는 닉네임은 많은 파이썬 유저들이 사용하고 있는 닉네임, 분석을 위한 필수는 아니지만 되도록이면 위와 같이 사용을 해줍시다.

pd.options.display.max_columns = 200
# 불러들이는 데이터에 맞춰 모든 컬럼을 확인 가능하도록 옵션값을 주었습니다.
pd.options.display.max_info_columns =200
# 그냥 실행 시키시고 지금 이해 못하셔도 좋습니다.

## DataFrame
> - 엑셀에 익숙한 사용자를 위해 제작 된 **테이블형태의 데이터 구조**  
> - 다양한 형태의 데이터를 받아 사용할 수 있으며 다양한 **통계, 시각화 함수를 제공**한다.  

실제 데이터를 불러들이고 값을 확인 해 보며 기본적인 pandas 사용법을 익혀보도록 하겠습니다.

### 데이터 불러오기
pandas는 다양한 데이터 파일 형태를 지원하며 주로 csv, xlsx, sql, json을 사용합니다.
    
> **`read_csv()`**  
**`read_excel()`**  
**`read_sql()`**  
**`read_json()`**  
**`json_normalize()`**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# DataFrame 의 약자로서 형식적으로 df 변수명을 사용한다.
# pandas패키지의 read_csv() 함수를 사용하여 loan1.csv 파일을 불러들여 데이터프레임을 만들고 df 이름의 변수로 저장
df = pd.read_csv('/content/drive/MyDrive/이어드림/advance/data/loan1.csv')
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.81,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.53,6391.53,4909.13,1482.4,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [14]:
df1= pd.read_excel('/content/drive/MyDrive/이어드림/advance/data/loan1.xlsx')

In [15]:
df1.head()

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0,Jul-2007,0,34.0,,8,0,5225,73.6,30,w,6442.28,6442.28,4493.81,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,173110,0,2,0,2,23.0,12496,39.0,0,0,3949,45.0,7100,1,0,0,2,21639.0,1875.0,73.6,0,0,125.0,78,26,23,3,26.0,,21.0,,0,2,2,4,4,21,4,5,2,8,0.0,0,0,0,96.4,25.0,0,0,196130,17756,7100,31992,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0,Jun-2002,0,39.0,28.0,12,3,6953,51.9,38,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,339028,0,4,0,3,22.0,76501,69.0,1,2,1628,65.0,13400,1,5,1,5,28252.0,808.0,82.4,0,0,164.0,186,7,7,2,7.0,39.0,7.0,39.0,0,4,7,4,10,19,7,17,7,12,0.0,0,0,1,97.3,75.0,0,3,416685,83454,4600,110595,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,2,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0,Aug-2002,0,44.0,,4,0,3401,97.2,12,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0,44.0,1,Individual,,,,0,0,51673,1,3,1,1,5.0,48272,48.0,0,0,3401,53.0,3500,2,0,3,1,12918.0,99.0,97.2,0,0,135.0,184,54,5,0,54.0,44.0,5.0,44.0,2,1,1,1,4,6,1,6,1,4,0.0,0,0,1,83.3,100.0,0,0,82176,51673,3500,78676,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0,Apr-2003,0,74.0,,8,0,12918,59.5,16,w,9090.87,9090.87,6391.53,6391.53,4909.13,1482.4,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0,74.0,1,Individual,,,,0,457,29103,1,1,1,2,4.0,16185,95.0,1,5,10153,75.0,21700,2,6,0,7,3638.0,7265.0,61.6,0,0,36.0,176,7,4,0,7.0,,16.0,74.0,1,2,4,3,4,4,7,12,4,8,0.0,0,0,2,93.8,33.3,0,0,38704,29103,18900,17004,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,4,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0,Jun-1999,0,,,5,0,4497,91.8,6,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0,,1,Individual,,,,0,471,7202,0,1,0,0,71.0,2705,12.0,0,0,1483,27.0,4900,0,1,0,0,1440.0,403.0,91.8,0,0,149.0,222,43,43,0,43.0,,,,0,4,4,4,4,2,4,4,4,5,0.0,0,0,0,100.0,100.0,0,0,26841,7202,4900,21941,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [10]:
# 만약 모듈을 찾을 수 없는 오류가 발생한다면 추가 모듈 설치
# 필요 모듈 import
!pip install xlrd
!pip install openpyxl
!pip install pyxlsb

Collecting pyxlsb
  Downloading pyxlsb-1.0.9-py2.py3-none-any.whl (23 kB)
Installing collected packages: pyxlsb
Successfully installed pyxlsb-1.0.9


In [13]:
# 엑셀파일에 시트에 따라 데이터 구분이 지어진 경우 시트별로 데이터프레임 제작 가능
# 다른 엑셀파일형식을 가져올 때 engine파라메터 추가해주시면 됩니다.
df1 = pd.read_excel('/content/drive/MyDrive/이어드림/advance/data/loan1.xlsx', 
                    sheet_name='구매영수증상세+상품마스터포함', 
                    engine='pyxlsb',
                    encoding='utf-8') # 윈도우의 경우 cp949

TypeError: ignored

In [None]:
# 데이터프레임 확인


In [None]:
# 참고! 실습은 하지 않습니다만 쿼리를 사용하여 데이터베이스로부터 데이터프레임을 만드는 것도 가능합니다.
# 데이터베이스로 부터 자료 읽기

# 필요한 모듈 추가 설치 - 각 데이터베이스 별로 다릅니다.
# !pip install pymysql

# sql 모듈 로드하기
# import pymysql
# mysql, mariadb, sqlite, postgresql, ms-sql, oracle, mongodb

# 접속하기
# 접속방법 또한 DB 종류에 따라 다릅니다.
# con = pymysql.connect(host='db서버주소', port=3306, user='id', passwd='pwd', db='dbname')

# query 만들기
# query = 'select * from samples'

# 자료 불러오기
# data = pd.read_sql(query, con=con)

### 데이터 저장하기
불러들인 혹은 작업을 마친 데이터프레임을 다양한 파일형태로 저장이 가능합니다.
    
> **`to_csv()`**  
**`to_excel()`**  
**`to_sql()`**

In [19]:
# index=False 파라메터는 기존 데이터프레임의 인덱스를 무시하고 저장
# 경로 : 주피터 노트북으로 실행시 hdd에 저장
df.to_csv('save_test.csv.gz', index=False) # . gz용량을 줄여서 저장

In [20]:
df_test = pd.read_csv('save_test.csv.gz')

In [21]:
df_test.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.81,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.53,6391.53,4909.13,1482.4,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


### 사용 데이터 간략 설명
> 미국 핀테크 회사인 lending club의 대출 데이터베이스  
클라우드펀딩과 대출을 결합한 핀테크의 시초라고 부를 수 있는 회사  
방대한 양의 대출정보를 공개하면서 금융정보분석에도 기여한 공이 큰 데이터  
2007 ~ 2015 년 대출정보 및 개인정보를 담고 있음  
226만건, 145항목 정보를 담고있음  
실습데이터는 이 중 4만건을 추출한 데이터를 사용합니다.  

데이터출처: https://www.kaggle.com/wordsforthewise/lending-club

### 데이터 살펴보기

In [23]:
# 데이터를 불러들인 후 가장 처음 하는 작업
# 데이터의 구조, 형태 파악하기
# 데이터의 첫 5개 데이터 하나(샘플, 인스턴스) 확인하기
'''
loan_amnt= 총 대출 금액  
funded_amnt= 갚은 금액   
term = 만기 일자   
int_rate = 이자율   
grade = 신용등급   
emp_title= 직업명   
emp_length= 근속년수   
home_ownership = 주택 보유 여부    
issue_d= 발행 날짜    
loan_status = 갚고 있는지 완료되었는지
'''
df.head()
# 10개를 확인하려면?
df.head(10)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.81,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.53,6391.53,4909.13,1482.4,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
5,,,36000,36000,36000.0,60 months,14.08,839.16,C,C3,NDT III,10+ years,RENT,74000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,920xx,CA,21.46,0.0,Apr-2001,1.0,39.0,,5.0,0.0,22328,83.3,19.0,w,29698.8,29698.8,11691.92,11691.92,6301.2,5390.72,0.0,0.0,0.0,Feb-2019,839.16,Mar-2019,Feb-2019,0.0,58.0,1,Individual,,,,0.0,0.0,38082.0,0.0,2.0,0.0,0.0,29.0,15754.0,46.0,0.0,0.0,18310.0,62.0,26800.0,0.0,3.0,2.0,0.0,7616.0,1672.0,93.0,0.0,0.0,158.0,200.0,65.0,29.0,0.0,70.0,63.0,4.0,58.0,1.0,2.0,2.0,2.0,6.0,9.0,3.0,10.0,2.0,5.0,0.0,0.0,0.0,0.0,83.3,100.0,0.0,0.0,61280.0,38082.0,24000.0,34480.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
6,,,20000,20000,20000.0,36 months,9.93,644.69,B,B2,Supervisory program analyst,10+ years,MORTGAGE,140000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,750xx,TX,7.76,2.0,May-1988,2.0,,,9.0,0.0,21374,62.0,24.0,w,12933.83,12933.83,9035.82,9035.82,7066.17,1937.42,32.23,0.0,0.0,Feb-2019,644.69,Mar-2019,Feb-2019,0.0,17.0,1,Individual,,,,0.0,0.0,159398.0,2.0,1.0,0.0,1.0,16.0,19645.0,86.0,4.0,6.0,18596.0,62.0,34700.0,0.0,0.0,4.0,8.0,17710.0,10933.0,63.0,0.0,0.0,136.0,355.0,3.0,3.0,1.0,3.0,,3.0,,1.0,3.0,7.0,5.0,7.0,11.0,7.0,12.0,4.0,9.0,,0.0,2.0,5.0,71.0,33.3,0.0,0.0,179419.0,41019.0,32800.0,22835.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
7,,,3200,3200,3200.0,36 months,10.91,104.63,B,B4,Commercial Property Manager,< 1 year,MORTGAGE,48000.0,Not Verified,Dec-2017,Current,n,,,other,Other,936xx,CA,16.66,0.0,Mar-2006,0.0,,,9.0,0.0,12748,91.7,11.0,w,2077.66,2077.66,1472.58,1472.58,1122.34,350.24,0.0,0.0,0.0,Feb-2019,104.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,188.0,190238.0,0.0,1.0,0.0,1.0,15.0,13772.0,87.0,1.0,2.0,3987.0,89.0,13900.0,0.0,0.0,0.0,3.0,21138.0,1084.0,91.0,0.0,0.0,15.0,141.0,7.0,7.0,2.0,7.0,,,,0.0,6.0,7.0,6.0,7.0,1.0,7.0,8.0,7.0,9.0,0.0,0.0,0.0,1.0,100.0,83.3,0.0,0.0,211525.0,26520.0,12000.0,15908.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
8,,,14500,14500,14500.0,36 months,16.02,509.93,C,C5,Equipment technician,< 1 year,MORTGAGE,38000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,765xx,TX,14.28,0.0,Oct-2008,0.0,,,6.0,0.0,1381,21.6,7.0,w,9613.67,9613.67,7201.93,7201.93,4886.33,2315.6,0.0,0.0,0.0,Feb-2019,509.93,Mar-2019,Feb-2019,0.0,,1,Joint App,61000.0,24.2,Not Verified,0.0,0.0,138605.0,1.0,1.0,1.0,1.0,7.0,22122.0,93.0,1.0,2.0,1381.0,78.0,6400.0,2.0,1.0,4.0,4.0,23101.0,1119.0,55.2,0.0,0.0,110.0,61.0,3.0,3.0,1.0,3.0,,7.0,,0.0,1.0,1.0,2.0,2.0,2.0,4.0,4.0,1.0,6.0,0.0,0.0,0.0,2.0,100.0,50.0,0.0,0.0,147563.0,23503.0,2500.0,23691.0,13179.0,Aug-2009,0.0,1.0,8.0,85.4,4.0,4.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
9,,,35000,35000,35000.0,36 months,6.08,1066.04,A,A2,,,OWN,76000.0,Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,388xx,MS,34.28,0.0,Aug-1994,1.0,,,23.0,0.0,47145,48.0,42.0,w,22139.93,22139.93,14900.92,14900.92,12860.07,2040.85,0.0,0.0,0.0,Feb-2019,1066.04,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,170361.0,1.0,2.0,0.0,0.0,26.0,28292.0,64.0,2.0,2.0,11128.0,53.0,98200.0,0.0,4.0,1.0,2.0,7744.0,27309.0,63.2,0.0,0.0,148.0,280.0,6.0,6.0,4.0,6.0,,6.0,,0.0,13.0,14.0,14.0,16.0,9.0,20.0,29.0,14.0,23.0,0.0,0.0,0.0,2.0,100.0,35.7,0.0,0.0,252224.0,75437.0,74300.0,44553.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,


In [24]:
# 데이터의 마지막 5개 샘플 확인하기
# 데이터가 잘 가져왔는지 확인 할 때 보통 씁니다.
df.tail()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
19995,,,10800,10800,10800.0,36 months,9.44,345.66,B,B1,Partner,3 years,MORTGAGE,240000.0,Not Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,370xx,TN,9.25,0.0,Oct-2005,1.0,42.0,,6.0,0.0,128754,68.4,10.0,w,6667.33,6667.33,5173.57,5173.57,4132.67,1040.9,0.0,0.0,0.0,Feb-2019,345.66,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,414761.0,1.0,2.0,1.0,1.0,2.0,74155.0,,1.0,1.0,911.0,68.0,152300.0,1.0,1.0,4.0,2.0,69127.0,726.0,68.4,0.0,0.0,145.0,107.0,10.0,2.0,2.0,89.0,42.0,0.0,42.0,0.0,2.0,3.0,2.0,2.0,5.0,3.0,3.0,3.0,6.0,0.0,0.0,0.0,2.0,77.8,0.0,0.0,0.0,469534.0,202909.0,2300.0,84548.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19996,,,14000,14000,14000.0,60 months,14.08,326.34,C,C3,Senior Property Manager,10+ years,OWN,53300.0,Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,481xx,MI,12.95,0.0,Mar-1987,1.0,27.0,,7.0,0.0,1536,30.7,17.0,w,11358.71,11358.71,4873.2,4873.2,2641.29,2231.91,0.0,0.0,0.0,Feb-2019,326.34,Mar-2019,Feb-2019,0.0,40.0,1,Individual,,,,0.0,0.0,85489.0,0.0,1.0,1.0,1.0,8.0,12118.0,84.0,0.0,1.0,0.0,71.0,5000.0,2.0,0.0,2.0,2.0,14248.0,,,0.0,0.0,140.0,368.0,20.0,8.0,1.0,,40.0,0.0,40.0,3.0,0.0,2.0,0.0,3.0,5.0,5.0,11.0,2.0,7.0,0.0,0.0,0.0,1.0,52.9,,0.0,0.0,105375.0,13654.0,0.0,14375.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19997,,,35000,35000,35000.0,60 months,12.62,789.57,C,C1,Mechanic engineer,7 years,RENT,85000.0,Source Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,030xx,NH,16.77,0.0,Jul-2002,2.0,64.0,,11.0,0.0,23115,43.6,29.0,w,0.0,0.0,38737.465857,38737.47,35000.0,3737.47,0.0,0.0,0.0,Oct-2018,31680.42,,Oct-2018,1.0,,1,Joint App,160000.0,18.86,Source Verified,0.0,468.0,47500.0,0.0,1.0,0.0,0.0,25.0,24385.0,,0.0,0.0,9076.0,44.0,49500.0,0.0,0.0,2.0,0.0,4750.0,24385.0,45.7,0.0,0.0,184.0,91.0,27.0,25.0,0.0,27.0,,0.0,,0.0,4.0,4.0,8.0,17.0,8.0,10.0,21.0,4.0,11.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,87549.0,47500.0,47500.0,38049.0,40609.0,Oct-2006,1.0,0.0,34.0,45.9,17.0,30.0,0.0,0.0,69.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19998,,,35225,35225,35225.0,60 months,19.03,914.34,D,D3,Teacher,8 years,MORTGAGE,42000.0,Source Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,532xx,WI,32.8,0.0,May-2003,0.0,65.0,,15.0,0.0,10432,60.0,30.0,w,29254.0,29254.0,13640.62,13640.62,5971.0,7669.62,0.0,0.0,0.0,Feb-2019,914.34,Mar-2019,Feb-2019,0.0,,1,Joint App,82000.0,25.02,Source Verified,0.0,0.0,162765.0,0.0,7.0,0.0,0.0,46.0,19208.0,35.0,0.0,0.0,4515.0,41.0,17400.0,0.0,0.0,0.0,0.0,11626.0,5315.0,65.5,0.0,0.0,169.0,174.0,52.0,40.0,1.0,89.0,,,65.0,0.0,3.0,4.0,4.0,7.0,10.0,6.0,17.0,4.0,15.0,0.0,0.0,0.0,0.0,96.4,50.0,0.0,0.0,213708.0,29838.0,15400.0,54560.0,29018.0,May-2004,0.0,1.0,11.0,56.0,1.0,11.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19999,,,3500,3500,3500.0,36 months,17.09,124.95,D,D1,Director of Design,10+ years,RENT,87000.0,Not Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,606xx,IL,20.72,0.0,Nov-1999,0.0,,,8.0,0.0,19505,79.3,11.0,f,2253.89,2253.89,1867.6,1867.6,1246.11,621.49,0.0,0.0,0.0,Feb-2019,124.95,Mar-2019,Feb-2019,0.0,,1,Joint App,137000.0,24.69,Not Verified,0.0,0.0,44460.0,0.0,3.0,1.0,3.0,7.0,17149.0,68.0,0.0,1.0,10383.0,77.0,24600.0,2.0,0.0,1.0,4.0,5558.0,2995.0,86.7,0.0,0.0,139.0,136.0,16.0,7.0,0.0,16.0,,7.0,,0.0,3.0,3.0,3.0,3.0,6.0,4.0,4.0,3.0,8.0,0.0,0.0,0.0,1.0,100.0,100.0,0.0,0.0,58057.0,44460.0,22500.0,25249.0,69960.0,Aug-1992,1.0,0.0,7.0,94.9,2.0,8.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [25]:
# 데이터의 갯수를 살펴봅니다
len(df)

20000

In [26]:
# 데이터의 전반적인 정보를 확인합니다.
df.info()
# dtype 정보에서는 각 컬럼별 데이터 타입을 확인 할 수 있습니다.
# object == str 이라고 생각하셔도 무방합니다.
# verbose, null_counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 145 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    id                                          0 non-null      float64
 1    member_id                                   0 non-null      float64
 2    loan_amnt                                   20000 non-null  int64  
 3    funded_amnt                                 20000 non-null  int64  
 4    funded_amnt_inv                             20000 non-null  float64
 5    term                                        20000 non-null  object 
 6    int_rate                                    20000 non-null  float64
 7    installment                                 20000 non-null  float64
 8    grade                                       20000 non-null  object 
 9    sub_grade                                   20000 non-null  object 
 1

In [27]:
# 데이터의 기초통계량을 확인합니다.
df.describe()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,url,desc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,0.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,0.0,0.0,19961.0,20000.0,20000.0,8992.0,2914.0,20000.0,20000.0,20000.0,19963.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,4862.0,20000.0,2792.0,2792.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,19242.0,20000.0,16740.0,20000.0,20000.0,20000.0,19994.0,20000.0,20000.0,20000.0,20000.0,20000.0,19997.0,19653.0,19646.0,20000.0,20000.0,19242.0,20000.0,20000.0,20000.0,20000.0,19672.0,4076.0,17495.0,5889.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,18819.0,20000.0,20000.0,20000.0,20000.0,19651.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,2792.0,2792.0,2792.0,2792.0,2745.0,2792.0,2792.0,2792.0,2792.0,998.0,64.0,64.0,64.0,64.0,50.0,64.0,64.0,61.0,61.0,61.0
mean,,,15382.56875,15382.56875,15373.741922,12.606765,446.811954,78429.26,,,19.214833,0.2464,0.4965,36.956962,77.992793,11.0579,0.16525,15415.68815,43.516676,22.1278,8377.290489,8372.905008,8031.681974,8026.753615,6232.746169,1757.637769,1.228706,40.06933,6.149841,2603.487688,0.02175,45.714726,1.0,121185.3,19.649828,0.00055,185.20045,141077.4,0.90875,2.6209,0.6365,1.5103,21.616204,33969.21395,67.57724,1.20755,2.5691,5528.2165,53.787686,36440.10545,1.0009,1.4222,1.9396,4.34895,14054.194879,14526.662291,49.069719,0.0088,11.2964,122.639487,179.0936,15.3664,8.74565,1.35425,25.995425,40.280913,7.177937,37.489896,0.4677,3.4482,5.10545,4.70355,6.9992,7.90935,7.79135,12.6573,5.029,11.0235,0.0,0.0005,0.072,1.97045,94.234615,32.322233,0.1297,0.03535,181022.8,49500.36145,25365.9079,44365.02,32759.315544,0.751791,1.413324,11.49033,59.74204,3.193768,12.423711,0.058381,0.09563,35.486974,3.0,198.6075,3.0,14.5625,582.3612,14786.18875,231.527187,8129.44459,53.103443,17.327869
std,,,10011.645757,10011.645757,10010.591629,4.929089,283.619373,83630.96,,,21.957008,0.834938,0.78238,22.055265,24.54357,5.687958,0.464277,22845.011859,25.409374,11.731788,8291.549106,8290.262906,6951.524721,6949.741126,6546.212512,1570.050467,9.534786,367.534805,59.740728,5953.233604,0.179105,21.840407,0.0,62371.47,8.079497,0.02549,1186.936786,168300.7,1.133088,2.845521,0.896944,1.533529,26.840505,43478.989027,24.003849,1.473052,2.503485,5358.375953,21.717709,37020.542365,1.451518,2.570676,2.273392,3.161263,18258.768688,19228.843119,29.377757,0.111907,725.679605,54.493694,100.822641,19.217498,9.712897,1.720931,34.820223,22.455427,5.94676,22.10069,1.379511,2.343905,3.300133,3.168765,4.472348,7.099426,4.70687,7.691516,3.145831,5.674677,0.0,0.02449,0.562612,1.840773,9.439437,35.301888,0.344505,0.299006,189996.7,51000.464205,25063.607841,47243.95,28041.003357,1.075274,1.698211,6.736632,25.905343,3.597651,8.045636,0.391363,0.445313,24.28697,0.0,146.832687,0.0,8.620527,414.623586,8707.436533,223.560399,5727.283541,10.40876,6.922719
min,,,1000.0,1000.0,1000.0,5.32,7.61,0.0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22700.0,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,10.54,3.0,0.0,31.62,1012.11,0.24,645.0,40.0,1.0
25%,,,8000.0,8000.0,7975.0,9.44,238.17,45000.0,,,10.85,0.0,0.0,19.0,63.0,7.0,0.0,4739.0,23.4,14.0,8.1825,8.1825,3590.55,3587.205,2302.92,617.76,0.0,0.0,0.0,268.09,0.0,29.0,1.0,82668.25,13.82,0.0,0.0,25257.75,0.0,1.0,0.0,0.0,7.0,7449.75,53.0,0.0,1.0,2014.0,39.0,14800.0,0.0,0.0,0.0,2.0,2850.0,2818.0,24.5,0.0,0.0,87.0,113.0,4.0,3.0,0.0,6.0,22.0,2.0,20.0,0.0,2.0,3.0,3.0,4.0,3.0,4.0,7.0,3.0,7.0,0.0,0.0,0.0,1.0,91.7,0.0,0.0,0.0,49756.75,18440.5,9000.0,14534.75,14405.0,0.0,0.0,7.0,41.5,1.0,7.0,0.0,0.0,14.0,3.0,82.605,3.0,7.0,249.9375,7370.455,52.7775,3542.96,45.0,14.0
50%,,,12500.0,12500.0,12500.0,11.99,369.0,65000.0,,,17.26,0.0,0.0,34.0,81.0,10.0,0.0,9867.5,41.5,20.0,6391.41,6391.4,5941.78,5938.74,3961.91,1265.8,0.0,0.0,0.0,466.2,0.0,46.0,1.0,109833.0,19.2,0.0,0.0,70028.5,1.0,2.0,0.0,1.0,14.0,21917.0,70.0,1.0,2.0,4167.0,55.0,26500.0,1.0,0.0,1.0,4.0,7038.0,7810.0,47.65,0.0,0.0,129.0,161.0,9.0,6.0,1.0,14.0,38.0,6.0,35.0,0.0,3.0,4.0,4.0,6.0,6.0,7.0,11.0,4.0,10.0,0.0,0.0,0.0,2.0,100.0,22.2,0.0,0.0,111175.5,35654.0,18100.0,32991.5,25584.0,0.0,1.0,10.0,62.1,2.0,11.0,0.0,0.0,33.5,3.0,147.76,3.0,15.5,478.32,13231.04,159.82,6422.0,50.0,18.0
75%,,,20156.25,20156.25,20156.25,15.05,605.67,95000.0,,,24.58,0.0,1.0,54.0,97.0,14.0,0.0,18336.0,62.0,28.0,12951.92,12950.1375,10284.0925,10278.1425,7315.54,2452.35,0.0,0.0,0.0,922.48,0.0,63.0,1.0,145000.0,25.3,0.0,0.0,211228.8,1.0,3.0,1.0,2.0,24.0,44218.25,85.0,2.0,4.0,7353.5,69.0,45900.0,1.0,2.0,3.0,6.0,19499.0,18626.0,74.0,0.0,0.0,153.0,231.0,19.0,11.0,2.0,30.0,57.0,11.0,52.0,0.0,5.0,7.0,6.0,9.0,10.0,10.0,16.0,7.0,14.0,0.0,0.0,0.0,3.0,100.0,50.0,0.0,0.0,260668.8,63291.75,33100.0,60072.25,42740.5,1.0,2.0,15.0,80.1,4.0,16.0,0.0,0.0,54.0,3.0,287.89,3.0,22.0,862.4325,21705.2425,332.5675,12162.66,60.0,24.0
max,,,40000.0,40000.0,40000.0,30.99,1618.03,6500031.0,,,999.0,36.0,5.0,150.0,120.0,56.0,18.0,629372.0,125.0,96.0,35250.64,35250.64,47101.212295,47101.21,40000.0,14105.79,279.15,9625.0,2040.0,40747.67,9.0,150.0,1.0,1058000.0,39.77,2.0,57638.0,2460868.0,11.0,35.0,6.0,16.0,374.0,827988.0,199.0,18.0,37.0,146863.0,160.0,667100.0,18.0,33.0,33.0,37.0,379822.0,281029.0,136.7,5.0,65000.0,501.0,785.0,279.0,197.0,23.0,539.0,150.0,24.0,150.0,31.0,24.0,33.0,38.0,54.0,67.0,53.0,90.0,30.0,56.0,0.0,2.0,36.0,19.0,100.0,100.0,4.0,18.0,2531600.0,897835.0,320000.0,1310923.0,324858.0,6.0,13.0,51.0,159.3,39.0,62.0,6.0,6.0,109.0,3.0,629.82,3.0,30.0,1889.46,32738.12,979.36,23000.0,94.33,24.0


In [28]:
# numpy 함수로 데이터 shape 확인
df.shape

(20000, 145)

In [29]:
# 인덱스
df.index

RangeIndex(start=0, stop=20000, step=1)

In [None]:
for idx in df.index:
    print(idx)

In [None]:
# 컬럼
for col in df.columns:
    print(col)

In [None]:
# 컬럼 고유값 출력

데이터셋을 살펴 본 결과 정체를 알 수 없는 많은 컬럼이 있는 걸 확인했고, 

50000개의 샘플이 불러들여진 것을 확인 할 수 있었습니다.

추가로 데이터 중간 중간 비어있는 값도 있는 것을 확인했습니다.

### 데이터접근 (인덱싱, 슬라이싱, 샘플링)

In [35]:
# 첫 샘플 혹은 레코드(대출건)에 대한 데이터를 살펴보겠습니다.
# 인덱스넘버로 데이터에 접근하는 .iloc[색인]
# 각 컬럼이나, 행단위 접근했을 때 출력되는 벡터 데이터를 Serise(시리즈) 라고 하는 자료구조
# index, values로 각각의 속성에 접근 가능
df.iloc[0] # 0번 인덱스 데이터
df.iloc[0].values

array([nan, nan, 10000, 10000, 10000.0, ' 36 months', 9.44, 320.05, 'B',
       'B1', 'mechanic', '6 years', 'MORTGAGE', 80000.0, 'Not Verified',
       'Dec-2017', 'Current', 'n', nan, nan, 'credit_card',
       'Credit card refinancing', '762xx', 'TX', 14.82, 0.0, 'Jul-2007',
       0.0, 34.0, nan, 8.0, 0.0, 5225, 73.6, 30.0, 'w', 6442.28, 6442.28,
       4493.81, 4493.81, 3557.72, 936.09, 0.0, 0.0, 0.0, 'Feb-2019',
       320.05, 'Mar-2019', 'Feb-2019', 0.0, nan, 1, 'Individual', nan,
       nan, nan, 0.0, 0.0, 173110.0, 0.0, 2.0, 0.0, 2.0, 23.0, 12496.0,
       39.0, 0.0, 0.0, 3949.0, 45.0, 7100.0, 1.0, 0.0, 0.0, 2.0, 21639.0,
       1875.0, 73.6, 0.0, 0.0, 125.0, 78.0, 26.0, 23.0, 3.0, 26.0, nan,
       21.0, nan, 0.0, 2.0, 2.0, 4.0, 4.0, 21.0, 4.0, 5.0, 2.0, 8.0, 0.0,
       0.0, 0.0, 0.0, 96.4, 25.0, 0.0, 0.0, 196130.0, 17756.0, 7100.0,
       31992.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       'N', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
 

In [37]:
# 10번 인덱스 부터 20번 인덱스 샘플 접근
# start, end+1, step
df.iloc[10:20:2]

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
10,,,40000,40000,40000.0,60 months,16.02,973.15,C,C5,IT Manager - Business Process,10+ years,OWN,140000.0,Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,292xx,SC,31.79,1.0,Aug-1999,0.0,3.0,,11.0,0.0,34632,59.5,52.0,w,33288.83,33288.83,13602.03,13602.03,6711.17,6890.86,0.0,0.0,0.0,Feb-2019,973.15,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,59.0,81346.0,0.0,3.0,0.0,0.0,29.0,46714.0,50.0,0.0,1.0,19100.0,53.0,104500.0,0.0,0.0,1.0,1.0,8135.0,63703.0,57.9,0.0,0.0,220.0,213.0,13.0,13.0,0.0,13.0,,9.0,3.0,0.0,2.0,6.0,2.0,17.0,22.0,8.0,30.0,6.0,11.0,0.0,0.0,0.0,0.0,98.0,0.0,0.0,0.0,197500.0,81346.0,88000.0,93000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
12,,,6025,6025,6025.0,36 months,17.09,215.08,D,D1,Account Manager,5 years,RENT,62000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,010xx,MA,14.65,0.0,Aug-2004,1.0,,,9.0,0.0,6363,37.4,15.0,w,750.33,750.33,6002.55,6002.55,5274.67,727.88,0.0,0.0,0.0,Feb-2019,215.08,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,578.0,21201.0,0.0,3.0,1.0,2.0,9.0,14838.0,57.0,1.0,3.0,1997.0,49.0,17000.0,2.0,0.0,3.0,5.0,2356.0,10637.0,37.4,0.0,0.0,157.0,160.0,8.0,8.0,0.0,8.0,,2.0,,0.0,5.0,5.0,5.0,5.0,8.0,5.0,7.0,5.0,9.0,0.0,0.0,0.0,2.0,100.0,40.0,0.0,0.0,42956.0,21201.0,17000.0,25956.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
14,,,10200,10200,10200.0,36 months,11.99,338.74,B,B5,Machine operator,4 years,RENT,35000.0,Source Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,635xx,MO,15.26,0.0,Dec-2008,1.0,,,6.0,0.0,4238,26.2,19.0,w,6662.23,6662.23,4745.71,4745.71,3537.77,1191.0,16.94,0.0,0.0,Feb-2019,338.74,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,21882.0,1.0,1.0,1.0,1.0,11.0,17644.0,90.0,1.0,4.0,2193.0,61.0,16200.0,0.0,0.0,5.0,5.0,3647.0,5493.0,31.3,0.0,0.0,108.0,26.0,4.0,4.0,0.0,14.0,,4.0,,0.0,2.0,3.0,3.0,3.0,14.0,5.0,5.0,3.0,6.0,0.0,0.0,0.0,2.0,100.0,33.3,0.0,0.0,35828.0,21882.0,8000.0,19628.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
16,,,23000,23000,23000.0,60 months,19.03,597.02,D,D3,Installation Manager,10+ years,MORTGAGE,75000.0,Source Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,585xx,ND,27.64,0.0,Dec-2002,1.0,43.0,,14.0,0.0,21795,56.6,22.0,w,19390.68,19390.68,8309.65,8309.65,3609.32,4700.33,0.0,0.0,0.0,Feb-2019,597.02,Mar-2019,Feb-2019,0.0,50.0,1,Individual,,,,0.0,0.0,165604.0,0.0,4.0,0.0,2.0,13.0,35227.0,56.0,1.0,2.0,7087.0,56.0,38500.0,2.0,3.0,1.0,4.0,12739.0,16705.0,56.6,0.0,0.0,179.0,157.0,11.0,11.0,4.0,11.0,,0.0,50.0,1.0,6.0,6.0,8.0,8.0,8.0,9.0,10.0,6.0,14.0,0.0,0.0,0.0,1.0,90.9,12.5,0.0,0.0,224502.0,57022.0,38500.0,63302.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
18,,,12000,12000,12000.0,60 months,12.62,270.71,C,C1,commercial sales,10+ years,OWN,40000.0,Source Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,394xx,MS,7.74,0.0,Nov-1986,0.0,,,3.0,0.0,5742,13.9,10.0,w,9832.57,9832.57,3773.11,3773.11,2167.43,1605.68,0.0,0.0,0.0,Feb-2019,270.71,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,383.0,5742.0,0.0,0.0,0.0,1.0,24.0,0.0,,0.0,0.0,5581.0,14.0,41300.0,0.0,0.0,0.0,1.0,1914.0,35558.0,13.9,0.0,0.0,137.0,372.0,52.0,24.0,0.0,52.0,,,,0.0,2.0,2.0,3.0,7.0,2.0,3.0,8.0,2.0,3.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,41300.0,5742.0,41300.0,0.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [38]:
# 컬럼 단위 샘플 접근
df['emp_title'] # << 딕셔너리 구조

# df[텍스트형태의 컬럼명]
# 인덱싱이나 슬라이싱으로 데이터에 접근을 할 때 큰 단위를 선택하고 그 결과에서 인덱싱 혹은 슬라이싱을 하면
# 조금 더 편하게, 쉽게 데이터 접근이 가능하다.

0                       mechanic
1                            NaN
2                   Truck driver
3         Confidential Secretary
4                General Manager
                  ...           
19995                    Partner
19996    Senior Property Manager
19997          Mechanic engineer
19998                    Teacher
19999         Director of Design
Name: emp_title, Length: 20000, dtype: object

In [39]:
# 여러 컬럼 동시 접근
df[['grade','sub_grade']]

Unnamed: 0,grade,sub_grade
0,B,B1
1,B,B3
2,C,C2
3,B,B4
4,C,C2
...,...,...
19995,B,B1
19996,C,C3
19997,C,C1
19998,D,D3


In [41]:
# row와 columns을 동시에 슬라이싱 하는 속성
# df.loc[인덱스, 컬럼명]
# 10~20까지 인덱스 접근

df.loc[10:20, ['grade','sub_grade']]

Unnamed: 0,grade,sub_grade
10,C,C5
11,C,C2
12,D,D1
13,B,B1
14,B,B5
15,C,C1
16,D,D3
17,D,D5
18,C,C1
19,D,D2


In [45]:
# df의 컬럼명을 순환하면서 컬럼단위로 접근하고 각 컬럼의 고유값을 출력해주는 코드
for col in df.columns:
    print(col,":",df[col].unique())

id : [nan]
member_id : [nan]
loan_amnt : [10000  3500  5000 14000 36000 20000  3200 14500 35000 40000 18000  6025
 10200 28000 23000 12000  1500 14400 25000 15500 11200  7200 21600 15600
 30000  6500  8000 22000  2000  3000  6000 13000  7800  5600 16000  4000
 10800 15000 13125  4150 32000  9000 11000  3600  8400  5500  3400  3800
 24000  6525 16800  1000 17000 26500 19000  6400 19750 21800 27000  9500
 17525 11500  7000  3025  9300  3300 33000  1400 21000  5400 26000 25525
  1300  9600  2500 10500  4800  1600 17500 19500  2200  7500  6600 19200
 11150  5300 28550 32900 11525  4500  7600 15300 34000  4400 37800 10550
 20800 24625 22725  6650 28150  2550 18500  6200 17600 13200  4600 24475
 18025 26450 27725  7625 12800 24875 16500  3550  4300 28850 15225  2975
 13675 16400 13600  5100 11075 14200 28675  2150  5375  2525  2400  8300
  6300 18400 16750 27200 38000  9800 11450 13925 21500 21200 15200 23800
 15575 20125 15450 18150  2800 22500 17450 30100 10600 23300 21250 13150
 28800  85

In [46]:
# 고윳값 갯수 출력
for col in df.columns:
    print(col,":",df[col].nunique())

id : 0
member_id : 0
loan_amnt : 921
funded_amnt : 921
funded_amnt_inv : 1020
term : 2
int_rate : 37
installment : 3920
grade : 7
sub_grade : 34
emp_title : 9427
emp_length : 11
home_ownership : 3
annual_inc : 2343
verification_status : 3
issue_d : 2
loan_status : 6
pymnt_plan : 2
url : 0
desc : 0
purpose : 12
title : 12
zip_code : 842
addr_state : 50
dti : 4287
delinq_2yrs : 19
earliest_cr_line : 555
inq_last_6mths : 6
mths_since_last_delinq : 109
mths_since_last_record : 120
open_acc : 49
pub_rec : 11
revol_bal : 14565
revol_util : 1027
total_acc : 89
initial_list_status : 2
out_prncp : 7215
out_prncp_inv : 7637
total_pymnt : 13971
total_pymnt_inv : 14304
total_rec_prncp : 8462
total_rec_int : 14257
total_rec_late_fee : 342
recoveries : 340
collection_recovery_fee : 343
last_pymnt_d : 15
last_pymnt_amnt : 7926
next_pymnt_d : 1
last_credit_pull_d : 16
collections_12_mths_ex_med : 7
mths_since_last_major_derog : 121
policy_code : 1
application_type : 2
annual_inc_joint : 888
dti_joint 

### 팬시인덱싱
> **`bool`** 형태의 array를 조건을 전달하여 다차원 배열을 인덱싱하는 방법.  
조건식을 사용하여 분석에 필요한 데이터샘플을 추출하기 용이합니다.

In [None]:
# 신용등급이 A인 샘플의 emp_title 확인


In [None]:
# 대출금액평균


In [None]:
# 조건식 샘플링 emp_title 이 ceo인 샘플들


In [None]:
# 신용등급 A와 B인 샘플접근

# 조건식을 여러개 써야 한다면 조건마다 ()로 감싸주시는 것이 좋습니다.

In [None]:
# df loan_amnt 컬럼값이 10000이상인 채권샘플의 grade


In [None]:
# 신용등급 A와 B인 샘플접근


In [None]:
# df loan_amnt 컬럼값이 10000 이상인 채권샘플의 grade


In [None]:
# df grade C 와 D 인 채권샘플 annual_inc 최대값인 인덱스 빼오기 (idxmax)
# 최대값 인덱스 빼와서 샘플까지 출력


In [None]:
# 컬럼 내 문자열 내에 우리가 찾고싶은 문자열이 포함되어 있는지를 기준으로 샘플링


## 데이터프레임 병합
> 실제 분석업무를 진행하다보면 데이터가 여기저기 분산되어 있을 경우가 더 많습니다.  
조각난 데이터를 분석에 필요한 데이터셋으로 만들기 위해 데이터프레임 병합을 많이 사용합니다.  
한개 이상의 데이터프레임을 병합 할 때 주로 사용하는 함수 2가지를 알아보겠습니다.    

### 데이터 병합에 사용가능한 key(병합할 기준이 되는 행 or 열)값이 있는경우
**`pd.merge`**(베이스데이터프레임, 병합할데이터프레임)  
> 사용 가능 한 파라메터
- `how` : 'left', 'right', 'inner', 'outer'
- `left_on` : key값이 다를 경우 베이스데이터프레임의 key 설정
- `right_on` : key값이 다를 경우 병합데이터프레임의 key 설정
    
### 단순 데이터 연결
**`pd.concat`**([베이스데이터프레임, 병합할데이터프레임], axis=0 or 1)
> 사용 가능 한 파라메터  
- `axis` : 축 방향 설정

### merge 예시

In [None]:
merge_df1 = pd.DataFrame({
    '이름': ['원영', '사쿠라', '유리', '예나', '유진', '나코', '은비', '혜원', '히토미', '채원', '민주', '째욘'],
    '국어': [100, 70, 70, 70, 60, 90, 90, 70, 70, 80, 100, 100],
    '영어': [100, 90, 80, 50, 70, 100, 70, 90, 100, 100, 80, 100]
    }, columns=['이름', '국어', '영어']) 

merge_df2 = pd.DataFrame({
    '일어': [80, 100, 100, 90, 70, 50, 100],
    '수학': [90, 70, 100, 80, 70, 80, 90],
    'name': ['원영', '사쿠라', '나코', '히토미', '예나', '은비', '째욘'],
    }, columns=['일어', '수학', 'name'])

In [None]:
# 데이터프레임 확인


In [None]:
# 병합 테스트


### concat 예시
현재 df에 저장되어있는 데이터에 추가로 2만개의 데이터를 이어붙여보겠습니다. df1이라는 변수에 이어붙일 데이터를 불러들여 병합을 진행해보겠습니다.

In [None]:
# df1 변수에 loan2.csv 파일을 읽어들입니다.


In [None]:
# 데이터프레임 확인


In [None]:
# df 와 df1 shape 확인


In [None]:
# 데이터프레임 행단위 병합


In [None]:
# 병합 데이터프레임 shape 확인


In [None]:
# 병합 데이터프레임 index 확인


## 인덱스 편집
방금 전 concat으로 병합한 데이터프레임의 이상한 점을 찾으셨나요?  
데이터 자체는 잘 붙였지만 인덱스가 꼬여있습니다. 인덱스 편집은 데이터분석을 위해 필요한 인덱스를 설정하기 위해 필요합니다.

In [None]:
# 인덱스리셋


In [None]:
# 기존 컬럼값을 취해 index로 사용


## 컬럼편집
인덱스편집과 마찬가지로 데이터프레임의 컬럼을 변경해야 할 경우도 있습니다. 데이터프레임은 컬럼단위 샘플링 및 인덱싱, 이름변경이 가능합니다.

### 컬럼선택

In [None]:
# df 컬럼명 접근


In [None]:
# columns 속성도 인덱싱 및 슬라이싱이 가능합니다.


저는 개인정보에 관한 부분에 관심이 많습니다. 데이터셋 중 필요한 부분만을 컬럼단위로 추려보겠습니다.

In [None]:
# df의 개인정보에 관한 컬럼만을 색인으로 df를 슬라이싱하고 person_df 변수에 할당


### 컬럼삭제
현재 데이터셋에는 개인식별정보가 지워져서 데이터가 존재하지 않습니다. 불필요한 데이터 column을 지우도록 하겠습니다.

In [None]:
# 지울 column의 데이터값이 모두 NaN인지 확인


삭제할 컬럼 모두 데이터가 없는 것을 확인했습니다.

In [None]:
# 컬럼 삭제 (drop, del, pop)




### 컬럼명 변경
    경우에 따라서는 데이터셋 제작 중 컬럼명을 변경해야 할 경우도 있습니다.
    국내 수집 데이터 사용 시 컬럼이 한글일 경우 영어로 변경을 많이 합니다.

In [None]:
# home_ownership을 간략하게 home으로 변경
# 한글도 가능합니다만 권장하지는 않습니다.


## 데이터 샘플링 및 분석
> 데이터병합, 인덱스편집, 컬럼선택만으로도 불필요한 정보를 삭제하고 새롭게 데이터셋을 만들 수 있는것을 확인했습니다.  
위에 학습한 내용도 데이터 샘플링에 속한 내용이지만 지금부터는 데이터셋의 데이터를 살펴보면서 의미있는 데이터를 추려보도록 하겠습니다.  
    
**데이터프레임의 기본적인 인덱싱, 슬라이싱, 조건부 샘플링을 조합하면 데이터의 샘플을 확인 하는 과정만으로도 데이터분석이 가능해집니다.**

In [None]:
# 분석에 필요한 데이터프레임을 만들었으니 원본값을 사용하겠습니다. 기존 df에 person_df 값을 덮어 씌웁니다.


In [None]:
# 분석에 필요한 데이터셋을 생성했다면 파일로도 저장 해둡시다.


### 저는 채권자의 개인정보에 관심이 많습니다. 고객의 직업을 살펴보겠습니다.

In [None]:
# emp_title 접근


In [None]:
# 값을 카운트 하는 함수 value_counts()


### 데이터프레임 형변환

In [None]:
# Owner, owner 같은 직업이지만 대소문자 구분에 따라 다른 값으로 취급되는 문제가 있네요.
# 대소문자 구분을 없애기 위해 모두 소문자로 데이터값을 변경하겠습니다.
# 소문자 변환 전 혹시모를 int, float 데이터가 있을지 모를 상황에 대비해서 모두 문자열로 변경해주겠습니다.
# 형변환 함수 astype(데이터타입)


In [None]:
%%time
# 반복문을 사용한 데이터 변경도 가능
# 하지만 파이썬의 강점을 살리지 못한 코드


### 배운사람들의 코드, 고오급 python 스킬
numpy를 학습하면서 브로드캐스팅에 관하여 잠깐 언급했었습니다. 그렇다면 그 파워풀하다던 브로드캐스팅은 어떻게 사용해야할까요?
    
>기타 언어에서는 지원하지 않는 기능이니만큼 파이썬의 특징을 가장 잘 살리는 코드  
**`apply`** 함수를 사용하여 인자로 받는 모든 데이터에 함수를 적용

#### apply 함수로 컬럼에 적용시키는 코드 구조
    df['컬럼명'] = df['컬럼명'].apply(lambda x: func(x) if 조건문)
    df['컬럼명'] = df['컬럼명'].apply(func_nm)

In [None]:
# 대문자 만드는 함수


In [None]:
# apply() 함수사용 반복이 가능한 데이터구조의 모든 인자에 적용
# lambda 각 인자에 적용할 함수 혹은 연산


In [None]:
# 대소문자 구분을 처리한 값 확인

# 기존 value_count 값과 차이가 있음을 확인 할 수 있습니다.
# 제공 된 데이터셋이라도 이와 같은 작은 차이가 있을 수 있습니다.
# 데이터를 꼼꼼하게 살펴볼 수록 디테일한 차이를 만들 수 있습니다.

In [None]:
# owner인 사람들 샘플링


In [None]:
# 샘플링 된 데이터프레임의 단일 컬럼 접근


In [None]:
# 컬럼 평균값 계산


In [None]:
# 코드 하나 변경으로 간단한 분석 가능
# owner가 아닌 사람들의 평균


## 데이터 재구조화

In [None]:
# 각 직업별 평균연봉이 궁금하다 groupby
# 엑셀의 pivol table 과 비슷한 기능


In [None]:
# pivot_table


## 결측치 처리
> 데이터 분석을 위해서는 데이터셋 내에 빈 값이 있는 경우 분석에 방해가 될 수 있는 여지가 많습니다.  
모든 결측치를 없애야 하는 것은 아니지만 되도록이면 결측치를 채우는 방법, 혹은 없애는 방법등으로 결측치를 처리합니다.  
몇가지 예시를 살펴보면서 결측치 처리에 대해 알아봅시다.

In [None]:
# info() 함수는 결측치에 대한 정보도 보여줍니다.
# 컬럼별 isnull() 함수를 사용해도 무방합니다.


    확인결과 emp_title, emp_length, dti에 결측치가 존재합니다.
    해당 컬럼의 결측치 샘플들을 살펴보고 결측치를 처리해 보겠습니다.

In [None]:
# 컬럼별 결측치 확인을 위한 isnull()함수 리턴값이 bool 형태로 반환되어 조건부 샘플링이 가능합니다.


In [None]:
# dti 컬럼에 결측치가 존재하는 샘플 확인


    직업과 근속연수에 관한 부분은 데이터를 통한 유추나 계산값을 통해 채워넣을 수 있는 항목은 아닌 것 같습니다.
    다만 dti의 경우 실수로 채워져 있는 부분이니 수업을 위해 평균값 혹은 근사치를 계산하여 채워보도록 하겠습니다.

### 결측치 채우기

In [None]:
# dti 컬럼의 NaN값 index 확인


In [None]:
# fillna() 함수로 NaN 값을 dti 컬럼의 평균으로 채우기

# fillna() 함수의 다양한 채우기 방법 파라메터 확인해보기


### 결측치 제거

In [None]:
# emp_title 결측치가 있는 샘플 확인


In [None]:
# view값으로 dropna 결과값 확인


In [None]:
# 결측치 제거
