# 군집별 loan_result에 대한 분석
loan_result정보에 군집 라벨을 붙여 군집별 은행상품정보에 대해 분석함(은행 id별, 은행 상품별)

In [1]:
import numpy as np
import pandas as pd
import gzip
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns
%matplotlib inline

# Draw minus signs with the ASCII hyphen (prevents broken minus glyphs) and
# use the NanumGothic font family for plot text.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'NanumGothic',
})

In [2]:
# Apply the seaborn theme.  sns.set() rewrites matplotlib rcParams, including
# 'font.family' (its `font` argument defaults to 'sans-serif'), so the font
# selected in the import cell is passed again here; otherwise Korean labels may
# no longer render.  'axes.unicode_minus' is repeated so the theme call leaves
# it in the intended state as well.
sns.set(
    style='white',
    context='notebook',
    font='NanumGothic',
    rc={'figure.figsize': (14, 10), 'axes.unicode_minus': False},
)

In [9]:
# Load the loan data.
import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load pickles produced by a trusted pipeline.
with open('../processed/loan_info.pickle', 'rb') as f:
    df = joblib.load(f)

In [None]:
# Cluster assignments; the cells below use its 'application_id' and 'label' columns.
cluster = pd.read_csv('../processed/5_4_cluster_except.csv')

In [None]:
df.head()

In [None]:
cluster.head()

In [None]:
# Distinct application ids present in the cluster file.
app_id = cluster['application_id'].unique()

In [None]:
# Keep only the loan rows whose application_id appears in the cluster file.
df_selected = df[df['application_id'].isin(app_id)]

In [None]:
df_selected

In [None]:
len(app_id)

In [None]:
# Attach the cluster label to every selected loan row.
# NOTE(review): if `cluster` holds more than one row per application_id, this
# left join duplicates loan rows; confirm the ids are unique in the file.
df_selected_label = pd.merge(df_selected,cluster[['application_id','label']], on='application_id', how='left')

In [None]:
df_selected_label.info()

In [22]:
# Persist the labelled loan rows.  The target is ../processed/ because the
# analysis section reads '../processed/5_5_cluster_loan.csv'; writing to a
# different directory would leave that read pointing at a file this cell
# never produces.
df_selected_label.to_csv('../processed/5_5_cluster_loan.csv')

## 군집별 대출상품 분석
데이터(5_4_cluster_except.csv)의 application_id에 매핑된 loan_result 정보를 뽑고 그 옆에 컬럼으로 라벨 붙이기<br>
각 컬럼별 대푯값 뽑기<br>
- loan_limit, loan_rate 등 금액·금리 관련 컬럼은 평균, 중앙값(median) 정보
- bank_id는 최빈값 정보
- product_id는 분포

In [6]:
# Load the labelled loan table; the first CSV column is used as the index.
loan = pd.read_csv('../processed/5_5_cluster_loan.csv', index_col = 0)

In [None]:
loan.head()

In [None]:
# Rows whose is_applied value is 0.
loan[loan['is_applied'] == 0]

In [8]:
# One independent frame per cluster label (0-4).  .copy() makes each frame own
# its data, so the later cells that add a 'bank_product_id' column to
# loan0..loan4 write to a real frame instead of a slice of `loan`
# (avoids pandas' SettingWithCopyWarning).
# NOTE(review): these frames reflect `loan` as it is when this cell runs;
# re-run the cell after filtering `loan` if the per-cluster frames should
# reflect that filtering too.
loan0 = loan[loan['label'] == 0].copy()
loan1 = loan[loan['label'] == 1].copy()
loan2 = loan[loan['label'] == 2].copy()
loan3 = loan[loan['label'] == 3].copy()
loan4 = loan[loan['label'] == 4].copy()

## 1. loan_result 기본정보에 대한 분석

### 1-1) loan_limit, loan_rate

In [116]:
### Outlier removal ###
# Only one person has a 10-billion loan limit, so that record is treated as an
# outlier: keep the rows whose loan_limit is strictly below 10 billion.
loan = loan.loc[loan['loan_limit'] < 10 ** 10]

In [None]:
# Per-cluster summary statistics (mean / median / max / min) of the loan
# limit, loan rate and limit-to-desired-amount columns.
pd.options.display.float_format = '{:.3f}'.format
grouped_loan = loan.groupby(['label'])[
    ['loan_limit', 'loan_rate', 'loan_limit_per_desired_amount']
]
grouped_loan2 = grouped_loan.aggregate(
    ['mean', 'median', 'max', 'min']
).reset_index()
grouped_loan2

### 1-2) bank_id

In [14]:
# bank_id
# For each cluster frame, count the application_id entries per bank_id and
# return the result as a two-column frame (bank_id, application_id).
bloan0, bloan1, bloan2, bloan3, bloan4 = (
    frame.groupby('bank_id')['application_id'].count().reset_index()
    for frame in (loan0, loan1, loan2, loan3, loan4)
)

In [None]:
# Number of distinct bank_id values in cluster 0.
print('loan0: ', len(loan0['bank_id'].unique()))

In [None]:
# Number of distinct bank_id values in cluster 1.
print('loan1: ', len(loan1['bank_id'].unique()))

In [None]:
# Number of distinct bank_id values in cluster 2.
print('loan2: ', len(loan2['bank_id'].unique()))

In [None]:
# Number of distinct bank_id values in cluster 3.
print('loan3: ', len(loan3['bank_id'].unique()))

In [None]:
# Number of distinct bank_id values in cluster 4.
print('loan4: ', len(loan4['bank_id'].unique()))

In [None]:
# Per-bank row counts for cluster 0.
bloan0

In [None]:
# For each cluster label, the 15 largest bank_id shares among that cluster's
# rows (value_counts with normalize=True yields fractions, not counts).
bank_loan_all = loan.groupby('label')['bank_id'].apply(lambda x:x.value_counts(normalize=True).sort_values(ascending=False).head(15))

## 2. 군집별 상품추천 은행비율
각 군집별 추천된 은행의 비율

In [17]:
# For each cluster label, the 15 largest bank_id shares among that cluster's rows.
bank_loan = loan.groupby('label')['bank_id'].apply(lambda x:x.value_counts(normalize=True).sort_values(ascending=False).head(15))

In [None]:
bank_loan

In [None]:
# NOTE(review): `bank_loan_applied` is not assigned in any cell above this one;
# this cell only runs if the kernel already holds that variable.
bank_loan_applied

In [None]:
# Element-wise ratio of the two share tables, aligned on their index.
bank_loan_applied / bank_loan

In [141]:
# Flatten the per-cluster bank shares into columns label / level_1 / bank_id,
# where 'level_1' holds the bank id and 'bank_id' holds its share.  The index
# levels and the Series are named explicitly because the names pandas gives a
# value_counts() result changed in pandas 2.0; relying on the implicit names
# makes reset_index() produce different (or clashing) column names.
bank_loan_temp = (
    bank_loan
    .rename_axis(['label', 'level_1'])
    .rename('bank_id')
    .reset_index()
)
# "label / bank / share" string used by the side-by-side table.
bank_loan_temp['bank_id_ratio'] = (
    bank_loan_temp['label'].astype(str)
    + ' / ' + bank_loan_temp['level_1'].astype(str)
    + ' / ' + bank_loan_temp['bank_id'].astype(str)
)

In [None]:
# Lay the strings out column-major as a 15-row table with columns 0-4.
# The reshape only matches the column list when there are 75 entries (15 x 5).
pd.DataFrame(
    bank_loan_temp['bank_id_ratio'].values.reshape(15, -1, order='F'),
    columns=list(range(5)),
)

#### <b>군집별 is_applied된 상품추천 은행비율</b>
is_applied가 1인 것만 따로 봄

In [22]:
loan_applied = loan[loan['is_applied'] == 1]
bank_loan_applied_all = loan_applied.groupby('label')['bank_id'].apply(lambda x:x.value_counts(normalize=True))

In [None]:
bank_loan_applied_all

In [158]:
# Flatten the per-cluster applied-bank shares into columns
# label / level_1 / bank_id, where 'level_1' holds the bank id and 'bank_id'
# holds its share.  Names are set explicitly because the names pandas gives a
# value_counts() result changed in pandas 2.0; relying on the implicit names
# makes reset_index() produce different (or clashing) column names.
bank_loan_temp_applied = (
    bank_loan_applied
    .rename_axis(['label', 'level_1'])
    .rename('bank_id')
    .reset_index()
)
# "label / bank / share" string used by the side-by-side table.
bank_loan_temp_applied['bank_id_ratio'] = (
    bank_loan_temp_applied['label'].astype(str)
    + ' / ' + bank_loan_temp_applied['level_1'].astype(str)
    + ' / ' + bank_loan_temp_applied['bank_id'].astype(str)
)

In [None]:
# Lay the strings out column-major as a 15-row table with columns 0-4.
# The reshape only matches the column list when there are 75 entries (15 x 5).
pd.DataFrame(
    bank_loan_temp_applied['bank_id_ratio'].values.reshape(15, -1, order='F'),
    columns=list(range(5)),
)

### 2-1) 0번 군집에 대한 은행id별 cvr

In [None]:
# CVR per bank_id within cluster 0:
#   rows with is_applied == 1  /  all rows for that bank.
# One groupby pass replaces re-filtering loan0 once per bank.  sort=False keeps
# the banks in order of first appearance (as unique() did), and groupby skips a
# missing bank_id, which the per-bank filter would turn into a division by zero.
bank_cvr_dict = (
    loan0['is_applied'].eq(1)
    .groupby(loan0['bank_id'], sort=False)
    .mean()
    .to_dict()
)

In [None]:
# Banks ranked by CVR, highest first.
pd.Series(bank_cvr_dict).sort_values(ascending = False)

In [None]:
sum(pd.Series(bank_cvr_dict).sort_values(ascending = False).iloc[:10])/10 # mean of the 10 highest per-bank CVRs (always divides by 10)

In [None]:
# Unweighted mean of all per-bank CVRs.
sum(bank_cvr_dict.values())/len(bank_cvr_dict.keys())

### 2-2) 1번 군집에 대한 은행id별 cvr

In [None]:
# CVR per bank_id within cluster 1:
#   rows with is_applied == 1  /  all rows for that bank.
# Stored as `bank_cvr_dict1` because the summary cells for this cluster read
# that name; the other clusters have their own bank_cvr_dict2/3/4.
# sort=False keeps the banks in order of first appearance.
bank_cvr_dict1 = (
    loan1['is_applied'].eq(1)
    .groupby(loan1['bank_id'], sort=False)
    .mean()
    .to_dict()
)
# Keep the generic name bound to this cluster's result as well.
bank_cvr_dict = bank_cvr_dict1
pd.Series(bank_cvr_dict).sort_values(ascending = False)

In [None]:
sum(pd.Series(bank_cvr_dict1).sort_values(ascending = False).iloc[:10])/10 # mean of the 10 highest per-bank CVRs (always divides by 10)

In [None]:
# Unweighted mean of all per-bank CVRs in bank_cvr_dict1.
sum(bank_cvr_dict1.values())/len(bank_cvr_dict1.keys())

### 2-3) 2번 군집에 대한 은행id별 cvr

In [None]:
# CVR per bank_id within cluster 2:
#   rows with is_applied == 1  /  all rows for that bank.
# The banks come from loan2 itself.  Iterating another cluster's bank list
# divides by zero for a bank that never appears in loan2 and silently omits
# banks that appear only in loan2.
bank_cvr_dict = (
    loan2['is_applied'].eq(1)
    .groupby(loan2['bank_id'], sort=False)
    .mean()
    .to_dict()
)
pd.Series(bank_cvr_dict).sort_values(ascending = False)

In [63]:
# CVR per bank_id within cluster 2, kept under its own name for the summary
# cells.  The banks come from loan2 itself; iterating another cluster's bank
# list divides by zero for a bank absent from loan2 and omits banks that
# appear only in loan2.
bank_cvr_dict2 = (
    loan2['is_applied'].eq(1)
    .groupby(loan2['bank_id'], sort=False)
    .mean()
    .to_dict()
)

In [None]:
# Mean of the 10 highest per-bank CVRs in bank_cvr_dict2 (always divides by 10).
sum(pd.Series(bank_cvr_dict2).sort_values(ascending = False).iloc[:10])/10

In [None]:
# Unweighted mean of all per-bank CVRs in bank_cvr_dict2.
sum(bank_cvr_dict2.values())/len(bank_cvr_dict2.keys())

### 2-4) 3번 군집에 대한 은행id별 cvr

In [None]:
# CVR per bank_id within cluster 3:
#   rows with is_applied == 1  /  all rows for that bank.
# The banks come from loan3 itself.  Iterating another cluster's bank list
# divides by zero for a bank that never appears in loan3 and silently omits
# banks that appear only in loan3.
bank_cvr_dict = (
    loan3['is_applied'].eq(1)
    .groupby(loan3['bank_id'], sort=False)
    .mean()
    .to_dict()
)
pd.Series(bank_cvr_dict).sort_values(ascending = False)

In [67]:
# CVR per bank_id within cluster 3, kept under its own name for the summary
# cell.  The banks come from loan3 itself; iterating another cluster's bank
# list divides by zero for a bank absent from loan3 and omits banks that
# appear only in loan3.
bank_cvr_dict3 = (
    loan3['is_applied'].eq(1)
    .groupby(loan3['bank_id'], sort=False)
    .mean()
    .to_dict()
)

In [None]:
# Mean of the 10 highest per-bank CVRs in bank_cvr_dict3 (always divides by 10).
sum(pd.Series(bank_cvr_dict3).sort_values(ascending = False).iloc[:10])/10

### 2-5) 4번 군집에 대한 은행id별 cvr

In [None]:
# CVR per bank_id within cluster 4:
#   rows with is_applied == 1  /  all rows for that bank.
# The banks come from loan4 itself.  Iterating another cluster's bank list
# divides by zero for a bank that never appears in loan4 and silently omits
# banks that appear only in loan4.
bank_cvr_dict = (
    loan4['is_applied'].eq(1)
    .groupby(loan4['bank_id'], sort=False)
    .mean()
    .to_dict()
)
pd.Series(bank_cvr_dict).sort_values(ascending = False)

In [70]:
# CVR per bank_id within cluster 4, kept under its own name for the summary
# cell.  The banks come from loan4 itself; iterating another cluster's bank
# list divides by zero for a bank absent from loan4 and omits banks that
# appear only in loan4.
bank_cvr_dict4 = (
    loan4['is_applied'].eq(1)
    .groupby(loan4['bank_id'], sort=False)
    .mean()
    .to_dict()
)
#sum(bank_cvr_dict4.values())/len(bank_cvr_dict4.keys())

In [None]:
# Mean of the 10 highest per-bank CVRs in bank_cvr_dict4 (always divides by 10).
sum(pd.Series(bank_cvr_dict4).sort_values(ascending = False).iloc[:10])/10

In [None]:
# CVR per bank_id over all clusters:
#   rows with is_applied == 1  /  all rows for that bank.
# One groupby pass replaces re-filtering the full `loan` frame once per bank.
# sort=False keeps the banks in order of first appearance, and groupby skips a
# missing bank_id instead of dividing by zero.
bank_cvr_dict_all = (
    loan['is_applied'].eq(1)
    .groupby(loan['bank_id'], sort=False)
    .mean()
    .to_dict()
)
# Unweighted mean of the per-bank CVRs.
sum(bank_cvr_dict_all.values())/len(bank_cvr_dict_all.keys())

In [None]:
# Mean of the 10 highest per-bank CVRs over all clusters (always divides by 10).
sum(pd.Series(bank_cvr_dict_all).sort_values(ascending = False).iloc[:10])/10

## 3. 은행id_은행상품별 cvr

### 3-1) 0번 군집에 대해 (bank_id, product_id)에 대한 cvr

In [203]:
# Composite "bank-product" key.  assign() returns a new frame, so this is safe
# even when loan0 is a slice of `loan` (no SettingWithCopyWarning).
loan0 = loan0.assign(
    bank_product_id=loan0['bank_id'].astype(str) + '-' + loan0['product_id'].astype(str)
)
# CVR per key: rows with is_applied == 1 / all rows for that key, computed in
# one groupby pass instead of filtering loan0 once per key.  sort=False keeps
# the keys in order of first appearance, as unique() did.
bank_product_cvr_dict = (
    loan0['is_applied'].eq(1)
    .groupby(loan0['bank_product_id'], sort=False)
    .mean()
    .to_dict()
)
loan0_bp = pd.Series(bank_product_cvr_dict).sort_values(ascending=False)

In [None]:
loan0_bp # bank-product CVR

In [205]:
# Put each key's row count next to its CVR (the CVR Series is unnamed, so its
# column is labelled 0).
loan0_bp=pd.concat([loan0_bp,loan0['bank_product_id'].value_counts()], axis=1)

In [None]:
# Rename the value_counts column to 'count'.
loan0_bp.rename(columns={'bank_product_id':'count'}, inplace=True)
loan0_bp

In [None]:
# Keep the bank-product keys with more than 5 rows, CVR and count side by side.
# NOTE(review): the threshold is strict (> 5), so keys with exactly 5 rows are
# dropped; use >= 5 if "five or more" was meant.
# .copy() gives an independent frame, so the 'ratio' column added in the next
# cell is not written to a slice of loan0_bp (avoids SettingWithCopyWarning).
loan0_bp_new = loan0_bp[loan0_bp['count']>5].copy()
loan0_bp_new

In [None]:
# Share of each remaining key among all remaining rows, then the first 30 keys
# whose share exceeds 0.5%.
loan0_bp_new['ratio'] = loan0_bp_new['count'].div(loan0_bp_new['count'].sum())
loan0_bp_new.loc[loan0_bp_new['ratio'] > 0.005].head(30)

### 3-2) 1번 군집에 대해 (bank_id, product_id)에 대한 cvr

In [222]:
# Composite "bank-product" key.  assign() returns a new frame, so this is safe
# even when loan1 is a slice of `loan` (no SettingWithCopyWarning).
loan1 = loan1.assign(
    bank_product_id=loan1['bank_id'].astype(str) + '-' + loan1['product_id'].astype(str)
)
# CVR per key: rows with is_applied == 1 / all rows for that key, computed in
# one groupby pass instead of filtering loan1 once per key.  sort=False keeps
# the keys in order of first appearance, as unique() did.
bank_product_cvr_dict = (
    loan1['is_applied'].eq(1)
    .groupby(loan1['bank_product_id'], sort=False)
    .mean()
    .to_dict()
)
loan1_bp = pd.Series(bank_product_cvr_dict).sort_values(ascending=False)

In [223]:
# Put each key's row count next to its CVR.
loan1_bp=pd.concat([loan1_bp,loan1['bank_product_id'].value_counts()], axis=1)

In [None]:
# Name the count column 'count', then add each key's share of all rows.
loan1_bp.rename(columns={'bank_product_id':'count'}, inplace=True)
loan1_bp['ratio'] = (loan1_bp['count'])/(loan1_bp['count'].sum())
# First 10 keys whose share exceeds 0.5%.
loan1_bp[loan1_bp['ratio']>0.005][:10]

### 3-3) 2번 군집에 대해 (bank_id, product_id)에 대한 cvr

In [228]:
# Composite "bank-product" key.  assign() returns a new frame, so this is safe
# even when loan2 is a slice of `loan` (no SettingWithCopyWarning).
loan2 = loan2.assign(
    bank_product_id=loan2['bank_id'].astype(str) + '-' + loan2['product_id'].astype(str)
)
# CVR per key: rows with is_applied == 1 / all rows for that key, computed in
# one groupby pass instead of filtering loan2 once per key.  sort=False keeps
# the keys in order of first appearance, as unique() did.
bank_product_cvr_dict = (
    loan2['is_applied'].eq(1)
    .groupby(loan2['bank_product_id'], sort=False)
    .mean()
    .to_dict()
)
loan2_bp = pd.Series(bank_product_cvr_dict).sort_values(ascending=False)

In [229]:
# Put each key's row count next to its CVR.
loan2_bp=pd.concat([loan2_bp,loan2['bank_product_id'].value_counts()], axis=1)

In [None]:
# Name the count column 'count', then add each key's share of all rows.
loan2_bp.rename(columns={'bank_product_id':'count'}, inplace=True)
loan2_bp['ratio'] = (loan2_bp['count'])/(loan2_bp['count'].sum())
# First 10 keys whose share exceeds 0.5%.
loan2_bp[loan2_bp['ratio']>0.005][:10]

### 3-4) 3번 군집에 대해 (bank_id, product_id)에 대한 cvr

In [234]:
# Composite "bank-product" key.  assign() returns a new frame, so this is safe
# even when loan3 is a slice of `loan` (no SettingWithCopyWarning).
loan3 = loan3.assign(
    bank_product_id=loan3['bank_id'].astype(str) + '-' + loan3['product_id'].astype(str)
)
# CVR per key: rows with is_applied == 1 / all rows for that key, computed in
# one groupby pass instead of filtering loan3 once per key.  sort=False keeps
# the keys in order of first appearance, as unique() did.
bank_product_cvr_dict = (
    loan3['is_applied'].eq(1)
    .groupby(loan3['bank_product_id'], sort=False)
    .mean()
    .to_dict()
)
loan3_bp = pd.Series(bank_product_cvr_dict).sort_values(ascending=False)

In [235]:
# Put each key's row count next to its CVR.
loan3_bp=pd.concat([loan3_bp,loan3['bank_product_id'].value_counts()], axis=1)

In [None]:
# Name the count column 'count', then add each key's share of all rows.
loan3_bp.rename(columns={'bank_product_id':'count'}, inplace=True)
loan3_bp['ratio'] = (loan3_bp['count'])/(loan3_bp['count'].sum())
# First 10 keys whose share exceeds 0.5%.
loan3_bp[loan3_bp['ratio']>0.005][:10]

### 3-5) 4번 군집에 대해 (bank_id, product_id)에 대한 cvr

In [237]:
# Composite "bank-product" key.  assign() returns a new frame, so this is safe
# even when loan4 is a slice of `loan` (no SettingWithCopyWarning).
loan4 = loan4.assign(
    bank_product_id=loan4['bank_id'].astype(str) + '-' + loan4['product_id'].astype(str)
)
# CVR per key: rows with is_applied == 1 / all rows for that key, computed in
# one groupby pass instead of filtering loan4 once per key.  sort=False keeps
# the keys in order of first appearance, as unique() did.
bank_product_cvr_dict = (
    loan4['is_applied'].eq(1)
    .groupby(loan4['bank_product_id'], sort=False)
    .mean()
    .to_dict()
)
loan4_bp = pd.Series(bank_product_cvr_dict).sort_values(ascending=False)

In [238]:
# Put each key's row count next to its CVR.
loan4_bp=pd.concat([loan4_bp,loan4['bank_product_id'].value_counts()], axis=1)

In [None]:
# Name the count column 'count', then add each key's share of all rows.
loan4_bp.rename(columns={'bank_product_id':'count'}, inplace=True)
loan4_bp['ratio'] = (loan4_bp['count'])/(loan4_bp['count'].sum())
# First 10 keys whose share exceeds 0.5%.
loan4_bp[loan4_bp['ratio']>0.005][:10]

In [None]:
# Font and tick-label sizes are read from rcParams when the axes are created,
# so they are set before plt.subplots(); set afterwards they would only affect
# figures drawn later.
plt.rc('font', size=10)
plt.rc('xtick', labelsize=50)  # x-axis tick label size
plt.rc('ytick', labelsize=50)  # y-axis tick label size

# One bar chart per cluster (0-4, top to bottom): application_id count per bank_id.
fig, ax = plt.subplots(nrows=5, ncols=1, figsize=(70, 60))
for axis, counts in zip(ax, (bloan0, bloan1, bloan2, bloan3, bloan4)):
    sns.barplot(x='bank_id', y='application_id', data=counts, palette='viridis', ax=axis)
#plt.yscale('log')
plt.show()

## 4. loan_limit_per_desired_amount에 대한 barplot

In [93]:
# Pool clusters 2 and 3 into one frame and clusters 0, 1 and 4 into another.
# NOTE(review): the names suggest the clusters were grouped by application
# behaviour; confirm the grouping against the clustering results.
not_applied = pd.concat([loan2, loan3])
applied = pd.concat([loan0, loan1, loan4])

In [None]:
# Sample rows whose loan_limit_per_desired_amount is exactly 0, 1 and 10
# (note that loan_2 holds the ratio-10 rows), shown with 3-decimal floats.
pd.options.display.float_format = '{:.3f}'.format
loan_0, loan_1, loan_2 = (
    loan[loan['loan_limit_per_desired_amount'] == ratio] for ratio in (0, 1, 10)
)
display(loan_0.head()[['desired_amount', 'loan_limit', 'loan_limit_per_desired_amount']])
display(loan_1.head()[['desired_amount', 'loan_limit', 'loan_limit_per_desired_amount']])
display(loan_2.head()[['desired_amount', 'loan_limit', 'loan_limit_per_desired_amount']])