## 한글 폰트 설치

In [1]:
### Install Korean (Nanum) fonts
!apt-get install -y fonts-nanum
# Rebuild the fontconfig cache: -f forces regeneration, -v prints each scanned directory.
!fc-cache -fv
# Remove matplotlib's per-user cache directory; presumably this makes matplotlib
# rebuild its font list so the newly installed fonts are found — TODO confirm.
!rm ~/.cache/matplotlib -rf
# After installation, Colab needs Runtime > Restart session

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

## matplotlib 시각화

In [2]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Render all figure text with a Nanum font so Hangul labels display correctly
# (any other installed Nanum font family can be used instead).
mpl.rc('font', family='NanumBarunGothic')
# Use the ASCII hyphen-minus for negative numbers: matplotlib's default Unicode
# minus sign (U+2212) may be missing from the Korean font, in which case negative
# tick labels would be drawn as empty boxes.
mpl.rc('axes', unicode_minus=False)

## load dataset

In [3]:
import pandas as pd

# Load the contract dataset CSV into the DataFrame shared by the cells below.
df_RCOCT = pd.read_csv(filepath_or_buffer="/content/RentalCarOfContractType.csv")
# Show the first two rows as a quick sanity check of the loaded columns.
df_RCOCT.head(2)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0


## 범주형 / 연속형 : 범주형에 따른 관계 확인


### 연속형이 비정규 분포이며, 범주형 항목이 2개일 때

#### 예제 가설 : amount, channel(항목 2개 서비스 방문과 홈쇼핑 선택)

#### 분석 내용 : Channel 중 서비스 방문과 홈쇼핑 선택에 따른 월 렌탈 비용 평균에 차이가 있는가?
- 귀무 가설 : 비용 평균에 차이가 없다.
- 대립 가설 : 비용 평균에 차이가 있다.

### 정규성 검증 필요

### Wilcoxon Rank-Sum Test (윌콕슨 순위합 검정)

In [8]:
# Row counts of the two most frequent channels
# (value_counts orders its result from most to least frequent).
df_RCOCT_channel_two = (
    df_RCOCT['channel']
    .value_counts()
    .head(2)
)
df_RCOCT_channel_two

서비스 방문    15186
홈쇼핑/방송    12977
Name: channel, dtype: int64

In [30]:
# Keep only the rows whose channel is one of the two channels being compared.
conditions = 'channel in ("서비스 방문","홈쇼핑/방송")'
df_RCOCT_channel_amount = df_RCOCT.query(expr=conditions)
# Preview the first two filtered rows.
df_RCOCT_channel_amount.head(2)

Unnamed: 0,id,type_of_contract,type_of_contract2,channel,datetime,Term,payment_type,product,amount,state,overdue_count,overdue,credit rating,bank,cancellation,age,Mileage
0,66758234,렌탈,Normal,서비스 방문,2019-10-20,60,CMS,K1,96900,계약확정,0,없음,9.0,새마을금고,정상,43.0,1862.0
1,66755948,렌탈,Extension_Rental,서비스 방문,2019-10-20,60,카드이체,K1,102900,계약확정,0,없음,2.0,현대카드,정상,62.0,2532.0


In [28]:
# First sample for the test: the amount values of every "서비스 방문" contract.
condition = 'channel == "서비스 방문"'
series_service_amount = df_RCOCT.query(condition).loc[:, 'amount']
series_service_amount


0         96900
1        102900
5         90900
8        102900
9        105900
          ...  
51231    105900
51234    105900
51247     96900
51282     96900
51288    102900
Name: amount, Length: 15186, dtype: int64

In [31]:
# Second sample for the test: the amount values of every "홈쇼핑/방송" contract.
condition = 'channel == "홈쇼핑/방송"'
series_homeshopping_amount = df_RCOCT.query(condition).loc[:, 'amount']
series_homeshopping_amount


2         96900
6         98400
7         80400
10        81900
11        81900
          ...  
51272     96900
51292     96900
51296     96900
51297     96900
51301    120900
Name: amount, Length: 12977, dtype: int64

In [32]:
import scipy.stats as stats

In [34]:
# Wilcoxon rank-sum test comparing the amount samples of the two channels.
# NOTE(review): SciPy documents that ranksums does not handle ties and points to
# stats.mannwhitneyu for tie handling; amount shows repeated values in the
# previews above, so confirm this test is the intended one.
ranksum_result = stats.ranksums(x=series_service_amount, y=series_homeshopping_amount)
ranksum_result

RanksumsResult(statistic=74.731078009603, pvalue=0.0)

#### 분석결과
- 통계 결론 : p-value가 0.0으로 유의수준(예: 0.05)보다 작으므로 귀무가설을 기각하고 대립가설을 채택한다.
- 사실 결론 : Channel 중 서비스 방문과 홈쇼핑 선택에 따른 비용 평균에 차이가 있다.

In [35]:
# Names of the three most frequent channels, most frequent first.
channel_list = list(df_RCOCT['channel'].value_counts().head(3).index)
channel_list

['서비스 방문', '홈쇼핑/방송', '렌탈총판']

In [36]:
# Preview: first five amount values of the rows matching the '서비스 방문' channel.
condition = "channel == '서비스 방문'"
df_RCOCT.query(condition)['amount'].head(5)

0     96900
1    102900
5     90900
8    102900
9    105900
Name: amount, dtype: int64

In [44]:
# Build one amount Series per channel, in the same order as channel_list.
# A boolean mask replaces the f-string query expression: a channel name that
# contains a quote character would make the interpolated query string invalid,
# whereas a direct equality comparison works for any name.
series_list = [
    df_RCOCT.loc[df_RCOCT['channel'] == channel, 'amount']
    for channel in channel_list
]
# Preview the first five amounts of the second channel in the list.
series_list[1].iloc[:5]

2     96900
6     98400
7     80400
10    81900
11    81900
Name: amount, dtype: int64

In [47]:
# Kruskal-Wallis H-test over the amount Series of every channel in series_list.
# Unpacking the list is the same as passing series_list[0], series_list[1],
# series_list[2] as separate arguments.
kruskal_result = stats.kruskal(*series_list)
kruskal_result


KruskalResult(statistic=8232.430933915304, pvalue=0.0)