In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the churn data set from CSV; row 0 of the file supplies the column names.
churn = pd.read_csv(
    'churn.csv',
    sep=',',
    header=0,
)

In [8]:
# Preview the first five rows of the loaded frame.
churn.head(n=5)

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [9]:
# Normalise the column labels: spaces become underscores, apostrophes are
# removed, leading/trailing '?' is trimmed, and every label is lower-cased.
churn.columns = [
    label.lower()
    for label in (
        churn.columns
        .str.replace(' ', '_')
        .str.replace("'", "")
        .str.strip('?')
    )
]

In [10]:
# Preprocessing: encode the target for a logit model.
# 'churn01' is 1.0 where the 'churn' column equals 'True.', otherwise 0.0.
churn['churn01'] = np.where(churn['churn'].eq('True.'), 1.0, 0.0)

In [11]:
# For each churn01 group: count, mean and standard deviation of the selected
# charge, account-length and customer-service-call columns.
churn.groupby(['churn01'])[[
    'day_charge',
    'eve_charge',
    'night_charge',
    'intl_charge',
    'account_length',
    'custserv_calls',
]].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,day_charge,day_charge,day_charge,eve_charge,eve_charge,eve_charge,night_charge,night_charge,night_charge,intl_charge,intl_charge,intl_charge,account_length,account_length,account_length,custserv_calls,custserv_calls,custserv_calls
Unnamed: 0_level_1,count,mean,std,count,mean,std,count,mean,std,count,mean,std,count,mean,std,count,mean,std
churn01,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
0.0,2850,29.780421,8.530835,2850,16.918909,4.274863,2850,9.006074,2.299768,2850,2.743404,0.751784,2850,100.793684,39.88235,2850,1.449825,1.163883
1.0,483,35.175921,11.72971,483,18.054969,4.396762,483,9.235528,2.121081,483,2.889545,0.754152,483,102.664596,39.46782,483,2.229814,1.853275


In [12]:
# Per-column aggregation by churn01 group:
#   - charge columns         -> mean and standard deviation
#   - account_length / calls -> count, minimum and maximum
churn.groupby(['churn01']).agg({
    **{
        column: ['mean', 'std']
        for column in ('day_charge', 'eve_charge', 'night_charge', 'intl_charge')
    },
    **{
        column: ['count', 'min', 'max']
        for column in ('account_length', 'custserv_calls')
    },
})

Unnamed: 0_level_0,day_charge,day_charge,eve_charge,eve_charge,night_charge,night_charge,intl_charge,intl_charge,account_length,account_length,account_length,custserv_calls,custserv_calls,custserv_calls
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,count,min,max,count,min,max
churn01,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
0.0,29.780421,8.530835,16.918909,4.274863,9.006074,2.299768,2.743404,0.751784,2850,1,243,2850,0,8
1.0,35.175921,11.72971,18.054969,4.396762,9.235528,2.121081,2.889545,0.754152,483,1,225,483,0,9


### 새로운 독립변수 추가 및 기술통계 항목 확인
* 기존의 독립변수로부터 새로운 독립변수를 추가하여 모델의 성능을 개선할 수 있는지 확인
* 적용 방법 (EDA 방식으로 접근)
  - 새로운 독립변수에 대한 가설 설정
  - 해당 독립변수에 대한 A/B 테스트
  - 모델 성능이 개선된다면 해당 독립변수 활용

In [13]:
# New feature: the sum of the day, evening, night and international charges.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)
# pd.cut(data, bins, precision=...)
#   - data and bins are the required arguments
#   - an integer `bins` splits the value range into that many equal-width
#     intervals (use qcut to split by number of observations instead)
#   - precision is the number of decimal places used for the interval labels
factor_cut = pd.cut(churn['total_charges'], bins=5, precision=2)
def get_stats(group):
    """Summarise one group of values.

    Parameters
    ----------
    group : object exposing ``min``, ``max``, ``count``, ``mean`` and ``std``
        methods (the notebook passes each group of a grouped Series).

    Returns
    -------
    dict
        The results of those five methods keyed, in this order, by
        'min', 'max', 'count', 'mean' and 'std'.
    """
    return dict(
        min=group.min(),
        max=group.max(),
        count=group.count(),
        mean=group.mean(),
        std=group.std(),
    )
# Group customer-service call counts by total-charge bin, then print one row
# of summary statistics per bin (unstack turns the statistic names into columns).
grouped = churn['custserv_calls'].groupby(factor_cut)
print(
    grouped
    .apply(get_stats)
    .unstack()
)

                min  max   count      mean       std
total_charges                                       
(22.86, 37.57]  0.0  5.0    70.0  1.528571  1.348337
(37.57, 52.22]  0.0  7.0   742.0  1.564690  1.305234
(52.22, 66.86]  0.0  9.0  1726.0  1.581692  1.326646
(66.86, 81.51]  0.0  9.0   735.0  1.523810  1.295209
(81.51, 96.15]  0.0  5.0    60.0  1.516667  1.359108


In [14]:
# Split account_length at its quartiles (q=4 uses the quantile edges
# 0, 0.25, 0.5, 0.75 and 1), then print the customer-service call statistics
# for each quartile bin.
factor_qcut = pd.qcut(churn['account_length'], q=4)
grouped = churn['custserv_calls'].groupby(factor_qcut)
print(
    grouped
    .apply(get_stats)
    .unstack()
)

                min  max  count      mean       std
account_length                                     
(0.999, 74.0]   0.0  9.0  857.0  1.506418  1.251268
(74.0, 101.0]   0.0  7.0  847.0  1.604486  1.359888
(101.0, 127.0]  0.0  8.0  803.0  1.652553  1.358479
(127.0, 243.0]  0.0  9.0  826.0  1.491525  1.286970
