In [1]:
import pandas as pd
import numpy as np

# Read the raw churn table from disk; the later cells derive everything from this frame.
churn_df = pd.read_csv('churn.csv')
col_names = list(churn_df.columns)

# Print the column labels, then leave a six-row preview as the cell's displayed value.
print("Column names:", col_names, sep="\n")
print("\nSample data:")
churn_df.head(6)

Column names:
['State', 'Account Length', 'Area Code', 'Phone', "Int'l Plan", 'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge', 'CustServ Calls', 'Churn?']

Sample data:


Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.
5,AL,118,510,391-8027,yes,no,0,223.4,98,37.98,...,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False.


In [2]:
# Target vector: 1 where the 'Churn?' column reads 'True.', 0 otherwise.
churn_result = churn_df['Churn?']
y = np.where(churn_result.eq('True.'), 1, 0)

# Drop the columns the author judged unlikely to help the model,
# together with the target column itself.
to_drop = ['State', 'Area Code', 'Phone', 'Churn?']
churn_feat_space = churn_df.drop(labels=to_drop, axis='columns')

# Turn the 'yes'/'no' plan flags into booleans
# (numpy treats booleans as 0 and 1).
yes_no_cols = ["Int'l Plan", "VMail Plan"]
churn_feat_space[yes_no_cols] = churn_feat_space[yes_no_cols].eq('yes')
churn_feat_space[yes_no_cols].head()

Unnamed: 0,Int'l Plan,VMail Plan
0,False,True
1,False,True
2,False,False
3,True,False
4,True,False


In [3]:
# Keep the feature labels; X below is reassigned to the scaler's output.
features = churn_feat_space.columns

# Standardise the features.
# NOTE(review): the scaler is fit on every row before cross-validation, so each
# CV fold is scaled with statistics that include its held-out rows - confirm
# this is acceptable for the comparison being made.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(churn_feat_space)

# Quick summary: matrix dimensions, label values, the scaled matrix itself,
# and the number of rows labelled 0.
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels:", np.unique(y))
print(X)
print(np.count_nonzero(y == 0))

Feature space holds 3333 observations and 17 features
Unique target labels: [0 1]
[[ 0.67648946 -0.32758048  1.6170861  ... -0.60119509 -0.0856905
  -0.42793202]
 [ 0.14906505 -0.32758048  1.6170861  ... -0.60119509  1.2411686
  -0.42793202]
 [ 0.9025285  -0.32758048 -0.61839626 ...  0.21153386  0.69715637
  -1.1882185 ]
 ...
 [-1.83505538 -0.32758048 -0.61839626 ...  0.61789834  1.3871231
   0.33235445]
 [ 2.08295458  3.05268496 -0.61839626 ...  2.24335625 -1.87695028
   0.33235445]
 [-0.67974475 -0.32758048  1.6170861  ... -0.19483061  1.2411686
  -1.1882185 ]]
2850


In [4]:
import warnings
warnings.filterwarnings('ignore')

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; fall back to model_selection so this cell still imports on current
# releases.
try:
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold

# Cross-validation helper: fit one model per fold and return the predictions.
def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold class predictions from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : array that supports integer-array row indexing (e.g. numpy.ndarray)
        Feature matrix, one row per observation.
    y : numpy.ndarray
        Labels, same length as ``X``. Not modified.
    clf_class : class
        Estimator class; instantiated as ``clf_class(**kwargs)`` once per fold
        and expected to provide ``fit(X, y)`` and ``predict(X)``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray
        A copy of ``y`` in which every entry has been overwritten by the
        prediction of the model trained on the other four folds.

    Notes
    -----
    The split is shuffled and no ``random_state`` is set, so results vary
    from run to run.
    """
    # Imported here, under a private alias, so the function uses the
    # model_selection API (scikit-learn >= 0.18) no matter which KFold the
    # notebook has bound at module level. The legacy
    # sklearn.cross_validation.KFold(n, n_folds=...) form was removed in 0.20.
    from sklearn.model_selection import KFold as _KFold

    kf = _KFold(n_splits=5, shuffle=True)
    y_pred = y.copy()
    for train_index, test_index in kf.split(X):
        # A fresh classifier per fold, so nothing learned leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        y_pred[test_index] = clf.predict(X[test_index])
    return y_pred

In [5]:
# Sanity check that the splitter produces five folds.
# The legacy KFold(n, n_folds=...) object could be iterated directly; with the
# model_selection API the folds come from .split(). A private alias keeps the
# module-level `KFold` name untouched.
from sklearn.model_selection import KFold as _ModelSelectionKFold
kf = _ModelSelectionKFold(n_splits=5, shuffle=True)
len(list(kf.split(X))) # one (train_index, test_index) pair per fold

5

In [6]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN

def accuracy(y_true,y_pred):
    """Return the mean of the element-wise comparison ``y_true == y_pred``."""
    matches = y_true == y_pred
    return np.mean(matches)

# Cross-validated accuracy for three classifiers, each reported as a heading
# line followed by the score to three decimals.
for heading, clf_class, clf_params in (
    ("Support vector machines:", SVC, {'kernel': 'rbf'}),
    ("Random forest:", RF, {}),
    ("K-nearest-neighbors:", KNN, {}),
):
    print(heading)
    print("%.3f" % accuracy(y, run_cv(X, y, clf_class, **clf_params)))

Support vector machines:
0.919
Random forest:
0.942
K-nearest-neighbors:
0.893


In [7]:
# Cross-validated accuracy of an RBF-kernel SVC for three gamma values.
for gama in (0.001, 0.01, 0.1):
    cv_pred = run_cv(X, y, SVC, kernel='rbf', gamma=gama)
    print("%.3f" % accuracy(y, cv_pred))

0.855
0.882
0.913


In [9]:
# Cross-validation helper: fit one model per fold and return the predicted probabilities.
def run_prob_cv(X, y, clf_class, **kwargs):
    """Return out-of-fold class probabilities from shuffled 5-fold cross-validation.

    Parameters
    ----------
    X : array that supports integer-array row indexing (e.g. numpy.ndarray)
        Feature matrix, one row per observation.
    y : array of labels, same length as ``X``
        Indexed with integer arrays, so a numpy array is expected.
    clf_class : class
        Estimator class; instantiated as ``clf_class(**kwargs)`` once per fold
        and expected to provide ``fit(X, y)`` and ``predict_proba(X)``.
    **kwargs
        Forwarded unchanged to ``clf_class``.

    Returns
    -------
    numpy.ndarray of shape (len(y), 2)
        Row ``i`` holds the ``predict_proba`` output for observation ``i``
        from the model that did not see it during training. The width is
        fixed at two columns, so this helper is for two-class problems.

    Notes
    -----
    The split is shuffled and no ``random_state`` is set, so results vary
    from run to run.
    """
    # Imported here, under a private alias, so the function uses the
    # model_selection API (scikit-learn >= 0.18) no matter which KFold the
    # notebook has bound at module level. The legacy
    # sklearn.cross_validation.KFold(n, n_folds=...) form was removed in 0.20.
    from sklearn.model_selection import KFold as _KFold

    kf = _KFold(n_splits=5, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    for train_index, test_index in kf.split(X):
        # A fresh classifier per fold, so nothing learned leaks between folds.
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        # Store probabilities rather than hard class labels.
        y_prob[test_index] = clf.predict_proba(X[test_index])
    return y_prob

In [11]:
import warnings
warnings.filterwarnings('ignore')

# Random forest with 10 trees
pred_prob = run_prob_cv(X, y, RF, n_estimators=10)
pred_prob # class 0 = not churned, class 1 = churned; the columns are the predicted probabilities of not churning and of churning

array([[1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       ...,
       [1. , 0. ],
       [0.9, 0.1],
       [0.9, 0.1]])

In [12]:
# Predicted probability of churn: the second column of pred_prob (class 1).
pred_churn = pred_prob[:,1]

# Boolean mask: True where the customer actually churned, False otherwise.
is_churn = y == 1

# Number of observations that received each distinct predicted probability.
# The top-level pd.value_counts function is deprecated as of pandas 2.1; the
# Series method is the supported equivalent.
counts = pd.Series(pred_churn).value_counts()
counts.sort_index()

0.0    1696
0.1     736
0.2     291
0.3     129
0.4      75
0.5      67
0.6      59
0.7      72
0.8      90
0.9      68
1.0      50
dtype: int64

In [11]:
# Shape of the actual-churn mask.
is_churn.shape

(3333,)

In [12]:
# Shape of the predicted churn probabilities.
pred_churn.shape

(3333,)

In [13]:
# Number of observations whose predicted churn probability equals 0.8.
(is_churn[pred_churn == 0.8]).shape

(79,)

In [14]:
# Number of actual churners among observations with predicted churn probability 0.8 (True counts as 1).
(is_churn[pred_churn == 0.8]).sum()

76

In [13]:
# For every distinct predicted probability, compute the share of the
# observations given that score that actually churned.
churn_rate_by_score = {
    score: np.mean(is_churn[pred_churn == score]) for score in counts.index
}
# Echo the rates in the same order as counts.index.
for prob in counts.index:
    print(churn_rate_by_score[prob])
true_prob = pd.Series(churn_rate_by_score)
true_prob

0.031839622641509434
0.016304347826086956
0.054982817869415807
0.16279069767441862
0.9555555555555556
0.3333333333333333
0.8333333333333334
1.0
0.6865671641791045
0.7627118644067796
1.0


0.0    0.031840
0.1    0.016304
0.2    0.054983
0.3    0.162791
0.8    0.955556
0.4    0.333333
0.7    0.833333
0.9    1.000000
0.5    0.686567
0.6    0.762712
1.0    1.000000
dtype: float64

In [14]:
# Put the per-score counts and observed churn rates side by side (aligned on
# the predicted probability), then move that probability out of the index
# into an ordinary column.
counts_with_rate = pd.concat([counts, true_prob], axis=1)
counts = counts_with_rate.reset_index()
counts.columns = ['pred_prob', 'count', 'true_prob']
counts

Unnamed: 0,pred_prob,count,true_prob
0,0.0,1696,0.03184
1,0.1,736,0.016304
2,0.2,291,0.054983
3,0.3,129,0.162791
4,0.8,90,0.955556
5,0.4,75,0.333333
6,0.7,72,0.833333
7,0.9,68,1.0
8,0.5,67,0.686567
9,0.6,59,0.762712


以上结果显示，在预测流失概率为0.7的用户中，实际流失的比例约为83%（见上表 true_prob 一列；由于交叉验证的划分和随机森林都没有固定随机种子，每次运行得到的数值会略有不同）。
因此可以将阈值设为0.7：预测流失概率在0.7及以上的用户需要重点关注。  
  
  这种评估方式与基于混淆矩阵的评估方式不同：在这个场景中关注的不再是召回率（recall），也就是说，即使没有把流失用户尽可能多地找出来也没有关系。  
  这里关注的重点是精确率：被我们挑出来重点关注的客户中，实际流失的比例越高越好，即以最小的成本找出尽可能多的流失客户。
  
这里的阈值0.7与混淆矩阵中使用的阈值含义完全不同：混淆矩阵中的阈值用来决定预测类别（即预测结果），而这里的阈值并不决定预测结果，只是我们做业务决策时的一个参考。