In [1]:
import pandas as pd

# 读取数据
broadband = pd.read_csv('broadband_train.csv', encoding="ansi")

In [2]:
# 查看数据信息
broadband.head()

Unnamed: 0,CUST_ID,GENDER,AGE,TENURE,CHANNEL,AUTOPAY,ARPU_3M,CALL_PARTY_CNT,DAY_MOU,AFTERNOON_MOU,NIGHT_MOU,AVG_CALL_LENGTH,BROADBAND
0,63,男,34,27,2,否,203.0,0.0,0.0,0.0,0.0,3.04,1
1,64,,62,58,1,否,360.0,,0.0,1910.0,0.0,3.3,1
2,65,男,39,55,3,,304.0,0.0,437.2,200.3,,4.92,0
3,66,,39,55,3,否,304.0,0.0,437.2,182.8,0.0,4.92,0
4,67,男,39,55,3,否,,,,214.5,0.0,4.92,0


In [3]:
type(broadband)

pandas.core.frame.DataFrame

In [4]:
broadband.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 13 columns):
CUST_ID            999 non-null int64
GENDER             990 non-null object
AGE                999 non-null int64
TENURE             999 non-null int64
CHANNEL            999 non-null int64
AUTOPAY            993 non-null object
ARPU_3M            995 non-null float64
CALL_PARTY_CNT     993 non-null float64
DAY_MOU            994 non-null float64
AFTERNOON_MOU      999 non-null float64
NIGHT_MOU          991 non-null float64
AVG_CALL_LENGTH    999 non-null float64
BROADBAND          999 non-null int64
dtypes: float64(6), int64(5), object(2)
memory usage: 101.5+ KB


In [5]:
broadband.describe()

Unnamed: 0,CUST_ID,AGE,TENURE,CHANNEL,ARPU_3M,CALL_PARTY_CNT,DAY_MOU,AFTERNOON_MOU,NIGHT_MOU,AVG_CALL_LENGTH,BROADBAND
count,999.0,999.0,999.0,999.0,995.0,993.0,994.0,999.0,991.0,999.0,999.0
mean,554.638639,37.8999,26.29029,2.551552,228.249246,9.303122,84.92163,198.063664,89.851867,3.898418,0.184184
std,338.598231,11.046177,18.300208,1.168686,153.579943,11.241922,180.975014,464.657034,206.920991,0.765869,0.387828
min,1.0,18.0,1.0,1.0,68.0,0.0,0.0,0.0,0.0,2.2,0.0
25%,250.5,30.0,13.0,1.0,150.0,0.0,0.0,0.0,0.0,3.315,0.0
50%,560.0,36.0,19.0,3.0,189.0,5.0,0.0,26.6,0.0,3.83,0.0
75%,864.5,45.0,38.0,4.0,254.0,17.0,69.8,192.05,77.6,4.38,0.0
max,1114.0,76.0,72.0,4.0,2049.0,51.0,1162.4,4915.0,1502.1,6.83,1.0


In [6]:
# 数据清洗
# 缺失值处理，因为含有缺失值的行数不多，所以这里选择删除处理
broadband = broadband.dropna()
broadband.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 965 entries, 0 to 998
Data columns (total 13 columns):
CUST_ID            965 non-null int64
GENDER             965 non-null object
AGE                965 non-null int64
TENURE             965 non-null int64
CHANNEL            965 non-null int64
AUTOPAY            965 non-null object
ARPU_3M            965 non-null float64
CALL_PARTY_CNT     965 non-null float64
DAY_MOU            965 non-null float64
AFTERNOON_MOU      965 non-null float64
NIGHT_MOU          965 non-null float64
AVG_CALL_LENGTH    965 non-null float64
BROADBAND          965 non-null int64
dtypes: float64(6), int64(5), object(2)
memory usage: 105.5+ KB


In [7]:
# 将GENDER(性别)和AUTOPAY(自动充值)两列进行类型转换，用1、0代表男生、女生；1代表自动充值、0代表没有自动充值
gender = broadband.GENDER.copy()
for i in broadband.index.values:
    gender[i] = gender[i].replace("男","1")
    gender[i] = gender[i].replace("女","0")
autopay = broadband.AUTOPAY.copy()
for i in broadband.index.values:
    autopay[i] = autopay[i].replace("是","1")
    autopay[i] = autopay[i].replace("否","0")
broadband.GENDER = gender.copy()
broadband.AUTOPAY = autopay.copy()

In [8]:
# 按客户编码排序
broadband = broadband.sort_values(by='CUST_ID', ascending=True)

In [9]:
# 将行号按顺序
broadband = broadband.reset_index(drop=True)

In [10]:
# 目标是预测宽带营销响应 即BROADBAND列
# 首先将数据集划分为训练集与测试集
# 使用交叉验证法
# https://www.cnblogs.com/ysugyl/p/8707887.html
# https://blog.csdn.net/luanpeng825485697/article/details/79836262

broadband_data = broadband.loc[:,'CUST_ID':'AVG_CALL_LENGTH']

# 留出集验证
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(broadband_data, broadband.BROADBAND, test_size=0.2, random_state=0)

In [11]:
type(X_train)

pandas.core.frame.DataFrame

In [12]:
print(X_train.dtypes)

CUST_ID              int64
GENDER              object
AGE                  int64
TENURE               int64
CHANNEL              int64
AUTOPAY             object
ARPU_3M            float64
CALL_PARTY_CNT     float64
DAY_MOU            float64
AFTERNOON_MOU      float64
NIGHT_MOU          float64
AVG_CALL_LENGTH    float64
dtype: object


In [13]:
# 使用算法进行分类与验证
# 高斯朴素贝叶斯
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
y_pre = model.predict(X_test)

In [15]:
# 计算精确率 召回率 F1
from sklearn.metrics import precision_score, recall_score, f1_score

pre = precision_score(y_test, y_pre)
recall = recall_score(y_test, y_pre)
f1 = f1_score(y_test, y_pre)

print("高斯朴素贝叶斯分类\n精确率：%f，召回率：%f，F1：%f" % (pre, recall, f1))

高斯朴素贝叶斯分类
精确率：0.733333，召回率：0.628571，F1：0.676923


In [16]:
# 决策树分类
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(random_state=1)
tree_clf = tree_clf.fit(X_train, y_train)
print(tree_clf)
print(tree_clf.score(X_test, y_test))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')
0.9119170984455959


In [17]:
# feature_importances_属性能够查看各个特征对模型的重要性
# tree_clf.feature_importances_

In [18]:
# 计算精确率 召回率 F1
from sklearn.metrics import precision_score, recall_score, f1_score

y_pre = tree_clf.predict(X_test)
pre = precision_score(y_test, y_pre)
recall = recall_score(y_test, y_pre)
f1 = f1_score(y_test, y_pre)

print("决策树分类\n精确率：%f，召回率：%f，F1：%f" % (pre, recall, f1))

决策树分类
精确率：0.875000，召回率：0.600000，F1：0.711864


In [27]:
# 使用网格搜索GridSearchCV对决策树进行调参
from sklearn.model_selection import GridSearchCV
import warnings

warnings.filterwarnings('ignore')
param = {'criterion':['gini'],'max_depth':range(1,100,10)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid=param, cv=10, scoring='f1')
grid.fit(X_train, y_train)
print('最优分类器:',grid.best_params_,'最优分数:', grid.best_score_)
print(grid.best_estimator_)

最优分类器: {'criterion': 'gini', 'max_depth': 21} 最优分数: 0.6161332612207038
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=21,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')


In [31]:
clf = DecisionTreeClassifier(max_depth=21, random_state=1, min_samples_leaf=1, min_samples_split=2)
clf.fit(X_train, y_train)
y_new = clf.predict(X_test)
f1_new = f1_score(y_test, y_new)

print("决策树分类\nF1：%f" % (f1_new))

决策树分类
F1：0.711864


In [21]:
# 随机森林
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param = {'n_estimators':range(1,101,10) ,'max_depth':range(1,101,10)}
grid = GridSearchCV(RandomForestClassifier(), param_grid=param, cv=10, scoring='f1')
grid.fit(X_train, y_train)
print(grid.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=31, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=41, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [22]:
model = RandomForestClassifier(n_estimators=200,max_depth=20, min_samples_leaf=1, min_samples_split=2,random_state=1)
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
f1 = f1_score(y_test, y_pre)
f1

0.7017543859649122

In [110]:
# 支持向量机
from sklearn.svm import SVC
import numpy as np


c_range =  np.logspace(0,9,10)
gamma_range = np.logspace(-5,2,2)
param = {'kernel': ['rbf'], 'C': c_range}
grid = GridSearchCV(SVC(gamma='scale'), param, cv=3, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_estimator_)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [133]:
# 如果gamma=‘scale’传递，则使用1 /（n_features * X.var（））作为gamma的值
model = SVC(C=0.448, gamma= 'scale', class_weight='balanced')
model.fit(X_train, y_train)
print(model)
y_pre = model.predict(X_test)
f1 = f1_score(y_test, y_pre)
f1

SVC(C=0.448, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


0.7848101265822784