In [58]:
import numpy as np
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn import metrics

## 准备数据

In [70]:
class ChurnPredWithGBDT:
    
    def __init__(self):
        self.file = "./data/new-churn.csv"
        self.new_file = "./data/new_churn.csv"
        self.new_file_1 = "./data/new_churn1.csv"
        self.feature_dict = self.feature_dict()
        self.data = self.feature_transform()
        self.data_1 = self.feature_transform_1()
        self.features, self.train, self.test = self.split_data()
        
    # 空缺值以0填充
    def isNone(self, value):
        if value == " " or value is None:
            return "0.0"
        else:
            return value
    
    # 定义特征转换字典
    def feature_dict(self):
        feature_dict = {
            "gender":{"Male":"1", "Female":"0"},
            "Partner":{"Yes":"1", "No":"0"},
            "Dependents":{"Yes":"1", "No":"0"},
            "PhoneService":{"Yes":"1", "No":"0"},
            "MultipleLines":{"Yes":"1","No":"0","No phone service":"2"},
            "InternetService":{"DSL":"1","Fiber optic":"2","No":"0"},
            "OnlineSecurity":{"Yes":"1","No":"0","No internet service":"2"},
            "OnlineBackup":{"Yes":"1","No":"0","No internet service":"2"},
            "DeviceProtection":{"Yes":"1","No":"0","No internet service":"2"},
            "TechSupport":{"Yes":"1","No":"0","No internet service":"2"},
            "StreamingTV":{"Yes":"1","No":"0","No internet service":"2"},
            "StreamingMovies":{"Yes":"1","No":"0","No internet service":"2"},
            "Contract":{"Month-to-month":"0","One year":"1","Two year":"2"},
            "PaperlessBilling":{"Yes":"1","No":"0"},
            "PaymentMethod":{
                "Electronic check":"0",
                "Mailed check":"1",
                "Bank transfer (automatic)":"2",
                "Credit card (automatic)":"3",
            },
            "Churn":{"Yes":"1","No":"0"},
        }
        return feature_dict

    # 特征转换(方法一)
    def feature_transform(self):
        if not os.path.exists(self.new_file_1):
            df = pd.read_csv(self.file)
            print("Start Feature Transform …")
            # 定义特征转换字典
            feature_dict = self.feature_dict
            for k in df.keys():
                if k in feature_dict:
                    df[k] = df[k].apply(lambda x: feature_dict[k][x])
                else:
                    df[k] = df[k].apply(lambda x: self.isNone(x))
            df.to_csv(self.new_file_1)
        return pd.read_csv(self.new_file_1)
    
    # 特征转换(方法二)
    def feature_transform_1(self):
        if not os.path.exists(self.new_file):
            print("Start Feature Transform …")
            # 定义特征转换字典
            feature_dict = self.feature_dict
            fw = open(self.new_file, "w")
            fw.write("customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,"
                     "InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,"
                     "StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn")
            fw.write("\n")
            for line in open(self.file, "r").readlines():
                if line.startswith("customerID"):
                    continue
                customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,\
                OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,\
                MonthlyCharges,TotalCharges,Churn = line.strip().split(",")
                _list = list()
                _list.append(customerID)
                _list.append(self.isNone(feature_dict["gender"][gender]))
                _list.append(self.isNone(SeniorCitizen))
                _list.append(self.isNone(feature_dict["Partner"][Partner]))
                _list.append(self.isNone(feature_dict["Dependents"][Dependents]))
                _list.append(self.isNone(tenure))
                _list.append(self.isNone(feature_dict["PhoneService"][PhoneService]))
                _list.append(self.isNone(feature_dict["MultipleLines"][MultipleLines]))
                _list.append(self.isNone(feature_dict["InternetService"][InternetService]))
                _list.append(self.isNone(feature_dict["OnlineSecurity"][OnlineSecurity]))
                _list.append(self.isNone(feature_dict["OnlineBackup"][OnlineBackup]))
                _list.append(self.isNone(feature_dict["DeviceProtection"][DeviceProtection]))
                _list.append(self.isNone(feature_dict["TechSupport"][TechSupport]))
                _list.append(self.isNone(feature_dict["StreamingTV"][StreamingTV]))
                _list.append(self.isNone(feature_dict["StreamingMovies"][StreamingMovies]))
                _list.append(self.isNone(feature_dict["Contract"][Contract]))
                _list.append(self.isNone(feature_dict["PaperlessBilling"][PaperlessBilling]))
                _list.append(self.isNone(feature_dict["PaymentMethod"][PaymentMethod]))
                _list.append(self.isNone(MonthlyCharges))
                _list.append(self.isNone(TotalCharges))
                _list.append(feature_dict["Churn"][Churn])
                fw.write(",".join(_list))
                fw.write("\n")
            return pd.read_csv(self.new_file)
        else:
            return pd.read_csv(self.new_file)
        
    # 拆分数据集
    def split_data(self):
        data = self.feature_transform()
        features = [x for x in data.keys() if x not in ["customerID", "Churn"]]
        train, test = train_test_split(data, test_size=0.1, random_state=40)
        return features, train, test
    
    # 使用GBDT算法训练模型
    def train_model(self):
        print("Start Train Model...")
        x_train = self.train[self.features]
        y_train = self.train["Churn"]
        gdbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, max_depth=6)
        gdbc.fit(x_train, y_train)
        print("End Train Model.")
        return gdbc
    
    # 效果评估
    def evaluate(self, gbdt):
        x_test = self.test[self.features]
        y_test = self.test["Churn"]
        y_pred = gbdt.predict_proba(x_test)
        new_y_pred = list()
        for y in y_pred:
            new_y_pred.append(1 if y[1] > 0.5 else 0)
        mse = mean_squared_error(y_test, new_y_pred)
        print("MSE: %.4f" % mse)
        accuracy = metrics.accuracy_score(y_test, new_y_pred)
        print("Accuracy: %.4g" % accuracy)
        auc = metrics.roc_auc_score(y_test, new_y_pred)
        print("AUC Score: %.4g" % auc)

数据集拆分

In [71]:
x = ready_data()
gdbc = x.train_model()

Start Train Model...
End Train Model.


In [72]:
x.evaluate(gdbc)

MSE: 0.2326
Accuracy: 0.7674
AUC Score: 0.691
