In [1]:
# -*- coding: UTF-8 -*-
"""
数据描述统计
预测测试集
评价统计性质
"""

# 保证脚本与Python3兼容
from __future__ import print_function

import os   #读取数据文件
import sys
import pymysql 
from sqlalchemy import create_engine
 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split  #划分训练集测试集使用
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.linear_model import LogisticRegression ,LogisticRegressionCV
from sklearn.linear_model.coordinate_descent import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer 
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer #特征转换器
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.model_selection import GridSearchCV
%matplotlib inline 

import warnings

warnings.filterwarnings("ignore")

def readData(path):
    """
    使用pandas读取数据
    """
    data = pd.read_csv(path)
    cols = list(data.columns.values)
    return data[cols]

 
    
    
    
def toMysql(data,tablename):
    """
    dataframe数据保存到mysql
    """
    # pymysql初始化数据库连接
    db_info = {'user': 'root',
             'password': '123456',
            'host': '139.9.53.74',
            'port': 3306,
            'database': 'test'            }
    
    engine = create_engine('mysql+pymysql://%(user)s:%(password)s@%(host)s:%(port)d/%(database)s?charset=utf8' % db_info)
 
   # 将新建的DataFrame储存为MySQL中的数据表，不储存index列(index=False)
   # if_exists:
   # 1.fail:如果表存在，啥也不做
   # 2.replace:如果表存在，删了表，再建立一个新表，把数据插入
   # 3.append:如果表存在，把数据插入，如果表不存在创建一个表！！
    data.to_sql( tablename, con=engine, index=False, if_exists='replace')
    print("Write to MySQL successfully!")

def readMysql(tablename):
    """
    读取mysql数据
    """
    # pymysql初始化数据库连接
    db_info = {'user': 'root',
             'password': '123456',
            'host': '139.9.53.74',
            'port': 3306,
            'database': 'test'            }
    
    engine = create_engine('mysql+pymysql://%(user)s:%(password)s@%(host)s/%(database)s?charset=utf8'
                            % db_info, encoding='utf-8')
 
    sql = "SELECT * FROM {0} ".format(tablename)
    data = pd.read_sql(sql, con=  engine) 
    print("Read MySQL successfully!")
    return data
 
def visualData(data):
    """
    画直方图，直观了解数据
    """
    data.hist(
        rwidth=0.9, grid=True, figsize=(8, 8), alpha=0.6,bins=10, color="blue")
    plt.show()

def preprocess(data):
    """
    数据预处理阶段
    """
    
    

def transLabel(data):
    """
    将文字变量转化为数字变量
    """
    #data["TC_TYPE_1"] = pd.Categorical(data.TC_TYPE).codes
    #data["KD_TIME_LIMIT_1"] = pd.Categorical(data.KD_TIME_LIMIT).codes
   # data["KD_MEDIUM_1"] = pd.Categorical(data.KD_MEDIUM).codes
    #data["CUST_CREDIT_LEVEL_1"] = pd.Categorical(data.CUST_CREDIT_LEVEL).codes
   # data["KD_PAY_TYPE_1"] = pd.Categorical(data.KD_PAY_TYPE).codes
   # data["KD_RH_TYPE_1"] = pd.Categorical(data.KD_RH_TYPE).codes
    return data

def analyseData(data):
    """
    通过统计方法，了解数据性质
    """
    # 在Windows下运行此脚本需确保Windows下的命令提示符(cmd)能显示中文
    print("显示基本统计信息：")
    print(data.describe(include="all"))
    # 计算age, is_succ交叉报表,qcut按分位数分箱
    cross1 = pd.crosstab(pd.qcut(data["AGE"],  [0, .25, .5, .75, 1]), data["IS_SUCC"])
    print("显示age, is_succ交叉报表：")
    print(cross1)
    # 将交叉报表图形化
    props = lambda key: {"color": "0.45"} if  1 in key else {"color": "#C6E2FF"}
    mosaic(cross1[[1, 0]].stack(), properties=props)
    # 计算KD_INNET_MONTHS, IS_SUCC交叉报表
    cross2 = pd.crosstab(pd.cut(data["KD_INNET_MONTHS"], 5), data["IS_SUCC"])
    # 将交叉报表归一化，利于分析数据
    cross2_norm = cross2.div(cross2.sum(1).astype(float), axis=0)
    print("显示KD_INNET_MONTHS, IS_SUCC交叉报表：")
    print(cross2_norm)
    # 图形化归一化后的交叉报表
    cross2_norm.plot(kind="bar", color=["#C6E2FF", "0.45"], rot=0)
    
    #计算MAIN_GROUP_ID, IS_SUCC交叉报表
    cross3 = pd.crosstab(data["MAIN_GROUP_ID"], data["IS_SUCC"])
    print("显示MAIN_GROUP_ID, IS_SUCC交叉报表：")
    print(cross3)
    
    cross4 = pd.crosstab(data["TC_TYPE"], data["IS_SUCC"])
    print("显示TC_TYPE, IS_SUCC交叉报表：")
    print(cross4)
    
    cross5 = pd.crosstab(data["IS_BILL_USER"], data["IS_SUCC"])
    print("显示IS_BILL_USER, IS_SUCC交叉报表：")
    print(cross5)
    cross5.plot(kind="bar", color=["#C6E2FF", "0.45"], rot=0)
    
    cross6 = pd.crosstab(data["IS_AVAIL_USER"], data["IS_SUCC"])
    print("显示IS_AVAIL_USER, IS_SUCC交叉报表：")
    print(cross6)
    cross6.plot(kind="bar", color=["#C6E2FF", "0.45"], rot=0)
    
    cross7 = pd.crosstab(data["CUST_CREDIT_LEVEL"], data["IS_SUCC"])
    print("显示CUST_CREDIT_LEVEL, IS_SUCC交叉报表：")
    print(cross7)
    cross7.plot(kind="bar", color=["#C6E2FF", "0.45"], rot=0)
    
    plt.show()



#def trainModel(data):
    """
    搭建逻辑回归模型，并训练模型
    """
   # formula = "label_code ~ age + education_num + capital_gain + capital_loss + hours_per_week"
   # model = sm.Logit.from_formula(formula, data=data)
    #re = model.fit()
   # return re
    
 

def forward_train(data,label):
    
    remaining = set(data.columns)
    
    remaining.remove(label)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(label,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(label,
                                   ' + '.join(selected))
    model = sm.Logit.from_formula(formula, data=data)
    #model = smf.ols(formula, data).fit()
    re = model.fit()
    return re
     



def logitModel(X, Y):
    """
    搭建逻辑回归模型，并得到预测结果
    """
    # 为了消除惩罚项的干扰，将惩罚系数设为很大
    model = LogisticRegression(C=1e4)
    model.fit(X, Y.ravel())
    pred = model.predict(X)
    return pred

def modelSummary(re):
    """
    分析逻辑回归模型的统计性质
    """
    # 整体统计分析结果
    print(re.summary())
    # 用f test检验education_num的系数是否显著
    print("检验假设education_num的系数等于0：")
    print(re.f_test("education_num=0"))
    # 用f test检验两个假设是否同时成立
    print("检验假设education_num的系数等于0.32和hours_per_week的系数等于0.04同时成立：")
    print(re.f_test("education_num=0.32, hours_per_week=0.04"))
    

def visualize(ratios, predPositive, truePositive, aucs, accuracies):
    """
    将模型结果可视化
    """
    # 为在Matplotlib中显示中文，设置特殊字体
    plt.rcParams["font.sans-serif"]=["SimHei"]
    # 创建一个图形框
    fig = plt.figure(figsize=(12, 6), dpi=80)
    # 在图形框里画两幅图
    ax = fig.add_subplot(1, 2, 1)
    # 在Python3中，str不需要decode
    if sys.version_info[0] == 3:
        ax.plot(ratios, predPositive,
            label="%s" % "预测结果里类别1的个数")
        ax.plot(ratios, truePositive, "k--",
            label="%s" % "原始数据里类别1的个数")
    else:
        ax.plot(ratios, predPositive,
            label="%s" % "预测结果里类别1的个数".decode("utf-8"))
        ax.plot(ratios, truePositive, "k--",
            label="%s" % "原始数据里类别1的个数".decode("utf-8"))
    ax.set_xlim([0, 0.5])
    ax.invert_xaxis()
    legend = plt.legend(shadow=True, loc="best")
    ax1 = fig.add_subplot(1, 2, 2)
    # 在Python3中，str不需要decode
    if sys.version_info[0] == 3:
        ax1.plot(ratios, aucs, "r", label="%s" % "曲线下面积（AUC）")
        ax1.plot(ratios, accuracies, "k-.", label="%s" % "准确度（ACC）")
    else:
        ax1.plot(ratios, aucs, "r", label="%s" % "曲线下面积（AUC）".decode("utf-8"))
        ax1.plot(ratios, accuracies, "k-.", label="%s" % "准确度（ACC）".decode("utf-8"))
    ax1.set_xlim([0, 0.5])
    ax1.set_ylim([0.5, 1])
    ax1.invert_xaxis()
    legend = plt.legend(shadow=True, loc="best")
    plt.show()

    

def interpretModel(re):
    """
    理解模型结果

    参数
    ----
    re ：BinaryResults，训练好的逻辑回归模型
    """
    conf = re.conf_int()
    conf['OR'] = re.params
    # 计算各个变量对事件发生比的影响
    # conf里面的三列，分别对应着估计值的下界、上界和估计值本身
    conf.columns = ['2.5%', '97.5%', 'OR']
    print("各个变量对事件发生比的影响：")
    print(np.exp(conf))
    # 计算各个变量的边际效应
    print("各个变量的边际效应：")
    print(re.get_margeff(at="overall").summary())


def makePrediction(re, testSet, alpha=0.5):
    """
    使用训练好的模型对测试数据做预测
    """
    # 关闭pandas有关chain_assignment的警告
    pd.options.mode.chained_assignment = None
    # 计算事件发生的概率
    testSet["prob"] = re.predict(testSet)
    print("事件发生概率（预测概率）大于0.6的数据个数：")
    print(testSet[testSet["prob"] > 0.6].shape[0])  # 输出值为576
    print("事件发生概率（预测概率）大于0.5的数据个数：")
    print(testSet[testSet["prob"] > 0.5].shape[0])  # 输出值为834
    # 根据预测的概率，得出最终的预测
    testSet["pred"] = testSet.apply(lambda x: 1 if x["prob"] > alpha else 0, axis=1)
    return testSet



def evaluateModel(Y, pred):
    """
    评估模型效果，其中包括ACC，AUC以及预测结果中类别1的个数
    """
    predPositive = []
    truePositive = []
    aucs = []
    accuracies = []
    ratios = []
    for i in range(len(Y)):
        ratios.append(len(Y[i][Y[i]>0]) / float(len(Y[i])))
        predPositive.append(len(pred[i][pred[i]>0]))
        truePositive.append(len(Y[i][Y[i]>0]))
        fpr, tpr, _ = metrics.roc_curve(Y[i], pred[i])
        accuracies.append(metrics.accuracy_score(Y[i], pred[i]))
        aucs.append(metrics.auc(fpr, tpr))
    visualize(ratios, predPositive, truePositive, aucs, accuracies)

def evaluation(re):
    """
    计算预测结果的查准查全率以及f1

    参数
    ----
    re ：DataFrame，预测结果，里面包含两列：真实值‘IS_SUCC’、预测值‘pred’
    """
    bins = np.array([0, 0.5, 1])
    label = re["IS_SUCC"]
    pred = re["pred"]
    tn, fp, fn, tp = np.histogram2d(label, pred, bins=bins)[0].flatten()
    precision = tp / (tp + fp)  # 0.707
    recall = tp / (tp + fn)  # 0.374
    f1 = 2 * precision * recall / (precision + recall)  # 0.490
    print("查准率: %.3f, 查全率: %.3f, f1: %.3f" % (precision, recall, f1))
    
if __name__ == "__main__":
    # 设置显示格式
    pd.set_option('display.width', 1000)
    homePath = os.path.dirname(os.path.abspath('__file__'))
    # Windows下的存储路径与Linux并不相同
    if os.name == "nt":
        dataPath = "%s\\test-1.csv" % homePath
    else:
        dataPath = "%s/test-1.csv" % homePath
    data = readData(dataPath)
    
     
    data.iloc[:,[0,1]].to_csv('res1.csv',index=0)
    data.iloc[:,[0,2]].to_csv('res2.csv',index=0)
    data.iloc[:,[0,3]].to_csv('res3.csv',index=0)
    data.iloc[:,[0,4]].to_csv('res4.csv',index=0)
    data.iloc[:,[0,5]].to_csv('res5.csv',index=0)
    
    
    



ImportError: cannot import name 'ConvergenceWarning' from 'sklearn.linear_model.coordinate_descent' (/usr/local/lib/python3.7/site-packages/sklearn/linear_model/coordinate_descent.py)