In [1]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
def SetPath(sc):
    global Path
    if sc.master[0:5]=="local" :
        Path="file:/home/hduser/workspace/Classification/"
    else:   
        Path="hdfs://master:9000/user/hduser/"


In [3]:
def extract_label(record):
    label=(record[-1])
    return float(label)-1


In [4]:
def extract_features(record,featureEnd):
    # 这里的featureEnd就是 len(r) - 1
    numericalFeatures=[convert_float(field)  for  field in record[0: featureEnd]]
    return  numericalFeatures

In [5]:
def convert_float(x):
    return (0 if x=="?" else float(x))


In [6]:
def PrepareData(sc): 
    #----------------------1.导入并转换数据-------------
    print("开始导入数据...")
    rawData = sc.textFile(Path+"data/covtype.data")
    print("共计：" + str(rawData.count()) + "项")
    lines = rawData.map(lambda x: x.split(","))
    #----------------------2.建立训练评估所需数据 RDD[LabeledPoint]-------------
    print("建立训练评估所需数据...")
    labelpointRDD = lines.map(lambda r: LabeledPoint(
                                                     extract_label(r), 
                                                     extract_features(r,len(r) - 1)))
#     categoriesMap = lines.map(lambda fields: fields[3]).\
#                                    distinct().zipWithIndex().collectAsMap()
#     labelpointRDD = lines.map( lambda r:
#                LabeledPoint(
#                       extract_label(r), 
#                       extract_features(r,len(r) - 1)
    

    #----------------------3.以随机方式将数据分为3个部分并且返回-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData:" + str(trainData.count())+\
             "   validationData:" + str(validationData.count()) +\
             "   testData:" + str(testData.count()))
    print labelpointRDD.first()
    return (trainData, validationData, testData) 

In [7]:
def PredictData(sc,model): 
    #----------------------1.导入并转换数据-------------
    rawData = sc.textFile(Path+"data/covtype.data")
    print("共计：" + str(rawData.count()) + "项")
    print("建立训练评估所需数据 RDD...")
    lines = rawData.map(lambda x: x.split(","))
    #----------------------2.建立预测所需数据 RDD[LabeledPoint]-------------
    labelpointRDD = lines.map(lambda r: LabeledPoint(
                              extract_label(r), extract_features(r,len(r) - 1)))
    #----------------------3.进行预测并显示结果-------------
    for lp in labelpointRDD.take(100):
        predict = model.predict(lp.features)
        label=lp.label
        features=lp.features
        result = ("正确" if  (label == predict) else "错误")
        print("土地条件：海拔:" + str(features[0]) + 
                 " 方位:" + str(features[1]) + 
                 " 斜率:" + str(features[2]) + 
                 " 水源垂直距离:" + str(features[3]) + 
                 " 水源水平距离:" + str(features[4]) + 
                 " 9点时阴影:" + str(features[5]) + 
                 "....==>预测:" + str(predict) +
                 " 实际:" + str(label) + "结果:" + result)

In [8]:
def trainEvaluateModel(trainData,validationData,impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainClassifier(trainData,\
                                    numClasses=7, categoricalFeaturesInfo={}, \
                                    impurity=impurityParm, 
                                    maxDepth=maxDepthParm, 
                                    maxBins=maxBinsParm)
    accuracy = evaluateModel(model, validationData)
    duration = time() - startTime
    print    "训练评估：使用参数" + \
                " impurityParm= %s"%impurityParm+ \
                " maxDepthParm= %s"%maxDepthParm+ \
                " maxBinsParm = %d."%maxBinsParm + \
                 " 所需时间=%d"%duration + \
                 " 结果accuracy = %f " % accuracy 
    return (accuracy,duration, impurityParm, maxDepthParm, maxBinsParm,model)

In [9]:
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels=score.zip(validationData.map(lambda p: p.label))
    metrics = MulticlassMetrics(scoreAndLabels)
    accuracy = metrics.accuracy
    return( accuracy)

In [10]:
def evalParameter(trainData, validationData, evaparm,impurityList, maxDepthList, maxBinsList):
    metrics = [trainEvaluateModel(trainData, validationData,  impurity,numIter,  maxBins  ) 
               for impurity in impurityList for numIter in maxDepthList  for maxBins in maxBinsList ]
    if evaparm=="impurity":
        IndexList=impurityList[:]
    elif evaparm=="maxDepth":
        IndexList=maxDepthList[:]
    elif evaparm=="maxBins":
        IndexList=maxBinsList[:]
    df = pd.DataFrame(metrics,index=IndexList,
               columns=['accuracy', 'duration','impurity', 'maxDepth', 'maxBins','model'])
    
    showchart(df,evaparm,'accuracy','duration',0.6,1.0 )

In [11]:
def evalAllParameter(training_RDD, validation_RDD, impurityList, maxDepthList, maxBinsList):    
    metrics = [trainEvaluateModel(trainData, validationData,  impurity,numIter,  maxBins  ) 
                        for impurity in impurityList for numIter in maxDepthList  for maxBins in maxBinsList ]
    Smetrics = sorted(metrics, key=lambda k: k[0], reverse=True)
    bestParameter=Smetrics[0]
    print("调校后最佳参数：impurity:" + str(bestParameter[2]) + 
             "  ,maxDepth:" + str(bestParameter[3]) + 
            "  ,maxBins:" + str(bestParameter[4])   + 
            "  ,结果accuracy = " + str(bestParameter[0]))
    return bestParameter[5]

In [12]:
def showchart(df,evalparm ,barData,lineData,yMin,yMax):
    ax = df[barData].plot(kind='bar', titl =evalparm,figsize=(10,6),legend=True, fontsize=12)
    ax.set_xlabel(evalparm,fontsize=12)
    ax.set_ylim([yMin,yMax])
    ax.set_ylabel(barData,fontsize=12)
    ax2 = ax.twinx()
    ax2.plot(df[[lineData ]].values, linestyle='-', marker='o', linewidth=2.0,color='r')
    plt.show()

In [13]:
def CreateSparkContext():
    sparkConf = SparkConf()                                                       \
                         .setAppName("DecisionTreeMulti")                         \
                         .set("spark.ui.showConsoleProgress", "false") 
    sc = SparkContext(conf = sparkConf)
    print ("master="+sc.master)    
    SetPath(sc)
    return (sc)

In [14]:
if __name__ == "__main__":
    print("RunDecisionTreeMulti")
    sc.stop()
    sc=CreateSparkContext()
    print("==========数据准备阶段===============")
    (trainData, validationData, testData) =PrepareData(sc)
    trainData.persist(); validationData.persist(); testData.persist()
    print("==========训练评估阶段===============")
    (AUC,duration, impurityParm, maxDepthParm, maxBinsParm,model)= \
        trainEvaluateModel(trainData, validationData, "entropy", 15,50)
    flag_mark = 2
    
    if flag_mark == 1:
        parametersEval(trainData, validationData)
    elif flag_mark !=1:
        print("-----所有参数训练评估找出最好的参数组合---------")  
        model=evalAllParameter(trainData, validationData,
                          ["gini", "entropy"],
                          [3, 5, 10, 15],
                          [3, 5, 10, 50 ])
                
    print("==========测试阶段===============")
    accuracy = evaluateModel(model, testData)
    print("使用test Data测试最佳模型,结果 accuracy:" + str(accuracy))
    print("==========预测数据===============")
    PredictData(sc, model)
    #print   model.toDebugString()

RunDecisionTreeMulti
master=local[*]
开始导入数据...
共计：581012项
建立训练评估所需数据...
将数据分trainData:465143   validationData:57737   testData:58132
(4.0,[2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])
训练评估：使用参数 impurityParm= entropy maxDepthParm= 15 maxBinsParm = 50. 所需时间=44 结果accuracy = 0.853266 
-----所有参数训练评估找出最好的参数组合---------
训练评估：使用参数 impurityParm= gini maxDepthParm= 3 maxBinsParm = 3. 所需时间=29 结果accuracy = 0.659283 
训练评估：使用参数 impurityParm= gini maxDepthParm= 3 maxBinsParm = 5. 所需时间=29 结果accuracy = 0.675494 
训练评估：使用参数 impurityParm= gini maxDepthParm= 3 maxBinsParm = 10. 所需时间=28 结果accuracy = 0.670350 
训练评估：使用参数 impurityParm= gini maxDepthParm= 3 maxBinsParm = 50. 所需时间=29 结果accuracy = 0.673814 
训练评估：使用参数 impurityParm= gini maxDepthParm= 5 maxBinsParm = 3. 所需时间=28 结果accuracy = 0.683981 
训练评估：使用参数 impurityParm= gini 

土地条件：海拔:2489.0 方位:11.0 斜率:4.0 水源垂直距离:175.0 水源水平距离:13.0 9点时阴影:840.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2489.0 方位:42.0 斜率:6.0 水源垂直距离:162.0 水源水平距离:13.0 9点时阴影:810.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2490.0 方位:75.0 斜率:5.0 水源垂直距离:134.0 水源水平距离:17.0 9点时阴影:810.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2952.0 方位:107.0 斜率:11.0 水源垂直距离:42.0 水源水平距离:7.0 9点时阴影:5845.0....==>预测:0.0 实际:1.0结果:错误
土地条件：海拔:2705.0 方位:90.0 斜率:8.0 水源垂直距离:134.0 水源水平距离:22.0 9点时阴影:2023.0....==>预测:1.0 实际:1.0结果:正确
土地条件：海拔:2507.0 方位:40.0 斜率:7.0 水源垂直距离:153.0 水源水平距离:10.0 9点时阴影:930.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2500.0 方位:49.0 斜率:14.0 水源垂直距离:150.0 水源水平距离:27.0 9点时阴影:870.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2493.0 方位:63.0 斜率:10.0 水源垂直距离:127.0 水源水平距离:20.0 9点时阴影:840.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2509.0 方位:59.0 斜率:7.0 水源垂直距离:134.0 水源水平距离:10.0 9点时阴影:900.0....==>预测:4.0 实际:4.0结果:正确
土地条件：海拔:2919.0 方位:13.0 斜率:13.0 水源垂直距离:90.0 水源水平距离:6.0 9点时阴影:5321.0....==>预测:0.0 实际:0.0结果:正确
土地条件：海拔:2740.0 方位:54.0 斜率:6.0 水源垂直距离:218.0 水源水平距离:42.0 9点时阴影:2287.0....==>预测