In [9]:
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics

In [10]:
def SetLogger( sc ):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
    logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )
    logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

def SetPath(sc):
    global Path
    if sc.master[0:5]=="local" :
        Path="file:/home/hduser/pythonwork/PythonProject/"
    else:
        Path="hdfs://hdnn:8020/user/hduser/"
#如果您要在cluster模式執行(hadoop yarn 或Spark Stand alone)，請依照書上說明，先上傳檔案至HDFS目錄
    
def get_mapping(rdd, idx):
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

def extract_label(record):
    label=(record[-1])
    return float(label)-1

def convert_float(x):
    return (0 if x=="?" else float(x))

def extract_features(record,featureEnd):
    numericalFeatures=[convert_float(field)  for  field in record[0: featureEnd]]
    return  numericalFeatures

In [11]:
def PrepareData(sc): 
    #----------------------1.匯入並轉換資料-------------
    print("開始匯入資料...")
    rawData = sc.textFile(Path+"data/covtype.data")
    print("共計：" + str(rawData.count()) + "筆")
    lines = rawData.map(lambda x: x.split(","))
    #----------------------2.建立訓練評估所需資料 RDD[LabeledPoint]-------------
    labelpointRDD = lines.map(lambda r: LabeledPoint(
                                                     extract_label(r), 
                                                     extract_features(r,len(r) - 1)))
    #----------------------3.以隨機方式將資料分為3部份並且回傳-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("將資料分trainData:" + str(trainData.count())+\
             "   validationData:" + str(validationData.count()) +\
             "   testData:" + str(testData.count()))
    print (labelpointRDD.first())
    return (trainData, validationData, testData) 

def PredictData(sc,model): 
    #----------------------1.匯入並轉換資料-------------
    rawData = sc.textFile(Path+"data/covtype.data")
    print("共計：" + str(rawData.count()) + "筆")
    print("建立訓練評估所需資料 RDD...")
    lines = rawData.map(lambda x: x.split(","))
    #----------------------2.建立預測所需資料 RDD[LabeledPoint]-------------
    labelpointRDD = lines.map(lambda r: LabeledPoint(
                              extract_label(r), extract_features(r,len(r) - 1)))
    #----------------------3.進行預測並顯示結果-------------
    for lp in labelpointRDD.take(100):
        predict = model.predict(lp.features)
        label=lp.label
        features=lp.features
        result = ("正確" if  (label == predict) else "錯誤")
        print("土地條件：海拔:" + str(features[0]) + 
                 " 方位:" + str(features[1]) + 
                 " 斜率:" + str(features[2]) + 
                 " 水源垂直距離:" + str(features[3]) + 
                 " 水源水平距離:" + str(features[4]) + 
                 " 9點時陰影:" + str(features[5]) + 
                 "....==>預測:" + str(predict) +
                 " 實際:" + str(label) + "結果:" + result)
 


def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels=score.zip(validationData.map(lambda p: p.label))
    metrics = MulticlassMetrics(scoreAndLabels)
    accuracy = metrics.accuracy
    return( accuracy)
 

def trainEvaluateModel(trainData,validationData,
                                          impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainClassifier(trainData,\
                                    numClasses=7, categoricalFeaturesInfo={}, \
                                    impurity=impurityParm, 
                                    maxDepth=maxDepthParm, 
                                    maxBins=maxBinsParm)
    accuracy = evaluateModel(model, validationData)
    duration = time() - startTime
    print    ("訓練評估：使用參數" + \
                " impurityParm= %s"%impurityParm+ \
                " maxDepthParm= %s"%maxDepthParm+ \
                " maxBinsParm = %d."%maxBinsParm + \
                 " 所需時間=%d"%duration + \
                 " 結果accuracy = %f " % accuracy )
    return (accuracy,duration, impurityParm, maxDepthParm, maxBinsParm,model)




def evalParameter(trainData, validationData, evaparm,impurityList, maxDepthList, maxBinsList):
    metrics = [trainEvaluateModel(trainData, validationData,  impurity,numIter,  maxBins  ) 
               for impurity in impurityList for numIter in maxDepthList  for maxBins in maxBinsList ]
    if evaparm=="impurity":
        IndexList=impurityList[:]
    elif evaparm=="maxDepth":
        IndexList=maxDepthList[:]
    elif evaparm=="maxBins":
        IndexList=maxBinsList[:]
    df = pd.DataFrame(metrics,index=IndexList,
               columns=['accuracy', 'duration','impurity', 'maxDepth', 'maxBins','model'])
    
    showchart(df,evaparm,'accuracy','duration',0.6,1.0 )
    
    
def showchart(df,evalparm ,barData,lineData,yMin,yMax):
    ax = df[barData].plot(kind='bar', titl =evalparm,figsize=(10,6),legend=True, fontsize=12)
    ax.set_xlabel(evalparm,fontsize=12)
    ax.set_ylim([yMin,yMax])
    ax.set_ylabel(barData,fontsize=12)
    ax2 = ax.twinx()
    ax2.plot(df[[lineData ]].values, linestyle='-', marker='o', linewidth=2.0,color='r')
    plt.show()
    
    
def evalAllParameter(training_RDD, validation_RDD, impurityList, maxDepthList, maxBinsList):    
    metrics = [trainEvaluateModel(trainData, validationData,  impurity,numIter,  maxBins  ) 
                        for impurity in impurityList for numIter in maxDepthList  for maxBins in maxBinsList ]
    Smetrics = sorted(metrics, key=lambda k: k[0], reverse=True)
    bestParameter=Smetrics[0]
    print("調校後最佳參數：impurity:" + str(bestParameter[2]) + 
             "  ,maxDepth:" + str(bestParameter[3]) + 
            "  ,maxBins:" + str(bestParameter[4])   + 
            "  ,結果accuracy = " + str(bestParameter[0]))
    return bestParameter[5]

def  parametersEval(training_RDD, validation_RDD):
    print("----- 評估impurity參數使用 ---------")
    evalParameter(trainData, validationData,"impurity", 
                              impurityList=["gini", "entropy"],   
                              maxDepthList=[10],  
                              maxBinsList=[10 ])  

    print("----- 評估maxDepth參數使用 ---------")
    evalParameter(trainData, validationData,"maxDepth", 
                              impurityList=["gini"],                    
                              maxDepthList=[3, 5, 10, 15, 20, 25],    
                              maxBinsList=[10])   

    print("----- 評估maxBins參數使用 ---------")
    evalParameter(trainData, validationData,"maxBins", 
                              impurityList=["gini"],      
                              maxDepthList =[10],        
                              maxBinsList=[3, 5, 10, 50, 100, 200 ])
    


def CreateSparkContext():
    #sparkConf = SparkConf()                                                       \
    #                     .setAppName("RunDecisionTreeBinary")                         \
    #                     .set("spark.ui.showConsoleProgress", "false") 
    #sc = SparkContext(conf = sparkConf)
    print ("master="+sc.master)    
    SetLogger(sc)
    SetPath(sc)
    return (sc)

In [12]:
if __name__ == "__main__":
    print("RunDecisionTreeMulti")
    sc=CreateSparkContext()
    print("==========資料準備階段===============")
    (trainData, validationData, testData) =PrepareData(sc)
    trainData.persist(); validationData.persist(); testData.persist()
    print("==========訓練評估階段===============")
    (AUC,duration, impurityParm, maxDepthParm, maxBinsParm,model)= \
        trainEvaluateModel(trainData, validationData, "entropy", 15,50)
    if (len(sys.argv) == 2) and (sys.argv[1]=="-e"):
        parametersEval(trainData, validationData)
    elif   (len(sys.argv) == 2) and (sys.argv[1]=="-a"): 
        print("-----所有參數訓練評估找出最好的參數組合---------")  
        model=evalAllParameter(trainData, validationData,
                          ["gini", "entropy"],
                          [3, 5, 10, 15],
                          [3, 5, 10, 50 ])
                
    print("==========測試階段===============")
    accuracy = evaluateModel(model, testData)
    print("使用testata測試最佳模型,結果 accuracy:" + str(accuracy))
    print("==========預測資料===============")
    PredictData(sc, model)
    #print   model.toDebugString()
    

RunDecisionTreeMulti
master=spark://spkma:7077
開始匯入資料...
共計：581012筆
將資料分trainData:465043   validationData:58131   testData:57838
(4.0,[2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])
訓練評估：使用參數 impurityParm= entropy maxDepthParm= 15 maxBinsParm = 50. 所需時間=51 結果accuracy = 0.855430 
使用testata測試最佳模型,結果 accuracy:0.8541270445036135
共計：581012筆
建立訓練評估所需資料 RDD...
土地條件：海拔:2596.0 方位:51.0 斜率:3.0 水源垂直距離:258.0 水源水平距離:0.0 9點時陰影:510.0....==>預測:4.0 實際:4.0結果:正確
土地條件：海拔:2590.0 方位:56.0 斜率:2.0 水源垂直距離:212.0 水源水平距離:-6.0 9點時陰影:390.0....==>預測:4.0 實際:4.0結果:正確
土地條件：海拔:2804.0 方位:139.0 斜率:9.0 水源垂直距離:268.0 水源水平距離:65.0 9點時陰影:3180.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2785.0 方位:155.0 斜率:18.0 水源垂直距離:242.0 水源水平距離:118.0 9點時陰影:3090.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2595.0 方位:45.0 斜率:2.0 水源垂直距離:153.0 水源水平距離:-1.0 9點時陰影:391.0....==>預測:4.0 實際:

土地條件：海拔:3073.0 方位:173.0 斜率:12.0 水源垂直距離:108.0 水源水平距離:-3.0 9點時陰影:6836.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2978.0 方位:71.0 斜率:10.0 水源垂直距離:426.0 水源水平距離:85.0 9點時陰影:5742.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2860.0 方位:31.0 斜率:10.0 水源垂直距離:295.0 水源水平距離:98.0 9點時陰影:3644.0....==>預測:0.0 實際:0.0結果:正確
土地條件：海拔:3067.0 方位:164.0 斜率:11.0 水源垂直距離:85.0 水源水平距離:7.0 9點時陰影:6811.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2804.0 方位:72.0 斜率:5.0 水源垂直距離:543.0 水源水平距離:61.0 9點時陰影:3115.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2562.0 方位:59.0 斜率:3.0 水源垂直距離:0.0 水源水平距離:0.0 9點時陰影:1116.0....==>預測:1.0 實際:1.0結果:正確
土地條件：海拔:2567.0 方位:34.0 斜率:9.0 水源垂直距離:190.0 水源水平距離:16.0 9點時陰影:1136.0....==>預測:1.0 實際:1.0結果:正確
