In [5]:
import sys
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler

In [6]:
def SetLogger(sc):
    """Lower Spark's console log verbosity to ERROR.

    Reaches log4j through the JVM gateway exposed on ``sc`` and sets the
    "org" logger, the "akka" logger and finally the root logger to
    ``Level.ERROR``.

    Args:
        sc: Object exposing ``_jvm.org.apache.log4j`` (the SparkContext in
            this file).
    """
    log4j = sc._jvm.org.apache.log4j
    error_level = log4j.Level.ERROR
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(error_level)
    log4j.LogManager.getRootLogger().setLevel(error_level)

def SetPath(sc):
    """Choose the base data location and publish it as the global ``Path``.

    When the first five characters of ``sc.master`` are "local" the local
    filesystem prefix is used; any other master selects the HDFS prefix.

    Args:
        sc: Object with a ``master`` string attribute.
    """
    global Path
    running_locally = sc.master[:5] == "local"
    # To run in cluster mode (Hadoop YARN or Spark standalone), first upload
    # the data files to the HDFS directory as described in the book.
    Path = ("file:/home/hduser/pythonwork/PythonProject/" if running_locally
            else "hdfs://hdnn:8020/user/hduser/")

def get_mapping(rdd, idx):
    """Build a ``{value: index}`` dictionary for one column of ``rdd``.

    Args:
        rdd: RDD-like object whose elements are indexable records.
        idx: Position of the column inside each record.

    Returns:
        The result of ``collectAsMap()`` on the distinct column values paired
        with their ``zipWithIndex()`` positions.
    """
    column_values = rdd.map(lambda record: record[idx])
    indexed_values = column_values.distinct().zipWithIndex()
    return indexed_values.collectAsMap()

def extract_label(record):
    """Return the last field of ``record`` converted to ``float``."""
    return float(record[-1])

def extract_features(field,categoriesMap,featureEnd):
    """Build the feature vector for one record.

    The vector is the one-hot encoding of the category stored in ``field[3]``
    (width ``len(categoriesMap)``) followed by the columns
    ``field[4:featureEnd]`` converted with ``convert_float``.

    A category that is absent from ``categoriesMap`` (for example one that
    only occurs in the data being predicted) is encoded as an all-zero
    one-hot segment instead of raising ``KeyError``.

    Args:
        field: Sequence of string fields for one record.
        categoriesMap: Dict mapping a category value to its one-hot index.
        featureEnd: Exclusive end index of the numeric feature columns.

    Returns:
        1-D numpy array: one-hot category segment followed by numeric values.
    """
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryIdx = categoriesMap.get(field[3])
    if categoryIdx is not None:
        categoryFeatures[categoryIdx] = 1
    # Use a distinct loop name; the original comprehension shadowed `field`.
    numericalFeatures = [convert_float(value) for value in field[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))

def convert_float(x):
    """Convert a raw field to a non-negative number.

    The placeholder "?" and any negative value both become ``0``; every other
    input is returned as ``float(x)``.
    """
    if x == "?":
        return 0
    value = float(x)
    if value < 0:
        return 0
    return value

In [7]:
def PrepareData(sc, seed=None):
    """Load data/train.tsv, build scaled LabeledPoints and split them three ways.

    Steps:
      1. Read the TSV under the global ``Path``, drop the header line, strip
         double quotes and split every line on tabs.
      2. Map the category column (index 3) to one-hot indices, build the label
         and feature RDDs, and rescale the features with
         ``StandardScaler(withMean=False, withStd=True)``.
      3. Randomly split the LabeledPoints with weights 8:1:1.

    NOTE(review): the fitted scaler is local to this function and is not
    returned, so callers cannot apply the same scaling to other data.

    Args:
        sc: SparkContext used to read the input file.
        seed: Optional seed forwarded to ``randomSplit``. ``None`` (the
            default) leaves the split unseeded, as before.

    Returns:
        Tuple ``(trainData, validationData, testData, categoriesMap)``.
    """
    #----------------------1. Load and convert the data-------------
    print("開始匯入資料...")
    rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共計：" + str(lines.count()) + "筆")
    #----------------------2. Build the RDD[LabeledPoint] used for training/evaluation-------------
    print("標準化之前：")
    categoriesMap = get_mapping(lines, 3)
    labelRDD = lines.map(lambda r: extract_label(r))
    # The last column is the label, so the features stop at len(r) - 1.
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap, len(r) - 1))
    # Print the sample vector on one line; the Python 2 `print x,` idiom
    # printed one value per line under Python 3.
    print("".join(str(i) + "," for i in featureRDD.first()))

    print("標準化之後：")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print("".join(str(i) + "," for i in ScalerFeatureRDD.first()))

    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))

    #----------------------3. Randomly split the data into 3 parts and return them-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1], seed=seed)
    print("將資料分trainData:" + str(trainData.count()) +
              "   validationData:" + str(validationData.count()) +
              "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # hand the splits back to the caller

    
def PredictData(sc,model,categoriesMap):
    """Predict the first 10 records of data/test.tsv and print the results.

    Reads the TSV under the global ``Path``, drops the header line, strips
    double quotes, splits on tabs, builds ``(url, features)`` pairs with
    ``extract_features`` and prints the model's prediction for each of the
    first ten pairs together with a description of the predicted class.

    NOTE(review): PrepareData trains on StandardScaler-transformed features,
    but the features built here are passed to the model unscaled. Confirm
    whether the fitted scaler should be applied here as well.

    Args:
        sc: SparkContext used to read the input file.
        model: Object whose ``predict(features)`` returns 0 or 1.
        categoriesMap: Category-to-index dict forwarded to ``extract_features``.
    """
    print("開始匯入資料...")
    withHeader = sc.textFile(Path+"data/test.tsv")
    headerLine = withHeader.first()
    records = (withHeader
               .filter(lambda row: row != headerLine)
               .map(lambda row: row.replace("\"", ""))
               .map(lambda row: row.split("\t")))
    print("共計：" + str(records.count()) + "筆")
    # Every column from index 4 onward is treated as a feature (end = len(r)).
    urlAndFeatures = records.map(
        lambda r: (r[0], extract_features(r, categoriesMap, len(r))))
    DescDict = {
        0: "暫時性網頁(ephemeral)",
        1: "長青網頁(evergreen)",
    }
    for url, features in urlAndFeatures.take(10):
        predictResult = model.predict(features)
        message = (" 網址：  " + str(url) + "\n" +
                   "             ==>預測:" + str(predictResult) +
                   " 說明:" + DescDict[predictResult] + "\n")
        print(message)

def evaluateModel(model, validationData):
    """Return the area under the ROC curve of ``model`` on ``validationData``.

    Args:
        model: Object whose ``predict`` accepts an RDD of feature vectors.
        validationData: RDD of records exposing ``features`` and ``label``.

    Returns:
        ``areaUnderROC`` of ``BinaryClassificationMetrics`` built from the
        ``(prediction, label)`` pairs, both cast to float.
    """
    predictions = model.predict(validationData.map(lambda point: point.features))
    labels = validationData.map(lambda point: point.label)
    scoreAndLabels = predictions.zip(labels).map(
        lambda pair: (float(pair[0]), float(pair[1])))
    return BinaryClassificationMetrics(scoreAndLabels).areaUnderROC


def trainEvaluateModel(trainData,validationData,lambdaParam):
    """Train a NaiveBayes model, score it on the validation set and time both.

    Args:
        trainData: Training data passed to ``NaiveBayes.train``.
        validationData: Data passed to ``evaluateModel``.
        lambdaParam: Second argument of ``NaiveBayes.train``.

    Returns:
        Tuple ``(AUC, duration, lambdaParam, model)``. ``duration`` is the
        wall-clock seconds spent on training plus evaluation.
    """
    tic = time()
    fitted = NaiveBayes.train(trainData, lambdaParam)
    auc = evaluateModel(fitted, validationData)
    elapsed = time() - tic
    report = "".join([
        "訓練評估：使用參數",
        " lambda=", str(lambdaParam),
        " 所需時間=", str(elapsed),
        " 結果AUC = ", str(auc),
    ])
    print(report)
    return (auc, elapsed, lambdaParam, fitted)


def evalParameter(trainData, validationData, evalparm,
                  lambdaParamList):
    """Train/evaluate one model per lambda value and chart AUC vs. duration.

    Args:
        trainData: Training data forwarded to ``trainEvaluateModel``.
        validationData: Validation data forwarded to ``trainEvaluateModel``.
        evalparm: Name of the parameter being evaluated; used as the chart
            title and x-axis label. It is honoured as passed; previously it
            was overwritten with a hard-coded string.
        lambdaParamList: Lambda values to try; also used as the DataFrame
            index.

    Returns:
        None. The chart is drawn through ``showchart`` with the y-axis
        limited to the range 0.5-0.7.
    """
    metrics = [trainEvaluateModel(trainData, validationData, lambdaParam)
               for lambdaParam in lambdaParamList]

    # Column name fixed: it used to be ' lambdaParam' with a leading space.
    df = pd.DataFrame(metrics, index=lambdaParamList,
                      columns=['AUC', 'duration', 'lambdaParam', 'model'])
    showchart(df, evalparm, 'AUC', 'duration', 0.5, 0.7)
    
def showchart(df,evalparm ,barData,lineData,yMin,yMax):
    """Draw a bar chart of one column with a second column overlaid as a line.

    Args:
        df: DataFrame holding the metrics.
        evalparm: Text used for the chart title and the x-axis label.
        barData: Column drawn as bars; also the y-axis label.
        lineData: Column drawn as a red line on a twin y-axis.
        yMin: Lower y-limit of the bar axis.
        yMax: Upper y-limit of the bar axis.
    """
    label_size = 12
    bar_axis = df[barData].plot(kind='bar', title=evalparm, figsize=(10, 6),
                                legend=True, fontsize=label_size)
    bar_axis.set_xlabel(evalparm, fontsize=label_size)
    bar_axis.set_ylim([yMin, yMax])
    bar_axis.set_ylabel(barData, fontsize=label_size)
    # The line shares the x-axis but gets its own y-scale.
    line_axis = bar_axis.twinx()
    line_style = dict(linestyle='-', marker='o', linewidth=2.0, color='r')
    line_axis.plot(df[[lineData]].values, **line_style)
    plt.show()
def evalAllParameter(training_RDD, validation_RDD, lambdaParamList):
    """Train one model per lambda value and return the one with the best AUC.

    The models are trained and scored on the RDDs passed as arguments.
    Previously the body ignored them and read the module-level ``trainData``
    and ``validationData`` instead.

    Args:
        training_RDD: Training data forwarded to ``trainEvaluateModel``.
        validation_RDD: Validation data forwarded to ``trainEvaluateModel``.
        lambdaParamList: Lambda values to try.

    Returns:
        The model (4th element of the ``trainEvaluateModel`` tuple) with the
        highest AUC; the earliest one wins a tie.

    Raises:
        ValueError: If ``lambdaParamList`` yields no values.
    """
    metrics = [trainEvaluateModel(training_RDD, validation_RDD, lambdaParam)
               for lambdaParam in lambdaParamList]
    if not metrics:
        raise ValueError("lambdaParamList must contain at least one value")
    # max() keeps the first of equal AUCs, like the stable descending sort did.
    bestParameter = max(metrics, key=lambda k: k[0])

    print("調校後最佳參數：lambdaParam:" + str(bestParameter[2]) +
             "  ,結果AUC = " + str(bestParameter[0]))
    return bestParameter[3]

    
def parametersEval(trainData, validationData):
    """Evaluate a fixed list of lambda values through ``evalParameter``.

    Args:
        trainData: Training data forwarded to ``evalParameter``.
        validationData: Validation data forwarded to ``evalParameter``.
    """
    print("----- 評估lambda參數使用 ---------")
    candidate_lambdas = [1.0, 3.0, 5.0, 15.0, 25.0, 30.0,
                         35.0, 40.0, 45.0, 50.0, 60.0]
    evalParameter(trainData, validationData, "lambdaParam",
                  lambdaParamList=candidate_lambdas)
         


def CreateSparkContext():
    """Return a configured SparkContext with quiet logging and ``Path`` set.

    Uses ``SparkContext.getOrCreate`` so the function works in both settings:
    inside a pyspark-launched notebook or shell it reuses the running context,
    and as a standalone script it creates one. Previously the function
    depended on a pre-existing global ``sc`` and failed with ``NameError``
    when none was defined.

    Returns:
        The active SparkContext.
    """
    sparkConf = (SparkConf()
                 .setAppName("RunNaiveBayesBinary")
                 .set("spark.ui.showConsoleProgress", "false"))
    sc = SparkContext.getOrCreate(sparkConf)
    print ("master="+sc.master)
    SetLogger(sc)
    SetPath(sc)
    return (sc)

In [8]:
if __name__ == "__main__":
    # Driver: prepare the data, train a baseline model, optionally run a
    # parameter study, then score the test split and predict on test.tsv.
    print("RunNaiveBayesBinary")
    sc = CreateSparkContext()

    print("==========資料準備階段===============")
    trainData, validationData, testData, categoriesMap = PrepareData(sc)
    # All three splits are reused below, so persist them.
    trainData.persist()
    validationData.persist()
    testData.persist()

    print("==========訓練評估階段===============")
    # Baseline model with lambda=60.0; replaced below only in "-a" mode.
    AUC, duration, lambdaParam, model = trainEvaluateModel(
        trainData, validationData, 60.0)

    # Exactly one command-line option is recognised:
    #   -e  chart the effect of lambda
    #   -a  search the lambda list and keep the best model
    if len(sys.argv) == 2:
        if sys.argv[1] == "-e":
            parametersEval(trainData, validationData)
        elif sys.argv[1] == "-a":
            print("-----所有參數訓練評估找出最好的參數組合---------")
            model = evalAllParameter(
                trainData, validationData,
                [1.0, 3.0, 5.0, 15.0, 25.0, 30.0, 35.0, 40.0, 45.0, 50.0, 60.0])

    print("==========測試階段===============")
    auc = evaluateModel(model, testData)
    print("使用test Data測試最佳模型,結果 AUC:" + str(auc))

    print("==========預測資料===============")
    PredictData(sc, model, categoriesMap)

RunNaiveBayesBinary
master=spark://spkma:7077
開始匯入資料...
共計：7395筆
標準化之前：
1.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.789131,
2.055555556,
0.676470588,
0.205882353,
0.047058824,
0.023529412,
0.443783175,
0.0,
0.0,
0.09077381,
0.0,
0.245831182,
0.003883495,
1.0,
1.0,
24.0,
0.0,
5424.0,
170.0,
8.0,
0.152941176,
0.079129575,

標準化之後：
3.08823447037,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
2.38210990583,
0.238469258742,
3.33017936032,
1.40301544772,
0.490307355432,
0.323968347813,
0.0777978299323,
0.0,
0.0,
2.19018963312,
0.0,
4.68369735582,
0.00206688652554,
2.05550839928,
2.11132763706,
1.1768686024,
0.0,
0.611125152837,
0.947253587774,
2.4743966777,
0.83444157063,
0.998721352144,
將資料分trainData:5958   validationData:734   testData:703
訓練評估：使用參數 lambda=60.0 所需時間=11.815093755722046 結果AUC = 0.6333377880731765
使用test Data測試最佳模型,結果 AUC:0.6521097560975609
開始匯入資料...
共計：3171筆
 網址：  http://www.lynnskitchenadventures.com/2009/04/homemade-enchilada-s