In [1]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics



In [2]:
sparkConf = SparkConf().setAppName("DecisionTreeBinary_Test")
sc = SparkContext(conf = sparkConf)
sc.master

'local[*]'

In [3]:
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )
logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

In [4]:
path = 'file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/'
data = sc.textFile(path+'data/train.tsv')
data.cache()

file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/data/train.tsv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

user id \t movie id \t rating \t time

In [5]:
header = data.first()
header

'"url"\t"urlid"\t"boilerplate"\t"alchemy_category"\t"alchemy_category_score"\t"avglinksize"\t"commonlinkratio_1"\t"commonlinkratio_2"\t"commonlinkratio_3"\t"commonlinkratio_4"\t"compression_ratio"\t"embed_ratio"\t"framebased"\t"frameTagRatio"\t"hasDomainLink"\t"html_ratio"\t"image_ratio"\t"is_news"\t"lengthyLinkDomain"\t"linkwordscore"\t"news_front_page"\t"non_markup_alphanum_characters"\t"numberOfLinks"\t"numwords_in_url"\t"parametrizedLinkRatio"\t"spelling_errors_ratio"\t"label"'

In [6]:
rowdata = data.filter(lambda x:x!=header)
rowdata.first()

'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"\t"4042"\t"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest 

In [27]:
data_train = rowdata.map(lambda x:x.replace('"','')
                        ).map(lambda x:x.replace('?','0'))
data_train.first()

'http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html\t4042\t{title:IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries,body:A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest provider of compute

In [28]:
data_train = data_train.map(lambda x:x.split('\t')[3:])
data_train.first()

['business',
 '0.789131',
 '2.055555556',
 '0.676470588',
 '0.205882353',
 '0.047058824',
 '0.023529412',
 '0.443783175',
 '0',
 '0',
 '0.09077381',
 '0',
 '0.245831182',
 '0.003883495',
 '1',
 '1',
 '24',
 '0',
 '5424',
 '170',
 '8',
 '0.152941176',
 '0.079129575',
 '0']

In [29]:
data_train.count()

7395

In [10]:
catgories_list = data_train.map(lambda x:x[0]).distinct().collect()
catgories_dict = {k:i for i,k in enumerate(catgories_list)}
catgories_dict

{'0': 7,
 'arts_entertainment': 0,
 'business': 6,
 'computer_internet': 1,
 'culture_politics': 10,
 'gaming': 9,
 'health': 13,
 'law_crime': 8,
 'recreation': 12,
 'religion': 4,
 'science_technology': 11,
 'sports': 2,
 'unknown': 5,
 'weather': 3}

In [12]:
def extract_features(data, _id, dict_,):
    data_tmp = np.zeros((14))
    data_tmp[dict_[data.pop(_id)]] = 1
    return np.concatenate((data_tmp, np.array(data),))

In [13]:
data_train = data_train.map(lambda x:extract_features(x,0,catgories_dict))
data_train.first()

array(['0.0', '0.0', '0.0', '0.0', '0.0', '0.0', '1.0', '0.0', '0.0',
       '0.0', '0.0', '0.0', '0.0', '0.0', '0.789131', '2.055555556',
       '0.676470588', '0.205882353', '0.047058824', '0.023529412',
       '0.443783175', '0', '0', '0.09077381', '0', '0.245831182',
       '0.003883495', '1', '1', '24', '0', '5424', '170', '8',
       '0.152941176', '0.079129575', '0'], 
      dtype='<U32')

In [14]:
data_train = data_train.map(lambda x:list(map(float, x,)))
data_train.first()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.789131,
 2.055555556,
 0.676470588,
 0.205882353,
 0.047058824,
 0.023529412,
 0.443783175,
 0.0,
 0.0,
 0.09077381,
 0.0,
 0.245831182,
 0.003883495,
 1.0,
 1.0,
 24.0,
 0.0,
 5424.0,
 170.0,
 8.0,
 0.152941176,
 0.079129575,
 0.0]

In [15]:
data_train = data_train.map(lambda x:LabeledPoint(x[-1],x[:-1]))
data_train.first()

LabeledPoint(0.0, [0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])

In [16]:
data_train.cache()

PythonRDD[15] at RDD at PythonRDD.scala:49

In [17]:
model = DecisionTree.trainClassifier(data_train, 2, {}, 'entropy', 6, 16)

In [18]:
pre = model.predict(data_train.map(lambda x:x.features))

In [19]:
scoreAndLabels = pre.zip(data_train.map(lambda p: p.label))

In [20]:
scoreAndLabels.first()

(1.0, 0.0)

In [21]:
metrics = BinaryClassificationMetrics(scoreAndLabels)

In [22]:
metrics.areaUnderROC

0.6724748064018485

In [23]:
metrics.areaUnderPR

0.6513146335472668

In [24]:
model.save(sc, path+'DecisionTreeBinary')

In [26]:
print(model.toDebugString())

DecisionTreeModel classifier of depth 6 with 119 nodes
  If (feature 31 <= 1577.5)
   If (feature 0 <= 0.5)
    If (feature 13 <= 0.5)
     If (feature 31 <= 1126.5)
      If (feature 23 <= 0.0325512595)
       If (feature 16 <= 0.7683272720000001)
        Predict: 0.0
       Else (feature 16 > 0.7683272720000001)
        Predict: 1.0
      Else (feature 23 > 0.0325512595)
       If (feature 15 <= 1.8682764605000002)
        Predict: 0.0
       Else (feature 15 > 1.8682764605000002)
        Predict: 0.0
     Else (feature 31 > 1126.5)
      If (feature 12 <= 0.5)
       If (feature 16 <= 0.578335731)
        Predict: 0.0
       Else (feature 16 > 0.578335731)
        Predict: 1.0
      Else (feature 12 > 0.5)
       If (feature 23 <= 0.036760416000000004)
        Predict: 1.0
       Else (feature 23 > 0.036760416000000004)
        Predict: 0.0
    Else (feature 13 > 0.5)
     If (feature 20 <= 0.641901116)
      If (feature 35 <= 0.061204979)
       If (feature 16 <= 0.4494026455)
    

In [None]:
# 不知道如何导入模型