In [1]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

# 初始化sc和logger

In [2]:
sparkConf = SparkConf().setAppName("PipeLine_Test")
sc = SparkContext(conf = sparkConf)
sc.master

'local[*]'

In [3]:
sqlContext = SparkSession.builder.getOrCreate()

In [4]:
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )
logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

# 导入csv文件并查看基本信息

In [5]:
path = 'file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/'
data = sqlContext.read.csv(path+'data/train.tsv', header=True, sep='\t')
data.cache()

DataFrame[url: string, urlid: string, boilerplate: string, alchemy_category: string, alchemy_category_score: string, avglinksize: string, commonlinkratio_1: string, commonlinkratio_2: string, commonlinkratio_3: string, commonlinkratio_4: string, compression_ratio: string, embed_ratio: string, framebased: string, frameTagRatio: string, hasDomainLink: string, html_ratio: string, image_ratio: string, is_news: string, lengthyLinkDomain: string, linkwordscore: string, news_front_page: string, non_markup_alphanum_characters: string, numberOfLinks: string, numwords_in_url: string, parametrizedLinkRatio: string, spelling_errors_ratio: string, label: string]

In [6]:
# 模式信息
data.printSchema()

root
 |-- url: string (nullable = true)
 |-- urlid: string (nullable = true)
 |-- boilerplate: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: string (nullable = true)
 |-- avglinksize: string (nullable = true)
 |-- commonlinkratio_1: string (nullable = true)
 |-- commonlinkratio_2: string (nullable = true)
 |-- commonlinkratio_3: string (nullable = true)
 |-- commonlinkratio_4: string (nullable = true)
 |-- compression_ratio: string (nullable = true)
 |-- embed_ratio: string (nullable = true)
 |-- framebased: string (nullable = true)
 |-- frameTagRatio: string (nullable = true)
 |-- hasDomainLink: string (nullable = true)
 |-- html_ratio: string (nullable = true)
 |-- image_ratio: string (nullable = true)
 |-- is_news: string (nullable = true)
 |-- lengthyLinkDomain: string (nullable = true)
 |-- linkwordscore: string (nullable = true)
 |-- news_front_page: string (nullable = true)
 |-- non_markup_alphanum_characters: string (nulla

In [7]:
# 总数据量
data.count()

7395

In [8]:
# 第一条数据
data.first()

Row(url='http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html', urlid='4042', boilerplate='"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five year

user id \t movie id \t rating \t time

In [9]:
# 对 部分特征 进行替换操作, '?' -> '0'
data = data.replace('?','0', ['alchemy_category_score', 
                              'is_news', 'news_front_page', ])

In [10]:
# 生成 数值型 和 类别型 特征名称
header = np.array(data.columns[3:])
num_ids = [1,2,3,4,5,6,7,8, 10, 12,13, 16, 18,19, 21,22]
num_list = header[num_ids]
cat_ids = [0,9,11,14,15,17,20]
cat_list = header[cat_ids]

In [11]:
cat_list

array(['alchemy_category', 'framebased', 'hasDomainLink', 'is_news',
       'lengthyLinkDomain', 'news_front_page', 'numwords_in_url'], 
      dtype='<U30')

In [12]:
# 数据类型转换 string -> double
train_data = data.select(data.alchemy_category_score.cast('Double'),
                        data.avglinksize.cast('Double'),
                        data.commonlinkratio_1.cast('Double'),
                        data.commonlinkratio_2.cast('Double'),
                        data.commonlinkratio_3.cast('Double'),
                        data.commonlinkratio_4.cast('Double'),
                        data.compression_ratio.cast('Double'),
                        data.embed_ratio.cast('Double'),
                        data.frameTagRatio.cast('Double'),
                        data.html_ratio.cast('Double'),
                        data.image_ratio.cast('Double'),
                        data.linkwordscore.cast('Double'),
                        data.non_markup_alphanum_characters.cast('Double'),
                        data.numberOfLinks.cast('Double'),
                        data.parametrizedLinkRatio.cast('Double'),
                        data.spelling_errors_ratio.cast('Double'),
                        data.label.cast('Double'),
                        
                         'framebased', 
                         'hasDomainLink', 
                         'is_news',
                         'lengthyLinkDomain', 
                         'news_front_page', 
                         'numwords_in_url',
                         'alchemy_category', 
                        )
train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('label', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string')]

In [13]:
# 类别特征 数值化
for feature_name in cat_list:
    cat_indexer = StringIndexer(inputCol=feature_name, 
                                outputCol=feature_name+'_indexer')
    train_data_trafor = cat_indexer.fit(train_data)
    train_data = train_data_trafor.transform(train_data)
    
train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('label', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string'),
 ('alchemy_category_indexer', 'double'),
 ('framebased_indexer', 'double'),
 ('hasDomainLink_indexer', 'double'),
 ('is_news_indexer', 'double'),
 ('lengthyLinkDomain_indexer', 'double'),
 ('news_front_page_indexer', 'double'),
 ('nu

In [14]:
# onehot 数值化后的类别特征
for feature_name in cat_list:
    onehot_encoder = OneHotEncoder(dropLast=False, 
                                   inputCol=feature_name+'_indexer', 
                                   outputCol=feature_name+'_onehoted')
    train_data = onehot_encoder.transform(train_data)

train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('label', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string'),
 ('alchemy_category_indexer', 'double'),
 ('framebased_indexer', 'double'),
 ('hasDomainLink_indexer', 'double'),
 ('is_news_indexer', 'double'),
 ('lengthyLinkDomain_indexer', 'double'),
 ('news_front_page_indexer', 'double'),
 ('nu

# 生成训练数据并开始训练

In [15]:
# 挑选最终处理后的数据 生成 训练数据, 
# 并重命名为 'features', 数据类型为 'vector'
assembler_input = train_data.columns[:16] + train_data.columns[-7:]

assembler = VectorAssembler(inputCols=assembler_input, 
                            outputCol='features')

train_data = assembler.transform(train_data)

train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('label', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string'),
 ('alchemy_category_indexer', 'double'),
 ('framebased_indexer', 'double'),
 ('hasDomainLink_indexer', 'double'),
 ('is_news_indexer', 'double'),
 ('lengthyLinkDomain_indexer', 'double'),
 ('news_front_page_indexer', 'double'),
 ('nu

data_train = train_data.select('features')
data_train.take(1)

In [16]:
# 生成模型
model = DecisionTreeClassifier(featuresCol='features', 
                               labelCol='label')



In [17]:
# 训练模型
model = model.fit(train_data)


In [18]:
# 进行预测 
pre = model.transform(train_data)

In [19]:
pre.take(1)

[Row(alchemy_category_score=0.789131, avglinksize=2.055555556, commonlinkratio_1=0.676470588, commonlinkratio_2=0.205882353, commonlinkratio_3=0.047058824, commonlinkratio_4=0.023529412, compression_ratio=0.443783175, embed_ratio=0.0, frameTagRatio=0.09077381, html_ratio=0.245831182, image_ratio=0.003883495, linkwordscore=24.0, non_markup_alphanum_characters=5424.0, numberOfLinks=170.0, parametrizedLinkRatio=0.152941176, spelling_errors_ratio=0.079129575, label=0.0, framebased='0', hasDomainLink='0', is_news='1', lengthyLinkDomain='1', news_front_page='0', numwords_in_url='8', alchemy_category='business', alchemy_category_indexer=3.0, framebased_indexer=0.0, hasDomainLink_indexer=0.0, is_news_indexer=0.0, lengthyLinkDomain_indexer=0.0, news_front_page_indexer=0.0, numwords_in_url_indexer=7.0, alchemy_category_onehoted=SparseVector(14, {3: 1.0}), framebased_onehoted=SparseVector(1, {0: 1.0}), hasDomainLink_onehoted=SparseVector(2, {0: 1.0}), is_news_onehoted=SparseVector(2, {0: 1.0}), l

# PipeLine模型

In [20]:
# 生成PipeLine专用数据
train_pipeline = data.select(data.alchemy_category_score.cast('Double'),
                        data.avglinksize.cast('Double'),
                        data.commonlinkratio_1.cast('Double'),
                        data.commonlinkratio_2.cast('Double'),
                        data.commonlinkratio_3.cast('Double'),
                        data.commonlinkratio_4.cast('Double'),
                        data.compression_ratio.cast('Double'),
                        data.embed_ratio.cast('Double'),
                        data.frameTagRatio.cast('Double'),
                        data.html_ratio.cast('Double'),
                        data.image_ratio.cast('Double'),
                        data.linkwordscore.cast('Double'),
                        data.non_markup_alphanum_characters.cast('Double'),
                        data.numberOfLinks.cast('Double'),
                        data.parametrizedLinkRatio.cast('Double'),
                        data.spelling_errors_ratio.cast('Double'),
                        data.label.cast('Double'),
                        
                         'framebased', 
                         'hasDomainLink', 
                         'is_news',
                         'lengthyLinkDomain', 
                         'news_front_page', 
                         'numwords_in_url',
                         'alchemy_category', 
                        )
train_pipeline.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('label', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string')]

In [21]:
# PipeLine用到的所有模型

cat_indexer = [StringIndexer(inputCol=feature_name, 
                             outputCol=feature_name+'_indexer') 
               for feature_name in cat_list]

onehot_encoder = [OneHotEncoder(dropLast=False, 
                               inputCol=feature_name+'_indexer', 
                               outputCol=feature_name+'_onehoted')
                  for feature_name in cat_list]

assembler_input = ['alchemy_category_score',
                     'avglinksize',
                     'commonlinkratio_1',
                     'commonlinkratio_2',
                     'commonlinkratio_3',
                     'commonlinkratio_4',
                     'compression_ratio',
                     'embed_ratio',
                     'frameTagRatio',
                     'html_ratio',
                     'image_ratio',
                     'linkwordscore',
                     'non_markup_alphanum_characters',
                     'numberOfLinks',
                     'parametrizedLinkRatio',
                     'spelling_errors_ratio',
                     'alchemy_category_onehoted',
                     'framebased_onehoted',
                     'hasDomainLink_onehoted',
                     'is_news_onehoted',
                     'lengthyLinkDomain_onehoted',
                     'news_front_page_onehoted',
                     'numwords_in_url_onehoted',]

assembler = VectorAssembler(inputCols=assembler_input, 
                            outputCol='features')

model = DecisionTreeClassifier(featuresCol='features', 
                               labelCol='label')

params  = ParamGridBuilder().addGrid(model.impurity,['gini', 'entropy']
                           ).addGrid(model.maxDepth,[5,10,15]
                           ).addGrid(model.maxBins,[10,15,20]
                           ).build()

evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', 
                                          labelCol='label', 
                                          metricName='areaUnderROC')

cv_model = CrossValidator(estimator=model, 
                          evaluator=evaluator, 
                          estimatorParamMaps=params, 
                          numFolds=3)


In [22]:
# 生成PipeLine模型
pipeline = Pipeline(stages=cat_indexer+onehot_encoder+[assembler, model])

In [23]:
# 查看流程
pipeline.getStages()

[StringIndexer_430ab984b7ef6aa2b3b2,
 StringIndexer_43588b1daa763a1cea06,
 StringIndexer_4adba765538d7d76c28a,
 StringIndexer_4907a28eec2ace0ddac2,
 StringIndexer_44df98ae795bbe35370d,
 StringIndexer_41d7adf1d73ee523203a,
 StringIndexer_46999ea19b13434b772e,
 OneHotEncoder_48529778db27e384f510,
 OneHotEncoder_466eae27cc743b02db30,
 OneHotEncoder_41b4ab0a5b3b04ce739a,
 OneHotEncoder_488d935061a316d20527,
 OneHotEncoder_40878877af29ce65f54e,
 OneHotEncoder_471aaab2b625d304fa9f,
 OneHotEncoder_474eabd1abf96df9dfaa,
 VectorAssembler_4885b797714af9c99ebc,
 DecisionTreeClassifier_487e94a84d352873f652]

In [24]:
# 训练pipeline_model
pipeline_model = pipeline.fit(train_pipeline)

In [25]:
# 查看pipeline_model的 各个阶段
pipeline_model.stages

[StringIndexer_430ab984b7ef6aa2b3b2,
 StringIndexer_43588b1daa763a1cea06,
 StringIndexer_4adba765538d7d76c28a,
 StringIndexer_4907a28eec2ace0ddac2,
 StringIndexer_44df98ae795bbe35370d,
 StringIndexer_41d7adf1d73ee523203a,
 StringIndexer_46999ea19b13434b772e,
 OneHotEncoder_48529778db27e384f510,
 OneHotEncoder_466eae27cc743b02db30,
 OneHotEncoder_41b4ab0a5b3b04ce739a,
 OneHotEncoder_488d935061a316d20527,
 OneHotEncoder_40878877af29ce65f54e,
 OneHotEncoder_471aaab2b625d304fa9f,
 OneHotEncoder_474eabd1abf96df9dfaa,
 VectorAssembler_4885b797714af9c99ebc,
 DecisionTreeClassificationModel (uid=DecisionTreeClassifier_487e94a84d352873f652) of depth 5 with 61 nodes]

In [26]:
# 查看决策树模型的各个决策点
print(pipeline_model.stages[-1].toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_487e94a84d352873f652) of depth 5 with 61 nodes
  If (feature 12 <= 1577.5)
   If (feature 18 in {1.0})
    If (feature 10 <= 0.33354088800000004)
     If (feature 9 <= 0.16549340899999998)
      If (feature 0 <= 0.5939270000000001)
       Predict: 1.0
      Else (feature 0 > 0.5939270000000001)
       Predict: 0.0
     Else (feature 9 > 0.16549340899999998)
      If (feature 15 <= 0.12702797500000002)
       Predict: 0.0
      Else (feature 15 > 0.12702797500000002)
       Predict: 0.0
    Else (feature 10 > 0.33354088800000004)
     If (feature 9 <= 0.3400148705)
      If (feature 3 <= 0.042781972)
       Predict: 0.0
      Else (feature 3 > 0.042781972)
       Predict: 0.0
     Else (feature 9 > 0.3400148705)
      Predict: 1.0
   Else (feature 18 not in {1.0})
    If (feature 20 in {1.0})
     If (feature 6 <= 0.641901116)
      If (feature 15 <= 0.056647563)
       Predict: 0.0
      Else (feature 15 > 0.056647563)
       

In [27]:
# 生成预测值
predict = pipeline_model.transform(train_pipeline)

In [28]:
predict.columns

['alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'frameTagRatio',
 'html_ratio',
 'image_ratio',
 'linkwordscore',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label',
 'framebased',
 'hasDomainLink',
 'is_news',
 'lengthyLinkDomain',
 'news_front_page',
 'numwords_in_url',
 'alchemy_category',
 'alchemy_category_indexer',
 'framebased_indexer',
 'hasDomainLink_indexer',
 'is_news_indexer',
 'lengthyLinkDomain_indexer',
 'news_front_page_indexer',
 'numwords_in_url_indexer',
 'alchemy_category_onehoted',
 'framebased_onehoted',
 'hasDomainLink_onehoted',
 'is_news_onehoted',
 'lengthyLinkDomain_onehoted',
 'news_front_page_onehoted',
 'numwords_in_url_onehoted',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [29]:
# 查看经过PipeLine model训练转换之后的 数据
predict.take(1)

[Row(alchemy_category_score=0.789131, avglinksize=2.055555556, commonlinkratio_1=0.676470588, commonlinkratio_2=0.205882353, commonlinkratio_3=0.047058824, commonlinkratio_4=0.023529412, compression_ratio=0.443783175, embed_ratio=0.0, frameTagRatio=0.09077381, html_ratio=0.245831182, image_ratio=0.003883495, linkwordscore=24.0, non_markup_alphanum_characters=5424.0, numberOfLinks=170.0, parametrizedLinkRatio=0.152941176, spelling_errors_ratio=0.079129575, label=0.0, framebased='0', hasDomainLink='0', is_news='1', lengthyLinkDomain='1', news_front_page='0', numwords_in_url='8', alchemy_category='business', alchemy_category_indexer=3.0, framebased_indexer=0.0, hasDomainLink_indexer=0.0, is_news_indexer=0.0, lengthyLinkDomain_indexer=0.0, news_front_page_indexer=0.0, numwords_in_url_indexer=7.0, alchemy_category_onehoted=SparseVector(14, {3: 1.0}), framebased_onehoted=SparseVector(1, {0: 1.0}), hasDomainLink_onehoted=SparseVector(2, {0: 1.0}), is_news_onehoted=SparseVector(2, {0: 1.0}), l

In [30]:
# 查看生成的特征及取值
predict.select('rawPrediction', 'probability', 'prediction').show(5)

+--------------+--------------------+----------+
| rawPrediction|         probability|prediction|
+--------------+--------------------+----------+
| [130.0,527.0]|[0.19786910197869...|       1.0|
|  [52.0,113.0]|[0.31515151515151...|       1.0|
| [843.0,596.0]|[0.58582348853370...|       0.0|
|[924.0,1268.0]|[0.42153284671532...|       1.0|
|   [78.0,18.0]|     [0.8125,0.1875]|       0.0|
+--------------+--------------------+----------+
only showing top 5 rows



In [31]:
# 生成评价数据
prediction = evaluator.evaluate(predict)
prediction

0.6646208289915445

In [32]:
# 生成cv_pipeline
cv_pipeline =  Pipeline(stages=cat_indexer+onehot_encoder+[assembler, cv_model])

In [33]:
# 训练cv_pipeline
cv_pipeline_model = cv_pipeline.fit(train_pipeline)

In [34]:
# 查看cv_pipeline得到的最好的模型
cv_pipeline_model.stages[-1].bestModel

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_487e94a84d352873f652) of depth 5 with 59 nodes

In [35]:
# 评价cv_pipeline
cv_predict = cv_pipeline_model.transform(train_pipeline)
cv_auc = evaluator.evaluate(cv_predict)
cv_auc

0.6651549092638133

In [36]:
sc.stop()