In [None]:
'''
使用决策树，随机森林，梯度提升树分类
'''

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName("PySpark05").master("local[2]"). \
config("spark.sql.warehouse.dir","file:///E:/input/spark/warehouse").getOrCreate()

In [5]:
from pyspark.ml import Pipeline

In [6]:
# 分类：决策树，随机森林，梯度提升
from pyspark.ml.classification import RandomForestClassifier,GBTClassifier,DecisionTreeClassifier

In [35]:
# 回归：
from pyspark.ml.regression import DecisionTreeRegressor,RandomForestRegressor,GBTRegressor

In [9]:
data = spark.read.format('libsvm').load("file:///e:/Download/sample_libsvm_data.txt")

In [80]:
data.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 3 rows



In [10]:
data.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [54]:
train_data,test_data = data.randomSplit([0.7,0.3])

In [55]:
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier(numTrees=100)
gbt = GBTClassifier()

In [56]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [57]:
dtc_preds = dtc_model.transform(test_data) # 这里是transform,会产生三个新的栏位'rawPrediction', 'probability', 'prediction'
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [58]:
train_data.columns

['label', 'features']

In [59]:
dtc_preds.columns

['label', 'features', 'rawPrediction', 'probability', 'prediction']

In [60]:
dtc_preds.show(3)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|    [0.0,1.0]|  [0.0,1.0]|       1.0|
|  0.0|(692,[123,124,125...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
|  0.0|(692,[124,125,126...|   [31.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 3 rows



In [61]:
rfc_preds.show(3)

+-----+--------------------+-------------+-----------+----------+
|label|            features|rawPrediction|probability|prediction|
+-----+--------------------+-------------+-----------+----------+
|  0.0|(692,[98,99,100,1...|   [98.0,2.0]|[0.98,0.02]|       0.0|
|  0.0|(692,[123,124,125...|   [97.0,3.0]|[0.97,0.03]|       0.0|
|  0.0|(692,[124,125,126...|  [100.0,0.0]|  [1.0,0.0]|       0.0|
+-----+--------------------+-------------+-----------+----------+
only showing top 3 rows



In [62]:
gbt_preds.show(3)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[-0.7378600704627...|[0.18607473723157...|       1.0|
|  0.0|(692,[123,124,125...|[1.54350200272498...|[0.95635347857270...|       0.0|
|  0.0|(692,[124,125,126...|[1.54350200272498...|[0.95635347857270...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



In [63]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [64]:
acc_eval = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='label',metricName='f1')

In [65]:
acc_eval.evaluate(dtc_preds)

0.9598711755233494

In [66]:
# 随机森林看起来不错
for pred in (dtc_preds,rfc_preds,gbt_preds):
    #print(pred)
    print(acc_eval.evaluate(pred))

0.9598711755233494
1.0
0.9598711755233494


In [67]:
acc_eval2 = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='label',metricName='accuracy')

In [85]:
for pred in (dtc_preds,rfc_preds,gbt_preds):
    #print(pred)
    #print(acc_eval2.evaluate(pred))
    print("model accuracy: %f" %(acc_eval2.evaluate(pred)))

model accuracy: 0.960000
model accuracy: 1.000000
model accuracy: 0.960000


In [69]:
# 参数重要性
rfc_model.featureImportances

SparseVector(692, {100: 0.0007, 101: 0.0004, 131: 0.0005, 155: 0.0002, 159: 0.0005, 180: 0.0005, 182: 0.0005, 183: 0.0001, 206: 0.0013, 207: 0.003, 208: 0.0008, 209: 0.006, 231: 0.0013, 233: 0.0005, 235: 0.0007, 241: 0.0009, 244: 0.0144, 245: 0.0048, 263: 0.017, 264: 0.0015, 265: 0.003, 267: 0.0005, 271: 0.0052, 272: 0.007, 273: 0.0005, 289: 0.0036, 290: 0.0011, 291: 0.0018, 293: 0.0005, 295: 0.0015, 297: 0.0003, 300: 0.0084, 301: 0.0144, 302: 0.0024, 316: 0.0005, 317: 0.0004, 318: 0.0005, 322: 0.0066, 323: 0.0237, 324: 0.0055, 327: 0.0049, 328: 0.0058, 329: 0.0123, 330: 0.0046, 331: 0.0025, 341: 0.0005, 342: 0.0014, 345: 0.0079, 350: 0.022, 351: 0.0414, 352: 0.0069, 353: 0.0012, 357: 0.0226, 359: 0.0063, 371: 0.0006, 373: 0.0016, 374: 0.0006, 378: 0.0581, 379: 0.0263, 380: 0.0002, 382: 0.0005, 385: 0.0165, 386: 0.0071, 387: 0.0051, 401: 0.0038, 402: 0.0005, 405: 0.0357, 406: 0.037, 407: 0.0089, 409: 0.0013, 410: 0.002, 415: 0.0006, 426: 0.0011, 429: 0.0083, 430: 0.0003, 433: 0.048, 43