In [2]:
'''
Pipeline + model selection
'''
# findspark is initialised before the pyspark import below; keep this order.
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the notebook's SparkSession: local master with 2 threads.
# NOTE(review): the warehouse directory is a hard-coded absolute Windows path.
spark = (
    SparkSession.builder
    .appName("PySpark_DataFrame")
    .master("local[2]")
    .config("spark.sql.warehouse.dir", "file:///E:/input/spark/warehouse")
    .getOrCreate()
)

In [44]:
from pyspark.ml import Pipeline # pipeline
from pyspark.ml.classification import LogisticRegression # logistic regression algorithm
from pyspark.ml.feature import HashingTF,Tokenizer # feature transformers
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator,TrainValidationSplit # parameter and model selection
from pyspark.ml.evaluation import BinaryClassificationEvaluator # binary classification evaluation

# Toy labelled corpus: every text containing the token "spark" is labelled 1.0,
# every other text 0.0. Row ids are simply the positions 0..11.
labelled_texts = [
    ("a b c d e spark", 1.0),
    ("b d", 0.0),
    ("spark f g h", 1.0),
    ("hadoop mapreduce", 0.0),
    ("b spark who", 1.0),
    ("g d a y", 0.0),
    ("spark fly", 1.0),
    ("was mapreduce", 0.0),
    ("e spark program", 1.0),
    ("a e c l", 0.0),
    ("spark compile", 1.0),
    ("hadoop software", 0.0),
]
train = spark.createDataFrame(
    [(row_id, text, label) for row_id, (text, label) in enumerate(labelled_texts)],
    ["id", "text", "label"],
)


In [8]:
# Stage 1: split the raw "text" column into a "words" token column.
tokenizer = Tokenizer(inputCol="text", outputCol='words')
# Stage 2: turn the tokens into a "features" vector via term-frequency hashing
# (plain TF only -- no IDF stage is applied in this pipeline).
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
# Stage 3: the classifier. regParam set here is only a starting value; the
# parameter grid built further down overrides it during model selection.
lr = LogisticRegression(maxIter=20, regParam=0.01)
# Fix: the stage list previously referenced the misspelled name `hashtingTF`,
# which raised NameError on a fresh kernel.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

In [42]:
# Parameter grid: 3 values of numFeatures x 2 values of regParam = 6 candidates.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

# 2-fold cross-validation over the whole pipeline
# (estimator, candidate param maps, evaluator, number of folds).
binary_evaluator = BinaryClassificationEvaluator()
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=binary_evaluator,
    numFolds=2,
)

# Fit on the training data; the resulting model uses the best parameters found.
cvModel = crossval.fit(train)

In [41]:
# Pair each average cross-validation metric with the parameter map that
# produced it (the two sequences line up one-to-one). Materialised as a list so
# `pp` is still usable after printing instead of being an exhausted iterator.
pp = list(zip(cvModel.avgMetrics, paramGrid))
# Bound accessor for the parameter map of stage 1 (the fitted HashingTF stage)
# of the best model; call tf() to inspect it.
tf = cvModel.bestModel.stages[1].extractParamMap
print(pp)

[(1.0, {Param(parent='HashingTF_497594a8bbb4b02a1826', name='numFeatures', doc='number of features.'): 10, Param(parent='LogisticRegression_4aa7b7a7857c802d97d7', name='regParam', doc='regularization parameter (>= 0).'): 0.1}), (0.8333333333333334, {Param(parent='HashingTF_497594a8bbb4b02a1826', name='numFeatures', doc='number of features.'): 10, Param(parent='LogisticRegression_4aa7b7a7857c802d97d7', name='regParam', doc='regularization parameter (>= 0).'): 0.01}), (1.0, {Param(parent='HashingTF_497594a8bbb4b02a1826', name='numFeatures', doc='number of features.'): 100, Param(parent='LogisticRegression_4aa7b7a7857c802d97d7', name='regParam', doc='regularization parameter (>= 0).'): 0.1}), (0.8333333333333334, {Param(parent='HashingTF_497594a8bbb4b02a1826', name='numFeatures', doc='number of features.'): 100, Param(parent='LogisticRegression_4aa7b7a7857c802d97d7', name='regParam', doc='regularization parameter (>= 0).'): 0.01}), (1.0, {Param(parent='HashingTF_497594a8bbb4b02a1826', nam

In [14]:
# Unlabelled documents to score with the fitted model (no "label" column).
test_columns = ["id", "text"]
test_rows = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop"),
]
test = spark.createDataFrame(test_rows, test_columns)

In [15]:
# Score the test documents with the cross-validated model and print, per row,
# the input text alongside the class probabilities and the predicted label.
prediction = cvModel.transform(test)
shown_columns = ["id", "text", "probability", "prediction"]
selected = prediction.select(*shown_columns)
scored_rows = selected.collect()
for row in scored_rows:
    print(row)

Row(id=4, text='spark i j k', probability=DenseVector([0.2665, 0.7335]), prediction=1.0)
Row(id=5, text='l m n', probability=DenseVector([0.9204, 0.0796]), prediction=0.0)
Row(id=6, text='mapreduce spark', probability=DenseVector([0.4438, 0.5562]), prediction=1.0)
Row(id=7, text='apache hadoop', probability=DenseVector([0.8587, 0.1413]), prediction=0.0)


In [None]:
# Train-validation split, used for hyper-parameter tuning.
# Fixes relative to the previous version of this cell:
#   * RegressionEvaluator was used without being imported (NameError).
#   * The estimator was the LogisticRegression classifier `lr`, although the
#     data loaded here is a regression data set and the evaluator is a
#     regression evaluator.
#   * The reused `paramGrid` carried a hashingTF.numFeatures axis, which is not
#     a parameter of the estimator being tuned here.
# NOTE(review): the LinearRegression settings and grid below follow the
# official Spark ML tuning example -- confirm they match the intended experiment.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder,TrainValidationSplit

# Split the DataFrame into training and test sets (90% / 10%, fixed seed).
# NOTE: this rebinds `train` and `test` from the classification cells above.
data = spark.read.format("libsvm")\
    .load("data/mllib/sample_linear_regression_data.txt")
train, test = data.randomSplit([0.9, 0.1], seed=12345)

# Dedicated regressor and grid, under new names so the classification
# `lr` / `paramGrid` defined earlier in the notebook are left untouched.
linReg = LinearRegression(maxIter=10)
tvsParamGrid = ParamGridBuilder()\
    .addGrid(linReg.regParam, [0.1, 0.01])\
    .addGrid(linReg.fitIntercept, [False, True])\
    .addGrid(linReg.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# Same idea as CrossValidator above, but each candidate is evaluated once on a
# single train/validation split.
tvs = TrainValidationSplit(estimator=linReg,
                           estimatorParamMaps=tvsParamGrid,
                           evaluator=RegressionEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)