In [4]:
# findspark.init() is deliberately called before pyspark is imported below;
# presumably it is what makes the local Spark installation importable here
# (verify against the environment this notebook runs in).
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) the SparkSession used by every cell below.
# NOTE(review): the warehouse directory is a hard-coded absolute Windows path
# and has to be adjusted before running on another machine.
spark = (
    SparkSession.builder
    .appName("PySpark_DataFrame")
    .master("local[2]")
    .config("spark.sql.warehouse.dir", "file:///E:/input/spark/warehouse")
    .getOrCreate()
)

In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer,HashingTF
from pyspark.ml import Pipeline,PipelineModel

# DataFrames fed to spark.ml use the conventional column names
# "label" and "features".
train = spark.createDataFrame(
    [
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    schema=["label", "features"],
)

test = spark.createDataFrame(
    [
        (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
        (0.0, Vectors.dense([3.0, 2.0, -0.1])),
        (1.0, Vectors.dense([0.0, 2.2, -1.5])),
    ],
    schema=["label", "features"],
)


  return f(*args, **kwds)
  return f(*args, **kwds)


In [7]:
# Bring every training row back to the driver to inspect it (four rows only).
train.collect()

[Row(label=1.0, features=DenseVector([0.0, 1.1, 0.1])),
 Row(label=0.0, features=DenseVector([2.0, 1.0, -1.0])),
 Row(label=0.0, features=DenseVector([2.0, 1.3, 1.0])),
 Row(label=1.0, features=DenseVector([0.0, 1.2, -0.5]))]

In [8]:
# Create an estimator instance.
# maxIter has to be positive for the optimiser to run at all: with maxIter=0
# fit() performs no iterations and hands back an effectively untrained model.
lr = LogisticRegression(maxIter=10, regParam=0.01)
# Uncomment to list every parameter together with its documentation:
# print(lr.explainParams())

In [9]:
# Fit once using the parameters stored on the estimator itself.
model1 = lr.fit(train)

# Parameter overrides, keyed by the estimator's own Param objects.
paramMap = {
    lr.maxIter: 20,
    lr.regParam: 0.1,
}

In [10]:
#dir(model1)

In [11]:
# Fit again, this time with paramMap overriding the estimator's own settings.
model2 = lr.fit(train, params=paramMap)
# In spark.ml, predictions are produced by calling transform() on the model.
pred = model2.transform(test)

In [12]:
# Show which columns transform() added next to the original label/features.
pred.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [13]:
# Pull the columns of interest to the driver and print one line per test row.
result = pred.select("features", "label", "probability", "prediction").collect()
for row in result:
    # A Row is a tuple in select() order, so unpacking it prints
    # features, label, probability and prediction separated by spaces.
    print(*row)

[-1.0,1.5,1.3] 1.0 [0.05668429360932413,0.9433157063906759] 1.0
[3.0,2.0,-0.1] 0.0 [0.9232337866421291,0.07676621335787098] 0.0
[0.0,2.2,-1.5] 1.0 [0.11040665017928436,0.8895933498207156] 1.0


In [14]:
# Using a Pipeline

In [15]:
# Labelled documents for the text pipeline: (id, text, label).
train2 = spark.createDataFrame(
    [
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop a", 0.0),
        (4, "spark mapreduce", 1.0),
        (5, "hadoop apache", 0.0),
        (6, "a b c d e spark", 1.0),
        (7, "hadoop mapreduce", 0.0),
        (8, "a b c d e spark", 1.0),
        (9, "c mapreduce", 0.0),
        (10, "b mapreduce", 0.0),
        (11, "d mapreduce", 0.0),
        (12, "a b c d e spark", 1.0),
    ],
    schema=["id", "text", "label"],
)

# Unlabelled documents to score: (id, text).
test2 = spark.createDataFrame(
    [
        (4, "spark i j k"),
        (5, "l m n"),
        (6, "spark hadoop spark"),
        (7, "apache hadoop"),
    ],
    schema=["id", "text"],
)

In [16]:
# Stage 1: split the raw "text" column into a "words" column.
tokenizer = Tokenizer(inputCol="text", outputCol='words')
# Stage 2: hash the tokenizer's output column into a "features" vector.
hashtingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)
# Stage 3: the classifier.
lr = LogisticRegression(maxIter=20, regParam=0.01)
# Chain the three stages so they can be fitted and applied as one unit.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashtingTF,
        lr,
    ]
)

In [17]:
# Run the tokenizer stage on its own to see what it produces.
tmp1 = tokenizer.transform(train2) # adds a new "words" column

In [18]:
# Inspect the tokenised rows on the driver (13 rows only).
tmp1.collect()

[Row(id=0, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark']),
 Row(id=1, text='b d', label=0.0, words=['b', 'd']),
 Row(id=2, text='spark f g h', label=1.0, words=['spark', 'f', 'g', 'h']),
 Row(id=3, text='hadoop a', label=0.0, words=['hadoop', 'a']),
 Row(id=4, text='spark mapreduce', label=1.0, words=['spark', 'mapreduce']),
 Row(id=5, text='hadoop apache', label=0.0, words=['hadoop', 'apache']),
 Row(id=6, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark']),
 Row(id=7, text='hadoop mapreduce', label=0.0, words=['hadoop', 'mapreduce']),
 Row(id=8, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark']),
 Row(id=9, text='c mapreduce', label=0.0, words=['c', 'mapreduce']),
 Row(id=10, text='b mapreduce', label=0.0, words=['b', 'mapreduce']),
 Row(id=11, text='d mapreduce', label=0.0, words=['d', 'mapreduce']),
 Row(id=12, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark'])]

In [19]:
# Run a HashingTF stage on its own: adds a new "features" column.
tmp2 = HashingTF(inputCol='words', outputCol="features").transform(tmp1)
# Collect once and reuse the result; calling collect() twice would execute
# the same Spark job twice just to display the first row and the full list.
tmp2_rows = tmp2.collect()
tmp2_rows[0], tmp2_rows

(Row(id=0, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark'], features=SparseVector(262144, {17222: 1.0, 27526: 1.0, 28698: 1.0, 30913: 1.0, 227410: 1.0, 234657: 1.0})),
 [Row(id=0, text='a b c d e spark', label=1.0, words=['a', 'b', 'c', 'd', 'e', 'spark'], features=SparseVector(262144, {17222: 1.0, 27526: 1.0, 28698: 1.0, 30913: 1.0, 227410: 1.0, 234657: 1.0})),
  Row(id=1, text='b d', label=0.0, words=['b', 'd'], features=SparseVector(262144, {27526: 1.0, 30913: 1.0})),
  Row(id=2, text='spark f g h', label=1.0, words=['spark', 'f', 'g', 'h'], features=SparseVector(262144, {15554: 1.0, 24152: 1.0, 51505: 1.0, 234657: 1.0})),
  Row(id=3, text='hadoop a', label=0.0, words=['hadoop', 'a'], features=SparseVector(262144, {155117: 1.0, 227410: 1.0})),
  Row(id=4, text='spark mapreduce', label=1.0, words=['spark', 'mapreduce'], features=SparseVector(262144, {42633: 1.0, 234657: 1.0})),
  Row(id=5, text='hadoop apache', label=0.0, words=['hadoop', 'apache'], featur

In [20]:
# Printing a DataFrame shows its column names and types, not its rows.
print(tmp2)

DataFrame[id: bigint, text: string, label: double, words: array<string>, features: vector]


In [21]:
# Fit every pipeline stage on the labelled text, then score the unlabelled text.
model = pipeline.fit(train2)
pred = model.transform(test2)
# Keep only the columns needed to read the predictions.
selected = pred.select(['id', 'text', 'probability', 'prediction'])
selected.collect()

[Row(id=4, text='spark i j k', probability=DenseVector([0.1089, 0.8911]), prediction=1.0),
 Row(id=5, text='l m n', probability=DenseVector([0.9541, 0.0459]), prediction=0.0),
 Row(id=6, text='spark hadoop spark', probability=DenseVector([0.0033, 0.9967]), prediction=1.0),
 Row(id=7, text='apache hadoop', probability=DenseVector([0.9943, 0.0057]), prediction=0.0)]

In [47]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import Tokenizer,HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator,TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Pipeline: tokenise the text, hash the tokens into a feature vector, then
# fit a logistic regression on those features.
tokenizer = Tokenizer(inputCol="text", outputCol='words')
hashtingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
# regParam must be non-zero here: elasticNetParam only sets the L1/L2 mix of
# the penalty, so with the default regParam of 0.0 every grid point below
# would train exactly the same model.
lr = LogisticRegression(maxIter=20, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashtingTF, lr])

# Parameter grid
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.5, 0.7, 0.9])
    .build()
)

# Evaluation: compute the area under the ROC curve from the continuous
# rawPrediction scores. Feeding the hard 0/1 "prediction" column instead
# collapses the curve to a single threshold.
binEvaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Model selection / grid search
# A fixed seed makes the random train/validation split reproducible.
# NOTE(review): trainRatio=0.9 on 13 rows leaves only about one row for
# validation, so the resulting metrics carry very little information.
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=binEvaluator,
    trainRatio=0.9,
    seed=42,
)
# Model training
model = tvs.fit(train2)

In [42]:
# Evaluation of the trained candidates: each validation metric paired with
# the parameter map that produced it (validationMetrics follows the order of
# paramGrid).
# For a CrossValidator model the corresponding attribute is model.avgMetrics.
list(zip(model.validationMetrics, paramGrid))

[(0.5,
  {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1}),
 (0.5,
  {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5}),
 (0.5,
  {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.7}),
 (0.5,
  {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.9})]

In [40]:
# Model prediction
# NOTE: this scores train2, the same rows the model was fitted on.
preds = model.transform(train2)
preds

DataFrame[id: bigint, text: string, label: double, words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [46]:
# Display the list of parameter maps built by ParamGridBuilder.
paramGrid

[{Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1},
 {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5},
 {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.7},
 {Param(parent='LogisticRegression_41b6b49e9630bcedf44f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.9}]