In [1]:
from pyspark.sql import Row,functions
from pyspark.ml.linalg import Vector,Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer,HashingTF, Tokenizer
from pyspark.ml.classification import LogisticRegression,LogisticRegressionModel,BinaryLogisticRegressionSummary, LogisticRegression
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("logistics regression").getOrCreate()

In [2]:
# 读取数据
# 用于将数据集特征和label切分开
def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[0]),float(x[1]),float(x[2]),float(x[3]))
    rel['label'] = str(x[4])
    return rel
 
data = spark.sparkContext.textFile("file:///home/mayfly/workstation/mycode/spark_study/iris.txt").map(lambda line: line.split(',')).map(lambda p: Row(**f(p)))


In [3]:
for index, t in enumerate(data.collect()):
    if index>5:
        break
    print(t)

Row(features=DenseVector([5.1, 3.5, 1.4, 0.2]), label='Iris-setosa')
Row(features=DenseVector([4.9, 3.0, 1.4, 0.2]), label='Iris-setosa')
Row(features=DenseVector([4.7, 3.2, 1.3, 0.2]), label='Iris-setosa')
Row(features=DenseVector([4.6, 3.1, 1.5, 0.2]), label='Iris-setosa')
Row(features=DenseVector([5.0, 3.6, 1.4, 0.2]), label='Iris-setosa')
Row(features=DenseVector([5.4, 3.9, 1.7, 0.4]), label='Iris-setosa')


In [4]:
# 将RDD转换为dataframe
data = data.toDF()

In [5]:
# 把刚刚得到的数据注册成一个表iris
data.createOrReplaceTempView("iris")
# 通过sql语句进行数据查询，由于不需要全部的三分类，所以去除一个分类
df = spark.sql("select * from iris where label != 'Iris-setosa'")
# 转换成rdd输出查看
rel = df.rdd.map(lambda t : str(t[1])+":"+str(t[0])).collect()
for item in rel:
    print(item)

Iris-versicolor:[7.0,3.2,4.7,1.4]
Iris-versicolor:[6.4,3.2,4.5,1.5]
Iris-versicolor:[6.9,3.1,4.9,1.5]
Iris-versicolor:[5.5,2.3,4.0,1.3]
Iris-versicolor:[6.5,2.8,4.6,1.5]
Iris-versicolor:[5.7,2.8,4.5,1.3]
Iris-versicolor:[6.3,3.3,4.7,1.6]
Iris-versicolor:[4.9,2.4,3.3,1.0]
Iris-versicolor:[6.6,2.9,4.6,1.3]
Iris-versicolor:[5.2,2.7,3.9,1.4]
Iris-versicolor:[5.0,2.0,3.5,1.0]
Iris-versicolor:[5.9,3.0,4.2,1.5]
Iris-versicolor:[6.0,2.2,4.0,1.0]
Iris-versicolor:[6.1,2.9,4.7,1.4]
Iris-versicolor:[5.6,2.9,3.6,1.3]
Iris-versicolor:[6.7,3.1,4.4,1.4]
Iris-versicolor:[5.6,3.0,4.5,1.5]
Iris-versicolor:[5.8,2.7,4.1,1.0]
Iris-versicolor:[6.2,2.2,4.5,1.5]
Iris-versicolor:[5.6,2.5,3.9,1.1]
Iris-versicolor:[5.9,3.2,4.8,1.8]
Iris-versicolor:[6.1,2.8,4.0,1.3]
Iris-versicolor:[6.3,2.5,4.9,1.5]
Iris-versicolor:[6.1,2.8,4.7,1.2]
Iris-versicolor:[6.4,2.9,4.3,1.3]
Iris-versicolor:[6.6,3.0,4.4,1.4]
Iris-versicolor:[6.8,2.8,4.8,1.4]
Iris-versicolor:[6.7,3.0,5.0,1.7]
Iris-versicolor:[6.0,2.9,4.5,1.5]
Iris-versicolo

In [6]:
# 开始构建pipeline

In [7]:
# 分别获取标签列和特征列，进行索引，并进行了重命名。
labelIndexer = StringIndexer().setInputCol("label").setOutputCol("indexedLabel").fit(df)
featureIndexer = VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
# featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_53b988077b38

In [8]:
# 把数据集随机分成训练集和测试集，其中训练集占70%
trainingData, testData = df.randomSplit([0.7,0.3])

In [9]:
# 设置logistic的参数，这里我们统一用setter的方法来设置，也可以用ParamMap来设置，设置了循环次数为10次，正则化项为0.3等，具体的可以设置的参数可以通过explainParams()来获取，还能看到我们已经设置的参数的结果
lr = LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
print("LogisticRegression parameters:\n" + lr.explainParams())

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: indexedFeatures)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: indexedLabel)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on inte

In [10]:
# 设置一个labelConverter，目的是把预测的类别重新转化成字符型的
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

In [11]:
# # 构建pipeline，设置stage，然后调用fit()来训练模型。
lrPipeline =  Pipeline().setStages([labelIndexer, featureIndexer, lr, labelConverter])
lrPipelineModel = lrPipeline.fit(trainingData)

In [12]:
# pipeline本质上是一个Estimator，当pipeline调用fit()的时候就产生了一个PipelineModel，本质上是一个Transformer。然后这个PipelineModel就可以调用transform()来进行预测，生成一个新的DataFrame，即利用训练得到的模型对测试集进行验证。
lrPredictions = lrPipelineModel.transform(testData)

In [13]:
# 出预测的结果，其中select选择要输出的列，collect获取所有行的数据，用foreach把每行打印出来。其中打印出来的值依次分别代表该行数据的真实分类和特征值、预测属于不同分类的概率、预测的分类。
preRel = lrPredictions.select("predictedLabel", "label", "features", "probability").collect()
for item in preRel:
    print(str(item['label'])+','+str(item['features'])+'-->prob='+str(item['probability'])+',predictedLabel'+str(item['predictedLabel']))

Iris-versicolor,[5.5,2.4,3.7,1.0]-->prob=[0.608831311288059,0.39116868871194105],predictedLabelIris-versicolor
Iris-virginica,[5.6,2.8,4.9,2.0]-->prob=[0.4001648611050161,0.5998351388949839],predictedLabelIris-virginica
Iris-versicolor,[5.6,3.0,4.5,1.5]-->prob=[0.505221948790782,0.494778051209218],predictedLabelIris-versicolor
Iris-virginica,[5.7,2.5,5.0,2.0]-->prob=[0.4011621306493747,0.5988378693506252],predictedLabelIris-virginica
Iris-versicolor,[5.7,2.8,4.5,1.3]-->prob=[0.5486724476582792,0.45132755234172084],predictedLabelIris-versicolor
Iris-versicolor,[5.8,2.6,4.0,1.2]-->prob=[0.570672297739129,0.4293277022608709],predictedLabelIris-versicolor
Iris-versicolor,[6.1,3.0,4.6,1.4]-->prob=[0.5316543395332591,0.468345660466741],predictedLabelIris-versicolor
Iris-virginica,[6.1,3.0,4.9,1.8]-->prob=[0.446765897766743,0.553234102233257],predictedLabelIris-virginica
Iris-versicolor,[6.2,2.2,4.5,1.5]-->prob=[0.511449640933197,0.488550359066803],predictedLabelIris-versicolor
Iris-virginica

In [14]:
# 创建一个MulticlassClassificationEvaluator实例，用setter方法把预测分类的列名和真实分类的列名进行设置；然后计算预测准确率和错误率
evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction")
lrAccuracy = evaluator.evaluate(lrPredictions)
print("Test Error = " + str(1.0 - lrAccuracy))

Test Error = 0.1428571428571428


In [15]:
# 通过model来获取我们训练得到的逻辑斯蒂模型。前面已经说过model是一个PipelineModel，因此我们可以通过调用它的stages来获取模型，具体如下
lrModel = lrPipelineModel.stages[2]
print("Coefficients: " + str(lrModel.coefficients)+"Intercept: "+str(lrModel.intercept)+"numClasses: "+str(lrModel.numClasses)+"numFeatures: "+str(lrModel.numFeatures))           

Coefficients: [-0.04153002805609655,0.0,0.0,0.0851333577302661]Intercept: -0.21398718619541965numClasses: 2numFeatures: 4


In [16]:
# spark的ml库还提供了一个对模型的摘要总结（summary），不过目前只支持二项逻辑斯蒂回归，而且要显式转化成BinaryLogisticRegressionSummary
# 首先获得二项逻辑斯模型的摘要
trainingSummary = lrModel.summary
# 获得10次循环中损失函数的变化，并将结果打印出来，可以看到损失函数随着循环是逐渐变小的
objectiveHistory = trainingSummary.objectiveHistory
for item in objectiveHistory:
    print(item)
print("------------------------------")
# 把摘要强制转化为BinaryLogisticRegressionSummary ，来获取用来评估模型性能的矩阵，通过获取ROC，我们可以判断模型的好坏，areaUnderROC达到了 0.969551282051282，说明我们的分类器还是不错的
print(trainingSummary.areaUnderROC)

# 通过最大化fMeasure来选取最合适的阈值，其中fMeasure是一个综合了召回率和准确率的指标，通过最大化fMeasure，我们可以选取到用来分类的最合适的阈值。
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.select(functions.max("F-Measure")).head()[0] 
print(maxFMeasure)
print("-----------------")
bestThreshold = fMeasure.where(fMeasure["F-Measure"]== maxFMeasure).select("threshold").head()[0]
print(bestThreshold)
lr.setThreshold(bestThreshold)

0.6927613284609757
0.6898340245452709
0.6877084111664772
0.6867085352031355
0.6799440366541055
0.6702999584351561
0.6656403590810849
0.6644955913087869
0.6631918088409949
0.6624783614222991
0.6615311087179351
------------------------------
0.9826254826254825
0.9565217391304348
-----------------
0.5408861192767206


LogisticRegression_4373b45d07955900227e

In [18]:
# 用多项逻辑斯蒂回归解决 二分类 问题
# 对于二分类问题，我们还可以用多项逻辑斯蒂回归进行多分类分析。多项逻辑斯蒂回归与二项逻辑斯蒂回归类似，只是在模型设置上把family参数设置成multinomial，
mlr =  LogisticRegression().setLabelCol("indexedLabel").setFeaturesCol("indexedFeatures").setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8).setFamily("multinomial")
mlrPipeline = Pipeline().setStages([labelIndexer, featureIndexer, mlr, labelConverter])
mlrPipelineModel = mlrPipeline.fit(trainingData)
mlrPredictions = mlrPipelineModel.transform(testData)
mlrPreRel = mlrPredictions.select("predictedLabel", "label", "features", "probability").collect()
for item in mlrPreRel:
    print('('+str(item['label'])+','+str(item['features'])+')-->prob='+str(item['probability'])+',predictLabel='+str(item['predictedLabel']))
mlrAccuracy = evaluator.evaluate(mlrPredictions)
print("Test Error = " + str(1.0 - mlrAccuracy)) 
mlrModel = mlrPipelineModel.stages[2]
print("Multinomial coefficients: " +str(mlrModel.coefficientMatrix)+"Multin omial intercepts: "+str(mlrModel.interceptVector)+"numClasses: "+str(mlrModel.numClasses)+
"numFeatures: "+str(mlrModel.numFeatures))
 

(Iris-versicolor,[5.5,2.4,3.7,1.0])-->prob=[0.6190870833392402,0.3809129166607598],predictLabel=Iris-versicolor
(Iris-virginica,[5.6,2.8,4.9,2.0])-->prob=[0.3899902744026083,0.6100097255973916],predictLabel=Iris-virginica
(Iris-versicolor,[5.6,3.0,4.5,1.5])-->prob=[0.5056604423154984,0.4943395576845015],predictLabel=Iris-versicolor
(Iris-virginica,[5.7,2.5,5.0,2.0])-->prob=[0.39164811792444604,0.608351882075554],predictLabel=Iris-virginica
(Iris-versicolor,[5.7,2.8,4.5,1.3])-->prob=[0.5541874662627947,0.4458125337372053],predictLabel=Iris-versicolor
(Iris-versicolor,[5.8,2.6,4.0,1.2])-->prob=[0.5789749933959147,0.4210250066040854],predictLabel=Iris-versicolor
(Iris-versicolor,[6.1,3.0,4.6,1.4])-->prob=[0.5377925194267595,0.46220748057324046],predictLabel=Iris-versicolor
(Iris-virginica,[6.1,3.0,4.9,1.8])-->prob=[0.4441002327295816,0.5558997672704183],predictLabel=Iris-virginica
(Iris-versicolor,[6.2,2.2,4.5,1.5])-->prob=[0.516100230459997,0.4838997695400031],predictLabel=Iris-versicolo

In [None]:
# 用多项逻辑斯蒂回归解决 多分类 问题
# 对于多分类问题，我们需要用多项逻辑斯蒂回归进行多分类分析。这里我们用全部的iris数据集，即有三个类别，过程与上述基本一致