In [None]:
'''
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

使用管道，特征编码，独热向量，特征转换，特征组合
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

pipeline = Pipeline(stages=[gender_indexer,embark_indexer,gender_encoder,embark_encoder,assembler,log_reg_titanic])
Pipeline.fit
'''

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [2]:
spark = SparkSession.builder.appName("PySpark04").master("local[2]"). \
config("spark.sql.warehouse.dir","file:///E:/input/spark/warehouse").getOrCreate()

In [7]:
data = spark.read.format('csv').load("file:///e:/Download/titanic/train.csv",inferSchema=True,header=True)

In [52]:
data.columns

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [8]:
data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [23]:
my_cols = data.select(['Survived', 'Pclass', 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked'
 ])

In [24]:
final_data = my_cols.na.drop()

In [15]:
# norminal, ordered, descrete, continue, 参考吉米的
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [19]:
# StringIndexer
# a b c
# 0 1 2
# one hot encode
# A:[1,0,0], B:[0,1,0], C:[0,0,1]
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex') #性别是norminal,采用StringIndexer
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')

In [20]:
embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')

In [21]:
# 挑选ETL过后的特征组合成特征向量
assembler = VectorAssembler(inputCols=['Pclass','SexVec','EmbarkVec','Age','SibSp','Parch','Fare'],outputCol='features')

In [25]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [27]:
log_reg_titanic = LogisticRegression(featuresCol='features',labelCol='Survived')

In [29]:
# 简单管道，像是estimator
pipeline = Pipeline(stages=[gender_indexer,embark_indexer,gender_encoder,embark_encoder,assembler,log_reg_titanic])

In [36]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [37]:
fit_model = pipeline.fit(train_data)

In [39]:
type(assembler)

pyspark.ml.feature.VectorAssembler

In [43]:
results = fit_model.transform(test_data) # transform自动调用prediction

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [51]:
#默认参数
my_eval=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC')

In [56]:
# 比较预测值和实际值，找到对应标签
my_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Survived')

In [57]:
results.columns # 原始栏位+StringIndexer + Vector + StringIndexer + Vector  + features(VectorAssembler)
# + 新产生的三个栏位'rawPrediction', 'probability','prediction'

['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked',
 'SexIndex',
 'EmbarkIndex',
 'SexVec',
 'EmbarkVec',
 'features',
 'rawPrediction',
 'probability',
 'prediction']

In [62]:
results.show(3)

+--------+------+------+----+-----+-----+------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|  Fare|Embarked|SexIndex|EmbarkIndex|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|
+--------+------+------+----+-----+-----+------+--------+--------+-----------+-------------+-------------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|25.0|    1|    2|151.55|       S|     1.0|        0.0|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,...|[-2.4010380036288...|[0.08309357776745...|       1.0|
|       0|     1|  male|19.0|    1|    0|  53.1|       S|     0.0|        0.0|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,1.0,0.0,...|[-0.3199121363285...|[0.42069716105050...|       1.0|
|       0|     1|  male|27.0|    0|    2| 211.5|       C|     0.0|        1.0|(1,[0],[1.0])|(2,

In [59]:
my_eval.evaluate(results) # AUC

0.8430134680134681

In [None]:
my_eval.

In [60]:
results.select('SexVec','EmbarkVec',
 'features',
 'rawPrediction',
 'probability',
 'prediction','Survived').show()

+-------------+-------------+--------------------+--------------------+--------------------+----------+--------+
|       SexVec|    EmbarkVec|            features|       rawPrediction|         probability|prediction|Survived|
+-------------+-------------+--------------------+--------------------+--------------------+----------+--------+
|    (1,[],[])|(2,[0],[1.0])|[1.0,0.0,1.0,0.0,...|[-2.4010380036288...|[0.08309357776745...|       1.0|       0|
|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,1.0,0.0,...|[-0.3199121363285...|[0.42069716105050...|       1.0|       0|
|(1,[0],[1.0])|(2,[1],[1.0])|[1.0,1.0,0.0,1.0,...|[-0.4496639657333...|[0.38944066417825...|       1.0|       0|
|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,1.0,0.0,...|[0.44382160796280...|[0.60916926573062...|       0.0|       0|
|(1,[0],[1.0])|(2,[0],[1.0])|(8,[0,1,2,4],[1.0...|[0.27821134104686...|[0.56910765592852...|       0.0|       0|
|(1,[0],[1.0])|(2,[0],[1.0])|[1.0,1.0,1.0,0.0,...|[0.49912630375590...|[0.62225398738401...|    