In [1]:
# 导入所需的包
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session explicitly so the notebook survives
# "Restart Kernel -> Run All" on a plain Python kernel. getOrCreate() returns
# the already-active session when the kernel (e.g. the pyspark shell) has
# injected one, so behaviour under such kernels is unchanged.
spark = SparkSession.builder.getOrCreate()

# Load the data.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
file = "file:/root/pyspark_code/pyspark/data/train.csv"

# Read the CSV, treating the first row as the header and letting Spark infer
# the column types.
titanic_data = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file)
)

# Quick exploration: total passenger count and the first 5 untruncated rows.
print(f"乘客共有{titanic_data.count()}位")
titanic_data.show(5, truncate=False)

ModuleNotFoundError: No module named 'pyspark'

In [3]:
# Inspect the schema: column names, inferred types and nullability
titanic_data.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [4]:
# Start with only three features (ticket class, gender, age) plus the label,
# and keep only the rows whose age is present (rows with a null age are dropped).
titanic_data1 = (
    titanic_data
    .select(
        col('Survived').alias('label'),
        col('Pclass').alias('ticket_class'),
        col('Sex').alias('gender'),
        col('Age').alias('age'),
    )
    .filter('age is not null')
)

# Split into a training set (80%) and a test set (20%).
# A fixed seed keeps the split -- and therefore every downstream metric --
# reproducible across runs; the weights are passed as a list, as documented.
SPLIT_SEED = 42
training, test = titanic_data1.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

print(training.count(), test.count())

581 133


In [5]:
# Preview the first 10 rows of the training split
training.show(10)

+-----+------------+------+----+
|label|ticket_class|gender| age|
+-----+------------+------+----+
|    0|           1|female| 2.0|
|    0|           1|female|25.0|
|    0|           1|female|50.0|
|    0|           1|  male|18.0|
|    0|           1|  male|19.0|
|    0|           1|  male|21.0|
|    0|           1|  male|22.0|
|    0|           1|  male|24.0|
|    0|           1|  male|24.0|
|    0|           1|  male|27.0|
+-----+------------+------+----+
only showing top 10 rows



In [7]:
# Estimator: convert the gender string column into a numeric index column.
genderIndxr = StringIndexer(inputCol="gender", outputCol="genderIdx")

# Transformer: combine the individual feature columns into one vector column.
assembler = VectorAssembler(
    inputCols=["ticket_class", "genderIdx", "age"],
    outputCol="features",
)

# Estimator: logistic regression. `family` accepts "auto", "binomial" or
# "multinomial".
logisticRegression = LogisticRegression(family="binomial")

# Three-stage pipeline. `stages` is documented as a list of pipeline stages,
# so pass a list rather than a tuple.
pipeline = Pipeline(stages=[genderIndxr, assembler, logisticRegression])

# Fit the whole pipeline on the training set.
model = pipeline.fit(training)

# Run predictions on the test set. Besides the input columns, the result
# carries genderIdx, features, rawPrediction, probability and prediction.
predictions = model.transform(test)
predictions.show(10, truncate=False)

+-----+------------+------+----+---------+--------------+------------------------------------------+----------------------------------------+----------+
|label|ticket_class|gender|age |genderIdx|features      |rawPrediction                             |probability                             |prediction|
+-----+------------+------+----+---------+--------------+------------------------------------------+----------------------------------------+----------+
|0    |1           |male  |19.0|0.0      |[1.0,0.0,19.0]|[-0.5741042873386117,0.5741042873386117]  |[0.3602903223516371,0.639709677648363]  |1.0       |
|0    |1           |male  |28.0|0.0      |[1.0,0.0,28.0]|[-0.22280644139050043,0.22280644139050043]|[0.4445276830207755,0.5554723169792245] |1.0       |
|0    |1           |male  |31.0|0.0      |[1.0,0.0,31.0]|[-0.10570715940779607,0.10570715940779607]|[0.47359779043602906,0.526402209563971] |1.0       |
|0    |1           |male  |31.0|0.0      |[1.0,0.0,31.0]|[-0.10570715940779607,0.1

In [8]:
# Evaluate model performance on the predictions. The metric is the area under
# the ROC curve, which is the evaluator's default; it is spelled out here so
# the reader does not have to look it up.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
area_under_roc = evaluator.evaluate(predictions)
area_under_roc

0.8822463768115945