 # 字符串转成数字StringIndexer

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

# Start (or reuse) a local-mode SparkSession; the later cells rely on `spark`.
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark-python")
    .getOrCreate()
)

# Small sample table: name, age and education level ("学历" is the string
# column that gets indexed in the following cells).
columns = ["name", "age", "学历"]
data = [
    ["李四", 15, "高中"],
    ["王五", 19, "本科"],
    ["张三", 18, "硕士"],
    ["赵6", 20, "本科"],
]
df = spark.createDataFrame(data, schema=columns)
df.show()

+----+---+----+
|name|age|学历|
+----+---+----+
|李四| 15|高中|
|王五| 19|本科|
|张三| 18|硕士|
| 赵6| 20|本科|
+----+---+----+



In [2]:
# Configure the indexer: read strings from "学历" and write numeric indices to
# "学历_2". Nothing is learned yet; the label mapping is built by fit() below.
obj = StringIndexer(inputCol="学历", outputCol="学历_2")
# fit() scans df and returns the fitted model, which stores the learned labels
# and is the object used to transform data in the next cell.
result = obj.fit(df)
# Notebook display of the learned labels; a label's position in this list is
# the index it is mapped to.
result.labels

['本科', '硕士', '高中']

In [3]:
# Apply the fitted indexer: df2 is df plus the numeric "学历_2" column.
df2 = result.transform(df)
df2.show()

+----+---+----+------+
|name|age|学历|学历_2|
+----+---+----+------+
|李四| 15|高中|   2.0|
|王五| 19|本科|   0.0|
|张三| 18|硕士|   1.0|
| 赵6| 20|本科|   0.0|
+----+---+----+------+



# Binarizer 二值化：用于连续型特征
（离散型特征的数值化使用 StringIndexer）

In [4]:
from pyspark.ml.feature import Binarizer

# Sample people: name, age, weight (the continuous column used by the
# Binarizer and Bucketizer cells) and education level.
people_schema = ['姓名', '年龄', '体重', '学历']
people_rows = [
    ['zhang3', 18, 65.5, '高中'],
    ['li4', 19, 50.2, '本科'],
    ['wang5', 20, 61.2, '本科'],
    ['zhao6', 21, 50.8, '本科'],
    ['zheng7', 22, 77.2, '硕士'],
    ['zhou8', 22, 80.7, '硕士'],
]
df = spark.createDataFrame(people_rows, schema=people_schema)
df.show()

+------+----+----+----+
|  姓名|年龄|体重|学历|
+------+----+----+----+
|zhang3|  18|65.5|高中|
|   li4|  19|50.2|本科|
| wang5|  20|61.2|本科|
| zhao6|  21|50.8|本科|
|zheng7|  22|77.2|硕士|
| zhou8|  22|80.7|硕士|
+------+----+----+----+



In [5]:
# Binarize the continuous "体重" column around a threshold of 60: in the
# recorded output, weights above 60 become 1.0 and the others 0.0.
obj = Binarizer(inputCol="体重",outputCol="体重_二值化",threshold=60)
obj.transform(df).show()

+------+----+----+----+-----------+
|  姓名|年龄|体重|学历|体重_二值化|
+------+----+----+----+-----------+
|zhang3|  18|65.5|高中|        1.0|
|   li4|  19|50.2|本科|        0.0|
| wang5|  20|61.2|本科|        1.0|
| zhao6|  21|50.8|本科|        0.0|
|zheng7|  22|77.2|硕士|        1.0|
| zhou8|  22|80.7|硕士|        1.0|
+------+----+----+----+-----------+



#  使用Bucketizer transformer 分桶

In [6]:
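# Bucketizer is the generalized version of Binarizer: it maps column values into buckets of your choosing.
# The number of buckets and the value range of each bucket are controlled by a list of bucket boundaries (an array of doubles).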

In [7]:
from pyspark.ml.feature import Bucketizer

# Bucket boundaries for the weight column: four edges give three buckets,
# numbered 0.0, 1.0 and 2.0 in the output column.
splits = [0.0, 55.0, 70.0, 90.0]
obj = Bucketizer(
    inputCol="体重",
    outputCol="体重_分桶",
    splits=splits,
)
bucketed = obj.transform(df)
bucketed.show()

+------+----+----+----+---------+
|  姓名|年龄|体重|学历|体重_分桶|
+------+----+----+----+---------+
|zhang3|  18|65.5|高中|      1.0|
|   li4|  19|50.2|本科|      0.0|
| wang5|  20|61.2|本科|      1.0|
| zhao6|  21|50.8|本科|      0.0|
|zheng7|  22|77.2|硕士|      2.0|
| zhou8|  22|80.7|硕士|      2.0|
+------+----+----+----+---------+



#  使用Tokenizer Transformer

In [8]:
from pyspark.ml.feature import Tokenizer

# Four short English sentences keyed by id; `df` is rebound to this table.
sentences = [
    (1, "Spark is a unified data data analytics engine engine engine"),
    (2, "It is fun to work with Spark"),
    (3, "There is a lot of exciting sessions at upcoming Spark summit"),
    (4, "mllib transformer estimator evaluator and pipelines"),
]
df = spark.createDataFrame(sentences, schema=["id", "line"])

# Split each "line" into a list of tokens stored in "words"
# (the recorded output shows them lower-cased).
obj = Tokenizer(inputCol="line", outputCol="words")
df2 = obj.transform(df)

# show() cuts long cells off by default; truncate=False prints them in full.
words_only = df2.select("words")
words_only.show(truncate=False)

+------------------------------------------------------------------------+
|words                                                                   |
+------------------------------------------------------------------------+
|[spark, is, a, unified, data, data, analytics, engine, engine, engine]  |
|[it, is, fun, to, work, with, spark]                                    |
|[there, is, a, lot, of, exciting, sessions, at, upcoming, spark, summit]|
|[mllib, transformer, estimator, evaluator, and, pipelines]              |
+------------------------------------------------------------------------+



#  使用StopWordsRemover transformer

In [9]:
# Use the StopWordsRemover transformer to drop English stop words from the
# token lists produced by the Tokenizer example.
from pyspark.ml.feature import StopWordsRemover
# Load the built-in English stop-word list.
enStopWords = StopWordsRemover.loadDefaultStopWords("english")
remover = StopWordsRemover(stopWords=enStopWords,inputCol="words",outputCol="filtered")
# df2 is the tokenized DataFrame from the previous example.
df3 = remover.transform(df2)
df3.select('filtered').show(truncate=False)
# To compare against the unfiltered tokens, uncomment:
#df2.select('words').show(truncate=False)

+---------------------------------------------------------------+
|filtered                                                       |
+---------------------------------------------------------------+
|[spark, unified, data, data, analytics, engine, engine, engine]|
|[fun, work, spark]                                             |
|[lot, exciting, sessions, upcoming, spark, summit]             |
|[mllib, transformer, estimator, evaluator, pipelines]          |
+---------------------------------------------------------------+



#  使用HashingTF transformer

In [10]:
# numFeatures is the dimensionality of the hashed feature space (the vector size).
# NOTE(review): the original note gave the default as 2^20. That is the default
# of the RDD-based mllib HashingTF; pyspark.ml's HashingTF documents 262144
# (2^18), the value passed explicitly here. Confirm against the Spark version in use.
# In each output vector the indices are the words' hash values and the values
# are the corresponding word counts.
# Purpose of hashing: count how often each word occurs and represent words as numbers.
# HashingTF takes a collection of terms and turns it into a fixed-length feature vector.
from pyspark.ml.feature import HashingTF
tf = HashingTF(inputCol="filtered", outputCol="TFOut", numFeatures=262144)
tfResult = tf.transform(df3)
# Output layout: (vector size, [hash indices], [counts]).
tfResult.select('TFOut').show(truncate=False)
# Print the input tokens again for side-by-side comparison with the hashed output.
df3.select('filtered').show(truncate=False)

+-------------------------------------------------------------------------+
|TFOut                                                                    |
+-------------------------------------------------------------------------+
|(262144,[1461,7473,110213,160735,234657],[1.0,3.0,1.0,2.0,1.0])          |
|(262144,[8443,34343,234657],[1.0,1.0,1.0])                               |
|(262144,[3023,8916,14250,128231,166806,234657],[1.0,1.0,1.0,1.0,1.0,1.0])|
|(262144,[91106,163638,165972,166537,191938],[1.0,1.0,1.0,1.0,1.0])       |
+-------------------------------------------------------------------------+

+---------------------------------------------------------------+
|filtered                                                       |
+---------------------------------------------------------------+
|[spark, unified, data, data, analytics, engine, engine, engine]|
|[fun, work, spark]                                             |
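|[lot, exciting, sessions, upcoming, spark, summit]             |
|[mllib, transformer, estimator, evaluator, pipelines]          |
+---------------------------------------------------------------+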

#  使用VectorAssembler transformer来组合特征到一个Vector特征

In [11]:
from pyspark.ml.feature import VectorAssembler

# One row per arrival: hour of day, temperature, and whether it was on time.
arrival_rows = [
    (18, 95.1, True),
    (5, 65.7, True),
    (15, 31.5, False),
    (14, 40.5, False),
]
arrival_features = spark.createDataFrame(
    arrival_rows,
    ["hour", "temperature", "on_time"],
)
# Notebook display of the inferred schema.
arrival_features

DataFrame[hour: bigint, temperature: double, on_time: boolean]

In [12]:
# Combine the three input columns into a single vector column "features";
# in the recorded output the boolean on_time appears as 1.0 / 0.0.
feature_columns = ["hour", "temperature", "on_time"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
output = assembler.transform(arrival_features)
output.show(truncate=False)

+----+-----------+-------+---------------+
|hour|temperature|on_time|features       |
+----+-----------+-------+---------------+
|18  |95.1       |true   |[18.0,95.1,1.0]|
|5   |65.7       |true   |[5.0,65.7,1.0] |
|15  |31.5       |false  |[15.0,31.5,0.0]|
|14  |40.5       |false  |[14.0,40.5,0.0]|
+----+-----------+-------+---------------+



In [13]:
# Fetch the assembled feature vectors as a list of Row objects (notebook display).
output.select('features').collect()

[Row(features=DenseVector([18.0, 95.1, 1.0])),
 Row(features=DenseVector([5.0, 65.7, 1.0])),
 Row(features=DenseVector([15.0, 31.5, 0.0])),
 Row(features=DenseVector([14.0, 40.5, 0.0]))]

In [14]:
# `df` is still the sentence DataFrame from the Tokenizer example; show()
# truncates the long "line" strings by default.
df.show()

+---+--------------------+
| id|                line|
+---+--------------------+
|  1|Spark is a unifie...|
|  2|It is fun to work...|
|  3|There is a lot of...|
|  4|mllib transformer...|
+---+--------------------+



#  独热编码 OneHotEncoder

In [15]:
from pyspark.ml.feature import OneHotEncoder

# Sample table for one-hot encoding. Note that "年龄" here holds small category
# indices (0-3) rather than real ages, because the encoder expects a column of
# category indices as input.
df = spark.createDataFrame(
    [
        ['zhang3', 0, 65.5, '高中'],
        ['li4', 1, 50.2, '本科'],
        ['wang5', 2, 61.2, '本科'],
        ['zhao6', 2, 50.8, '本科'],
        ['zheng7', 3, 77.2, '硕士'],
        ['zhou8', 3, 80.7, '硕士'],
    ],
    schema=['姓名', '年龄', '体重', '学历'],
)

# dropLast=False keeps one vector slot per category (4 slots for indices 0-3).
obj = OneHotEncoder(inputCol='年龄', outputCol='年龄2', dropLast=False)

# Version compatibility: in Spark 2.x OneHotEncoder is a Transformer and is
# applied directly; in Spark 3.x it is an Estimator that has no transform()
# and must be fitted first. Fit when the API offers it, otherwise use as-is.
encoder = obj.fit(df) if hasattr(obj, "fit") else obj
df2 = encoder.transform(df)
df2.show()

+------+----+----+----+-------------+
|  姓名|年龄|体重|学历|        年龄2|
+------+----+----+----+-------------+
|zhang3|   0|65.5|高中|(4,[0],[1.0])|
|   li4|   1|50.2|本科|(4,[1],[1.0])|
| wang5|   2|61.2|本科|(4,[2],[1.0])|
| zhao6|   2|50.8|本科|(4,[2],[1.0])|
|zheng7|   3|77.2|硕士|(4,[3],[1.0])|
| zhou8|   3|80.7|硕士|(4,[3],[1.0])|
+------+----+----+----+-------------+

