In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName("PySpark_DataFrame").master("local[2]").\
config("spark.sql.warehouse.dir","file:///E:/input/spark/warehouse").getOrCreate()

In [40]:
from pyspark.ml.feature import Tokenizer,HashingTF,CountVectorizer,IDF, Word2Vec,FeatureHasher

In [37]:
text = spark.createDataFrame([
    (0.0, "Hi I heard about Spark Spark Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

In [38]:
tokenizer = Tokenizer(inputCol='sentence',outputCol='words')
hashingTF = HashingTF(inputCol="words",outputCol="rawFeatures",numFeatures=20) # 最大特征数，返回sparse vector
idf = IDF(inputCol="rawFeatures",outputCol="features")

wordsData = tokenizer.transform(text)
featurizedData = hashingTF.transform(wordsData)


In [39]:
featurizedData.collect()

[Row(label=0.0, sentence='Hi I heard about Spark Spark Spark', words=['hi', 'i', 'heard', 'about', 'spark', 'spark', 'spark'], rawFeatures=SparseVector(20, {0: 1.0, 5: 3.0, 9: 1.0, 17: 2.0})),
 Row(label=0.0, sentence='I wish Java could use case classes', words=['i', 'wish', 'java', 'could', 'use', 'case', 'classes'], rawFeatures=SparseVector(20, {2: 1.0, 7: 1.0, 9: 3.0, 13: 1.0, 15: 1.0})),
 Row(label=1.0, sentence='Logistic regression models are neat', words=['logistic', 'regression', 'models', 'are', 'neat'], rawFeatures=SparseVector(20, {4: 1.0, 6: 1.0, 13: 1.0, 15: 1.0, 18: 1.0}))]

In [25]:
idfModel = idf.fit(featurizedData) # IDF是一个estimator
rescaleData = idfModel.transform(featurizedData)
rescaleData.collect()

[Row(label=0.0, sentence='Hi I heard about Spark Spark Spark Spark', words=['hi', 'i', 'heard', 'about', 'spark', 'spark', 'spark', 'spark'], rawFeatures=SparseVector(20, {0: 1.0, 5: 4.0, 9: 1.0, 17: 2.0}), features=SparseVector(20, {0: 0.6931, 5: 2.7726, 9: 0.2877, 17: 1.3863})),
 Row(label=0.0, sentence='I wish Java could use case classes', words=['i', 'wish', 'java', 'could', 'use', 'case', 'classes'], rawFeatures=SparseVector(20, {2: 1.0, 7: 1.0, 9: 3.0, 13: 1.0, 15: 1.0}), features=SparseVector(20, {2: 0.6931, 7: 0.6931, 9: 0.863, 13: 0.2877, 15: 0.2877})),
 Row(label=1.0, sentence='Logistic regression models are neat', words=['logistic', 'regression', 'models', 'are', 'neat'], rawFeatures=SparseVector(20, {4: 1.0, 6: 1.0, 13: 1.0, 15: 1.0, 18: 1.0}), features=SparseVector(20, {4: 0.6931, 6: 0.6931, 13: 0.2877, 15: 0.2877, 18: 0.6931}))]

In [33]:
# Word2Vec是一个estimator, 返回dense向量
word2vec = Word2Vec(vectorSize=3, minCount=0, inputCol="words",outputCol="word2vec") # 输入必须是数组类型
wmodel = word2vec.fit(wordsData)
result2 = wmodel.transform(wordsData)
result2.collect()

[Row(label=0.0, sentence='Hi I heard about Spark Spark Spark Spark', words=['hi', 'i', 'heard', 'about', 'spark', 'spark', 'spark', 'spark'], word2vec=DenseVector([0.0408, 0.1435, -0.0334])),
 Row(label=0.0, sentence='I wish Java could use case classes', words=['i', 'wish', 'java', 'could', 'use', 'case', 'classes'], word2vec=DenseVector([0.0066, -0.0157, -0.0707])),
 Row(label=1.0, sentence='Logistic regression models are neat', words=['logistic', 'regression', 'models', 'are', 'neat'], word2vec=DenseVector([0.114, -0.0485, 0.0668]))]

In [36]:
chars = spark.createDataFrame([
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" "))
], ["id", "words"])


cv = CountVectorizer(inputCol="words",outputCol="features", vocabSize=3,minDF=2.0)
model = cv.fit(chars)
result3 = model.transform(chars)
result3.collect()

[Row(id=0, words=['a', 'b', 'c'], features=SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0})),
 Row(id=1, words=['a', 'b', 'b', 'c', 'a'], features=SparseVector(3, {0: 2.0, 1: 2.0, 2: 1.0}))]

In [41]:
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [5]:
from pyspark.ml.feature import VectorIndexer

data = spark.read.format("libsvm").load("file:///E://download/sample_libsvm_data.txt") #692个特征

indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)

categoricalFeatures = indexerModel.categoryMaps #选择351个
print("Chose %d categorical features: %s" %
      (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures.keys())))

# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()

Chose 351 categorical features: 645, 69, 365, 138, 101, 479, 333, 249, 0, 555, 666, 88, 170, 115, 276, 308, 5, 449, 120, 247, 614, 677, 202, 10, 56, 533, 142, 500, 340, 670, 174, 42, 417, 24, 37, 25, 257, 389, 52, 14, 504, 110, 587, 619, 196, 559, 638, 20, 421, 46, 93, 284, 228, 448, 57, 78, 29, 475, 164, 591, 646, 253, 106, 121, 84, 480, 147, 280, 61, 221, 396, 89, 133, 116, 1, 507, 312, 74, 307, 452, 6, 248, 60, 117, 678, 529, 85, 201, 220, 366, 534, 102, 334, 28, 38, 561, 392, 70, 424, 192, 21, 137, 165, 33, 92, 229, 252, 197, 361, 65, 97, 665, 583, 285, 224, 650, 615, 9, 53, 169, 593, 141, 610, 420, 109, 256, 225, 339, 77, 193, 669, 476, 642, 637, 590, 679, 96, 393, 647, 173, 13, 41, 503, 134, 73, 105, 2, 508, 311, 558, 674, 530, 586, 618, 166, 32, 34, 148, 45, 161, 279, 64, 689, 17, 149, 584, 562, 176, 423, 191, 22, 44, 59, 118, 281, 27, 641, 71, 391, 12, 445, 54, 313, 611, 144, 49, 335, 86, 672, 172, 113, 681, 219, 419, 81, 230, 362, 451, 76, 7, 39, 649, 98, 616, 477, 367, 535, 1

In [6]:
data.collect()[0].features

SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0, 356: 253.0, 357: 252.0,

In [7]:
len(data.collect()),len(data.collect()[0].features),len(indexedData.collect()[0].indexed),len(categoricalFeatures[0])

(100, 692, 692, 1)

In [12]:
indexedData.collect()[0].features == indexedData.collect()[0].indexed

True

In [53]:
#测试VectorIndexer
from pyspark.ml.linalg import Vectors
tmp = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0,-2.0,10]),),
    (1, Vectors.dense([2.0, 1.0, 1.0,-2.0,0.5]),),
    (2, Vectors.dense([4.0, 10.0, 2.0,4.0,8]),),
    (3, Vectors.dense([4.0, 10.0, 1.0,4.0,80]),),
    (4, Vectors.dense([4.0, 10.0, 5.0,4.0,80]),),
    (5, Vectors.dense([4.0, 10.0, 3.0,6.0,100]),)
], ["id", "features"])

indexer_tmp = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel_tmp = indexer_tmp.fit(tmp)

categoricalFeatures_tmp = indexerModel_tmp.categoryMaps #选择351个
print("Chose %d categorical features: %s" %
      (len(categoricalFeatures_tmp), ", ".join(str(k) for k in categoricalFeatures_tmp.keys())))
# Create new column "indexed" with categorical values transformed to indices
indexedData_tmp = indexerModel_tmp.transform(tmp)
indexedData_tmp.show()

Chose 5 categorical features: 0, 1, 2, 3, 4
+---+--------------------+--------------------+
| id|            features|             indexed|
+---+--------------------+--------------------+
|  0|[1.0,0.5,-1.0,-2....|[0.0,0.0,0.0,0.0,...|
|  1|[2.0,1.0,1.0,-2.0...|[1.0,1.0,1.0,0.0,...|
|  2|[4.0,10.0,2.0,4.0...|[2.0,2.0,2.0,1.0,...|
|  3|[4.0,10.0,1.0,4.0...|[2.0,2.0,1.0,1.0,...|
|  4|[4.0,10.0,5.0,4.0...|[2.0,2.0,4.0,1.0,...|
|  5|[4.0,10.0,3.0,6.0...|[2.0,2.0,3.0,2.0,...|
+---+--------------------+--------------------+



In [54]:
indexedData_tmp.select("indexed").collect()

[Row(indexed=DenseVector([0.0, 0.0, 0.0, 0.0, 2.0])),
 Row(indexed=DenseVector([1.0, 1.0, 1.0, 0.0, 0.0])),
 Row(indexed=DenseVector([2.0, 2.0, 2.0, 1.0, 1.0])),
 Row(indexed=DenseVector([2.0, 2.0, 1.0, 1.0, 3.0])),
 Row(indexed=DenseVector([2.0, 2.0, 4.0, 1.0, 3.0])),
 Row(indexed=DenseVector([2.0, 2.0, 3.0, 2.0, 4.0]))]

In [55]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.5, -1.0]),),
    (1, Vectors.dense([2.0, 1.0, 1.0]),),
    (2, Vectors.dense([4.0, 10.0, 2.0]),)
], ["id", "features"])

# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)
print("Normalized using L^1 norm")
l1NormData.show()

# 注意：规范化时行方向的
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
l2NormData = normalizer.transform(dataFrame)
print("Normalized using L^2 norm")
l2NormData.show()

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
print("Normalized using L^inf norm")
lInfNormData.show()




Normalized using L^1 norm
+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+

Normalized using L^2 norm
+---+--------------+--------------------+
| id|      features|        normFeatures|
+---+--------------+--------------------+
|  0|[1.0,0.5,-1.0]|[0.66666666666666...|
|  1| [2.0,1.0,1.0]|[0.81649658092772...|
|  2|[4.0,10.0,2.0]|[0.36514837167011...|
+---+--------------+--------------------+

Normalized using L^inf norm
+---+--------------+--------------+
| id|      features|  normFeatures|
+---+--------------+--------------+
|  0|[1.0,0.5,-1.0]|[1.0,0.5,-1.0]|
|  1| [2.0,1.0,1.0]| [1.0,0.5,0.5]|
|  2|[4.0,10.0,2.0]| [0.4,1.0,0.2]|
+---+--------------+--------------+



In [35]:
from functools import reduce
l2NormData.collect()[0].normFeatures,sum([x*x for x in l2NormData.collect()[0].normFeatures] )

(DenseVector([0.6667, 0.3333, -0.6667]), 1.0)

In [82]:
from pyspark.ml.feature import StandardScaler

dataFrame = spark.read.format("libsvm").load("file:///E:/download/sample_libsvm_data.txt")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=True)

# Compute summary statistics by fitting the StandardScaler
scalerModel = scaler.fit(dataFrame)

# Normalize each feature to have unit standard deviation.
scaledData = scalerModel.transform(dataFrame)
scaledData.show()

+-----+--------------------+--------------------+
|label|            features|      scaledFeatures|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[158,159,160...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[124,125,126...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[152,153,154...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[151,152,153...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[129,130,131...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[158,159,160...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[99,100,101,...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[154,155,156...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[127,128,129...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[154,155,156...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[153,154,155...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[151,152,153...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[129,130,131...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[154,155,156...|[0.0,0.0,0.0,0.0,...|
|  1.0|(692,[150,151,152...|[0.0,0.0,0.0,0.0,...|
|  0.0|(692,[124,125,126...|[0.0,0.0,0.0,0.0,...|


In [87]:
aa =scaledData.collect()[0].scaledFeatures
sum([x for x in aa])

0.0

In [85]:
from pyspark.ml.feature import MinMaxScaler # 列方向
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

Features scaled to range: [0.000000, 1.000000]
+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]| [0.0,0.0,0.0]|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+



In [88]:
from pyspark.mllib.linalg import Matrix, Matrices

# Create a dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0))
dm2 = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])

# Create a sparse matrix ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0))
sm = Matrices.sparse(3, 2, [0, 1, 3], [0, 2, 1], [9, 6, 8])

In [89]:
dm2,sm

(DenseMatrix(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0], False),
 SparseMatrix(3, 2, [0, 1, 3], [0, 2, 1], [9.0, 6.0, 8.0], False))

In [91]:
from pyspark.mllib.linalg.distributed import RowMatrix

# Create an RDD of vectors.
rows = spark.sparkContext.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Create a RowMatrix from an RDD of vectors.
mat = RowMatrix(rows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3

# Get the rows as an RDD of vectors again.
rowsRDD = mat.rows

In [92]:
rowsRDD.collect()

[DenseVector([1.0, 2.0, 3.0]),
 DenseVector([4.0, 5.0, 6.0]),
 DenseVector([7.0, 8.0, 9.0]),
 DenseVector([10.0, 11.0, 12.0])]

In [119]:
from pyspark.ml.feature import Normalizer,StandardScaler,MinMaxScaler,MaxAbsScaler #方向比较
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([
    (0, Vectors.dense([1.0, 0.1, -1.0]),),
    (1, Vectors.dense([2.0, 1.1, 1.0]),),
    (2, Vectors.dense([3.0, 10.1, 3.0]),)
], ["id", "features"])

normalScaler = Normalizer(inputCol="features", outputCol="normalFeatures",p=2.0)
standScaler = StandardScaler(inputCol="features", outputCol="standFeatures",withStd=True, withMean=True)
minMaxScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
maxAbsScaler = MaxAbsScaler(inputCol="features", outputCol="maxAbsFeatures")

# Compute summary statistics and generate MinMaxScalerModel
nmodel = normalScaler.transform(dataFrame) # normalize没有fit
ssModel = standScaler.fit(dataFrame)
mmModel = minMaxScaler.fit(dataFrame)
maModel = maxAbsScaler.fit(dataFrame)

# rescale each feature to range [min, max].
#scaledData = scalerModel.transform(dataFrame)
scaledData = mmModel.transform(dataFrame)
print("min max Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()

# rescale each feature to range [min, max].
scaledData2 = ssModel.transform(dataFrame)
print("standard Features")
scaledData2.select("features", "standFeatures").show()

# rescale each feature to range [min, max].
#scaledData3 = ssModel.transform(dataFrame)
scaledData3 = nmodel
print("normal Features")
scaledData3.select("features", "normalFeatures").show()


# rescale each feature to range [min, max].
scaledData4 = maModel.transform(dataFrame)
print("max abs Features")
scaledData4.select("features", "maxAbsFeatures").show()

min max Features scaled to range: [0.000000, 1.000000]
+--------------+--------------+
|      features|scaledFeatures|
+--------------+--------------+
|[1.0,0.1,-1.0]| [0.0,0.0,0.0]|
| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+--------------+--------------+

standard Features
+--------------+--------------------+
|      features|       standFeatures|
+--------------+--------------------+
|[1.0,0.1,-1.0]|[-1.0,-0.66575028...|
| [2.0,1.1,1.0]|[0.0,-0.484182026...|
|[3.0,10.1,3.0]|[1.0,1.1499323120...|
+--------------+--------------------+

normal Features
+--------------+--------------------+
|      features|      normalFeatures|
+--------------+--------------------+
|[1.0,0.1,-1.0]|[0.70534561585859...|
| [2.0,1.1,1.0]|[0.80257235390512...|
|[3.0,10.1,3.0]|[0.27384986857909...|
+--------------+--------------------+

max abs Features
+--------------+--------------------+
|      features|      maxAbsFeatures|
+--------------+--------------------+
|[1.0,0.1,-1.0]|[0.33

In [120]:
scaledData2.select('standFeatures').collect(),scaledData3.select('normalFeatures').collect(),scaledData4.select('maxAbsFeatures').collect()

([Row(standFeatures=DenseVector([-1.0, -0.6658, -1.0])),
  Row(standFeatures=DenseVector([0.0, -0.4842, 0.0])),
  Row(standFeatures=DenseVector([1.0, 1.1499, 1.0]))],
 [Row(normalFeatures=DenseVector([0.7053, 0.0705, -0.7053])),
  Row(normalFeatures=DenseVector([0.8026, 0.4414, 0.4013])),
  Row(normalFeatures=DenseVector([0.2738, 0.922, 0.2738]))],
 [Row(maxAbsFeatures=DenseVector([0.3333, 0.0099, -0.3333])),
  Row(maxAbsFeatures=DenseVector([0.6667, 0.1089, 0.3333])),
  Row(maxAbsFeatures=DenseVector([1.0, 1.0, 1.0]))])

In [121]:
scaledData3.select('normalFeatures').collect()

[Row(normalFeatures=DenseVector([0.7053, 0.0705, -0.7053])),
 Row(normalFeatures=DenseVector([0.8026, 0.4414, 0.4013])),
 Row(normalFeatures=DenseVector([0.2738, 0.922, 0.2738]))]

In [122]:
aa = scaledData3.select('normalFeatures').collect()[0].normalFeatures

DenseVector([0.7053, 0.0705, -0.7053])

In [130]:
from pyspark.mllib.linalg import Matrix,Matrices


In [158]:
import numpy as np
nn =np.matrix([[0.7053, 0.0705, -0.7053],[0.8026, 0.4414, 0.4013],[0.2738, 0.922, 0.2738]])
aa =np.array([[0.7053, 0.0705, -0.7053],[0.8026, 0.4414, 0.4013],[0.2738, 0.922, 0.2738]])

In [167]:
nn

matrix([[ 0.7053,  0.0705, -0.7053],
        [ 0.8026,  0.4414,  0.4013],
        [ 0.2738,  0.922 ,  0.2738]])

In [168]:
aa

array([[ 0.7053,  0.0705, -0.7053],
       [ 0.8026,  0.4414,  0.4013],
       [ 0.2738,  0.922 ,  0.2738]])

In [169]:
aa[0]

array([ 0.7053,  0.0705, -0.7053])

In [170]:
sum(x*x for x in aa[2])

1.00001688

In [171]:
sum(x*x for x in aa[:,0])

1.2165812900000001

In [57]:
# 分桶
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



In [59]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Create some vector data; also works for sparse vectors
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")
# Batch transform the vectors to create new column:
transformer.transform(df).show()

+-------------+-----------------+
|       vector|transformedVector|
+-------------+-----------------+
|[1.0,2.0,3.0]|    [0.0,2.0,6.0]|
|[4.0,5.0,6.0]|   [0.0,5.0,12.0]|
+-------------+-----------------+



In [61]:
df = spark.createDataFrame([
    (0, 1.0, 0.1, -1.0),
    (1, 2.0, 1.1, 1.0,),
    (2, 3.0, 10.1, 3.0)
], ["id", "a","b","c"])
df

DataFrame[id: bigint, a: double, b: double, c: double]

In [68]:
df.registerTempTable("test")
spark.sql("select a,a+b as a_b from test").collect()

[Row(a=1.0, a_b=1.1), Row(a=2.0, a_b=3.1), Row(a=3.0, a_b=13.1)]

In [70]:
df.withColumn("a_b",df.a+df.b).collect()

[Row(id=0, a=1.0, b=0.1, c=-1.0, a_b=1.1),
 Row(id=1, a=2.0, b=1.1, c=1.0, a_b=3.1),
 Row(id=2, a=3.0, b=10.1, c=3.0, a_b=13.1)]

In [71]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
     (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

sizeHint = VectorSizeHint(
    inputCol="userFeatures",
    handleInvalid="skip",
    size=3)

datasetWithSize = sizeHint.transform(dataset)
print("Rows where 'userFeatures' is not the right size are filtered out")
datasetWithSize.show(truncate=False)

assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

# This dataframe can be used by downstream transformers as before
output = assembler.transform(datasetWithSize)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

Rows where 'userFeatures' is not the right size are filtered out
+---+----+------+--------------+-------+
|id |hour|mobile|userFeatures  |clicked|
+---+----+------+--------------+-------+
|0  |18  |1.0   |[0.0,10.0,0.5]|1.0    |
+---+----+------+--------------+-------+

Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'
+-----------------------+-------+
|features               |clicked|
+-----------------------+-------+
|[18.0,1.0,0.0,10.0,0.5]|1.0    |
+-----------------------+-------+



In [72]:
from pyspark.ml.feature import QuantileDiscretizer

data = [(0, 18.0), (1, 19.0), (2, 8.0), (3, 5.0), (4, 2.2)]
df = spark.createDataFrame(data, ["id", "hour"])

discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

result = discretizer.fit(df).transform(df)
result.show()

+---+----+------+
| id|hour|result|
+---+----+------+
|  0|18.0|   2.0|
|  1|19.0|   2.0|
|  2| 8.0|   1.0|
|  3| 5.0|   1.0|
|  4| 2.2|   0.0|
+---+----+------+



In [73]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))])

slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

output = slicer.transform(df)

output.select("userFeatures", "features").show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(1,[0],[2.3])|
|      [-2.0,2.3,0.0]|        [2.3]|
+--------------------+-------------+



In [88]:
from pyspark.ml.feature import RFormula

dataset = spark.createDataFrame(
    [(7, "US", 18, 1.0),
     (8, "CA", 12, 0.0),
     (9, "NZ", 15, 0.0),
     (10, "JP", 18, 1.0),
     (11, "CN", 13, 0.0)],
    ["id", "country", "hour", "clicked"])

formula = RFormula(
    formula="clicked ~ country + hour -1",
    featuresCol="features",
    labelCol="label")

output = formula.fit(dataset).transform(dataset)
output.select("features", "label").show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(6,[1,5],[1.0,18.0])|  1.0|
|(6,[2,5],[1.0,12.0])|  0.0|
|(6,[0,5],[1.0,15.0])|  0.0|
|(6,[3,5],[1.0,18.0])|  1.0|
|(6,[4,5],[1.0,13.0])|  0.0|
+--------------------+-----+



In [86]:
output.collect()

[Row(id=7, country='US', hour=18, clicked=1.0, features=SparseVector(5, {1: 1.0, 4: 18.0}), label=1.0),
 Row(id=8, country='CA', hour=12, clicked=0.0, features=SparseVector(5, {2: 1.0, 4: 12.0}), label=0.0),
 Row(id=9, country='NZ', hour=15, clicked=0.0, features=SparseVector(5, {0: 1.0, 4: 15.0}), label=0.0),
 Row(id=10, country='JP', hour=18, clicked=1.0, features=SparseVector(5, {3: 1.0, 4: 18.0}), label=1.0),
 Row(id=11, country='CN', hour=13, clicked=0.0, features=SparseVector(5, {4: 13.0}), label=0.0)]

In [94]:
from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    ["id", "category"])

#indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
#indexed = indexer.fit(df).transform(df)

encoder = OneHotEncoderEstimator(inputCols="category",outputCols="onehot")
#indexed = indexer.fit(indexed).transform(indexed)
indexed = encoder.fit(df).transform(df)
indexed.show()

TypeError: Invalid param value given for param "inputCols". Could not convert category to list of strings

In [99]:
from pyspark.ml.feature import OneHotEncoderEstimator

df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

df2 = spark.createDataFrame([
    (0.0, "a"),
    (1.0, "b"),
    (2.0, "c"),
    (0.0, "a"),
    (0.0, "a"),
    (2.0, "b")
], ["categoryIndex1", "categoryIndex2"])

encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [98]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()

ChiSqSelector output with top 1 features selected
+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|          [18.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|          [12.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|          [15.0]|
+---+------------------+-------+----------------+



In [100]:
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

dataA = [(0, Vectors.dense([1.0, 1.0]),),
         (1, Vectors.dense([1.0, -1.0]),),
         (2, Vectors.dense([-1.0, -1.0]),),
         (3, Vectors.dense([-1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(4, Vectors.dense([1.0, 0.0]),),
         (5, Vectors.dense([-1.0, 0.0]),),
         (6, Vectors.dense([0.0, 1.0]),),
         (7, Vectors.dense([0.0, -1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.dense([1.0, 0.0])

brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", bucketLength=2.0,
                                  numHashTables=3)
model = brp.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
print("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="EuclideanDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("EuclideanDistance")).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()

The hashed dataset where hashed values are stored in the column 'hashes':
+---+-----------+--------------------+
| id|   features|              hashes|
+---+-----------+--------------------+
|  0|  [1.0,1.0]|[[-1.0], [-1.0], ...|
|  1| [1.0,-1.0]|[[-1.0], [0.0], [...|
|  2|[-1.0,-1.0]|[[0.0], [0.0], [-...|
|  3| [-1.0,1.0]|[[0.0], [-1.0], [...|
+---+-----------+--------------------+

Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:
+---+---+-----------------+
|idA|idB|EuclideanDistance|
+---+---+-----------------+
|  2|  7|              1.0|
|  2|  5|              1.0|
|  1|  7|              1.0|
|  0|  6|              1.0|
|  0|  4|              1.0|
|  3|  6|              1.0|
|  1|  4|              1.0|
|  3|  5|              1.0|
+---+---+-----------------+

Approximately searching dfA for 2 nearest neighbors of the key:
+---+----------+--------------------+-------+
| id|  features|              hashes|distCol|
+---+----------+--------------------+-------+


In [101]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
dfB = spark.createDataFrame(dataB, ["id", "features"])

key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
model = mh.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 0.6)`
print("Approximately joining dfA and dfB on distance smaller than 0.6:")
model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
# It may return less than 2 rows when not enough approximate near-neighbor candidates are
# found.
print("Approximately searching dfA for 2 nearest neighbors of the key:")
model.approxNearestNeighbors(dfA, key, 2).show()

The hashed dataset where hashed values are stored in the column 'hashes':
+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  0|(6,[0,1,2],[1.0,1...|[[6.00386719E8], ...|
|  1|(6,[2,3,4],[1.0,1...|[[-1.615482585E9]...|
|  2|(6,[0,2,4],[1.0,1...|[[-1.615482585E9]...|
+---+--------------------+--------------------+

Approximately joining dfA and dfB on distance smaller than 0.6:
+---+---+---------------+
|idA|idB|JaccardDistance|
+---+---+---------------+
|  1|  5|            0.5|
|  2|  5|            0.5|
|  1|  4|            0.5|
|  0|  5|            0.5|
+---+---+---------------+

Approximately searching dfA for 2 nearest neighbors of the key:
+---+--------------------+--------------------+-------+
| id|            features|              hashes|distCol|
+---+--------------------+--------------------+-------+
|  0|(6,[0,1,2],[1.0,1...|[[6.00386719E8], ...|   0.75|
|  1|(6,[2,3,4],[1.0,1...|[[