In [8]:
from pyspark.sql import SparkSession

In [9]:
# Obtain the notebook-wide SparkSession; getOrCreate() returns the already
# running session when one exists instead of starting a second one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# copied from the Spark docs -- confirm whether it is needed at all.
spark = (
    SparkSession.builder
    .appName("python spark sql example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Feature Extractors

### !!!hash算法

In [1]:
import hashlib

In [3]:
# Sample string whose built-in hash is inspected in the following cells.
s = "abc"

In [5]:
# Prints four times the number of characters in the decimal text of hash(s)
# (a leading minus sign is counted as a character), followed by the hash itself.
# NOTE(review): the built-in hash() of a str is salted per interpreter process
# unless PYTHONHASHSEED is fixed, so the value shown in the output will not be
# reproduced after a kernel restart; the hashlib module imported in the previous
# cell is never used -- confirm which hash this section is meant to demonstrate.
print(len(str(hash(s)))*4, hash(s))

80 -6100640383843701469


In [6]:
str(hash(s))

'-6100640383843701469'

##  1、TF-IDF

In [7]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer

In [10]:
# Toy corpus for the TF-IDF walkthrough: one (label, sentence) pair per row.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [11]:
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [12]:
# Splits each sentence into a list of lower-cased words (see the "words" column below).
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [13]:
wordsData=tokenizer.transform(sentenceData)

In [17]:
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [18]:
# Hash every word into one of 20 buckets and count occurrences per bucket.
# With so few buckets, distinct words can share an index: the first sentence
# has five words but only four non-zero indices in the output below.
# NOTE(review): the Spark docs advise a power of two for numFeatures so that
# terms spread evenly over the indices; 20 is kept to match the shown output.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

In [19]:
featurizedData=hashingTF.transform(wordsData)

In [23]:
featurizedData.select("rawFeatures").show()

+--------------------+
|         rawFeatures|
+--------------------+
|(20,[0,5,9,17],[1...|
|(20,[2,7,9,13,15]...|
|(20,[4,6,13,15,18...|
+--------------------+



In [25]:
# IDF is an estimator: fit() scans the term-frequency vectors to build the model,
# then transform() rescales every "rawFeatures" vector into the "features" column.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [28]:
tf_idf=rescaledData.select("label", "features")

In [31]:
tf_idf.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[4,6,13,15,18...|
+-----+--------------------+



### 由 Spark 的 DataFrame 转化成 pandas 的 DataFrame

In [29]:
pandas_df = tf_idf.toPandas()

In [37]:
import numpy as np
import pandas as pd

In [38]:
np.array(pandas_df['features'])

array([SparseVector(20, {0: 0.6931, 5: 0.6931, 9: 0.2877, 17: 1.3863}),
       SparseVector(20, {2: 0.6931, 7: 0.6931, 9: 0.863, 13: 0.2877, 15: 0.2877}),
       SparseVector(20, {4: 0.6931, 6: 0.6931, 13: 0.2877, 15: 0.2877, 18: 0.6931})],
      dtype=object)

### !!!sklearn tf-idf

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  

# Four short documents used to compare scikit-learn's TF-IDF with Spark's.
corpus = [
    "I come to China to travel",
    "This is a car polupar in China",
    "I love tea and Apple ",
    "The work is to write some papers in science",
]

# Two-step scikit-learn pipeline: raw term counts first, TF-IDF weighting second.
# NOTE: `CountVectorizer` here is scikit-learn's class; a later cell imports
# pyspark.ml.feature.CountVectorizer under the same name and rebinds it.
vectorizer = CountVectorizer()

transformer = TfidfTransformer()
tfidf = transformer.fit_transform(
    vectorizer.fit_transform(corpus)
)

In [58]:
print(tfidf)

  (0, 4)	0.4424621378947393
  (0, 15)	0.697684463383976
  (0, 3)	0.348842231691988
  (0, 16)	0.4424621378947393
  (1, 3)	0.3574550433419527
  (1, 14)	0.45338639737285463
  (1, 6)	0.3574550433419527
  (1, 2)	0.45338639737285463
  (1, 9)	0.45338639737285463
  (1, 5)	0.3574550433419527
  (2, 7)	0.5
  (2, 12)	0.5
  (2, 0)	0.5
  (2, 1)	0.5
  (3, 15)	0.2811316284405006
  (3, 6)	0.2811316284405006
  (3, 5)	0.2811316284405006
  (3, 13)	0.3565798233381452
  (3, 17)	0.3565798233381452
  (3, 18)	0.3565798233381452
  (3, 11)	0.3565798233381452
  (3, 8)	0.3565798233381452
  (3, 10)	0.3565798233381452


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
# One-step equivalent of CountVectorizer + TfidfTransformer; the values printed
# for `re.data` below match the weights printed for `tfidf` above.
# NOTE(review): the result is bound to `re`, which shadows the name of the
# standard-library `re` module for the rest of the kernel session; the next
# cell reads `re.data`, so a rename has to touch both cells together.
tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)

In [59]:
re.data

array([0.44246214, 0.69768446, 0.34884223, 0.44246214, 0.35745504,
       0.4533864 , 0.35745504, 0.4533864 , 0.4533864 , 0.35745504,
       0.5       , 0.5       , 0.5       , 0.5       , 0.28113163,
       0.28113163, 0.28113163, 0.35657982, 0.35657982, 0.35657982,
       0.35657982, 0.35657982, 0.35657982])

##  2、Word2Vec

In [60]:
from pyspark.ml.feature import Word2Vec

In [61]:
# Each row is a one-element tuple holding the sentence already split into words.
documentDF = spark.createDataFrame(
    data=[
        ("Hi I heard about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    schema=["text"],
)

In [63]:
documentDF.show()

+--------------------+
|                text|
+--------------------+
|[Hi, I, heard, ab...|
|[I, wish, Java, c...|
|[Logistic, regres...|
+--------------------+



In [62]:
# vectorSize=3 yields the 3-component document vectors shown in the output below;
# minCount=0 keeps every token in the vocabulary regardless of how rarely it occurs.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")

In [64]:
model=word2Vec.fit(documentDF)

In [65]:
result = model.transform(documentDF)

In [66]:
result.collect()

[Row(text=['Hi', 'I', 'heard', 'about', 'Spark'], result=DenseVector([-0.0523, -0.0198, 0.0321])),
 Row(text=['I', 'wish', 'Java', 'could', 'use', 'case', 'classes'], result=DenseVector([-0.0546, -0.0193, -0.0027])),
 Row(text=['Logistic', 'regression', 'models', 'are', 'neat'], result=DenseVector([0.031, -0.0514, 0.018]))]

In [67]:
# Pretty-print every document next to its averaged word vector.
for row in result.collect():
    # Row fields are named after the DataFrame columns ("text" and "result").
    text, vector = row.text, row.result
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [-0.052312071621418,-0.01982123963534832,0.0320683479309082]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.05457880547536271,-0.019253329506942203,-0.0027129666746727055]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.030980017594993116,-0.051439860463142396,0.018004290759563446]



## 3、CountVectorizer

In [68]:
from pyspark.ml.feature import CountVectorizer

In [69]:
# Two tiny documents, already tokenised, keyed by id.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

In [70]:
# vocabSize=3 caps the learned vocabulary at three terms; minDF=2.0 requires a term
# to occur in at least two documents. All of a, b, c qualify in the data above.
# NOTE: this is pyspark.ml.feature.CountVectorizer, not the scikit-learn class
# of the same name imported earlier in the notebook.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)

In [71]:
model = cv.fit(df)

In [72]:
result=model.transform(df)

In [74]:
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [75]:
result.toPandas()

Unnamed: 0,id,words,features
0,0,"[a, b, c]","(1.0, 1.0, 1.0)"
1,1,"[a, b, b, c, a]","(2.0, 2.0, 1.0)"


## 4、FeatureHasher

In [76]:
from pyspark.ml.feature import FeatureHasher

In [77]:
# Mixed-type rows (float, bool, numeric string, plain string) for FeatureHasher.
dataset = spark.createDataFrame(
    data=[
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo"),
    ],
    schema=["real", "bool", "stringNum", "string"],
)

In [78]:
dataset.show()

+----+-----+---------+------+
|real| bool|stringNum|string|
+----+-----+---------+------+
| 2.2| true|        1|   foo|
| 3.3|false|        2|   bar|
| 4.4|false|        3|   baz|
| 5.5|false|        4|   foo|
+----+-----+---------+------+



In [79]:
# Hash all four columns into a single sparse vector. In the output below the
# numeric "real" column keeps its value at one fixed index (174475), while the
# boolean and string columns each contribute a 1.0 at a hashed index.
# numFeatures is left at its default, which is the 262144 seen in the output.
hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

In [80]:
featurized = hasher.transform(dataset)

In [81]:
featurized.show(truncate=False)

+----+-----+---------+------+--------------------------------------------------------+
|real|bool |stringNum|string|features                                                |
+----+-----+---------+------+--------------------------------------------------------+
|2.2 |true |1        |foo   |(262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0])|
|3.3 |false|2        |bar   |(262144,[70644,89673,173866,174475],[1.0,1.0,1.0,3.3])  |
|4.4 |false|3        |baz   |(262144,[22406,70644,174475,187923],[1.0,1.0,4.4,1.0])  |
|5.5 |false|4        |foo   |(262144,[70644,101499,174475,257907],[1.0,1.0,5.5,1.0]) |
+----+-----+---------+------+--------------------------------------------------------+



In [85]:
df=featurized.toPandas()

In [97]:
df

Unnamed: 0,real,bool,stringNum,string,features
0,2.2,True,1,foo,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,3.3,False,2,bar,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,4.4,False,3,baz,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,5.5,False,4,foo,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Feature Transformers

## Tokenizer

In [98]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer

In [99]:
from pyspark.sql.functions import col,udf

In [100]:
from pyspark.sql.types import IntegerType

In [101]:
# The third sentence is comma-separated on purpose, to contrast the two tokenizers.
sentenceDataFrame = spark.createDataFrame(
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
    schema=["id", "sentence"],
)

In [102]:
# Plain tokenizer: lower-cases and splits on whitespace only, so the
# comma-separated sentence stays a single token in the output below.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [103]:
# Regex tokenizer: the pattern matches the separators (any non-word character),
# which is why the comma-separated sentence is split into five tokens below.
regexTokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [104]:
# Column-level helper returning the number of tokens in a "words" array as an int.
countTokens = udf(
    lambda words: len(words),
    IntegerType(),
)

### 英文里面默认是空格分词

In [105]:
# Whitespace tokenisation, displayed together with the per-row token count.
tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic,regression,models,are,neat]     |1     |
+-----------------------------------+------------------------------------------+------+



In [106]:
# Regex tokenisation of the same sentences, again with the per-row token count.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show(truncate=False)
)

+-----------------------------------+------------------------------------------+------+
|sentence                           |words                                     |tokens|
+-----------------------------------+------------------------------------------+------+
|Hi I heard about Spark             |[hi, i, heard, about, spark]              |5     |
|I wish Java could use case classes |[i, wish, java, could, use, case, classes]|7     |
|Logistic,regression,models,are,neat|[logistic, regression, models, are, neat] |5     |
+-----------------------------------+------------------------------------------+------+



## StopWordsRemover

In [109]:
from pyspark.ml.feature import StopWordsRemover

In [110]:
# Pre-tokenised sentences; "raw" is the word list that stop words are removed from.
sentenceData = spark.createDataFrame(
    data=[
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

In [111]:
# Uses the built-in default stop-word list (no custom list is passed); in the
# output below "I", "the", "had" and "a" are dropped.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [112]:
remover.transform(sentenceData).show(truncate=False)

+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



## n-gram

In [113]:
from pyspark.ml.feature import NGram

In [119]:
# Word lists from which consecutive n-grams are built.
wordDataFrame = spark.createDataFrame(
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

In [120]:
# n=3: every run of three consecutive words becomes one space-joined string.
ngram = NGram(
    n=3,
    inputCol="words",
    outputCol="ngrams",
)

In [121]:
ngramDataFrame=ngram.transform(wordDataFrame)

In [122]:
ngramDataFrame.show(truncate=False)

+---+------------------------------------------+--------------------------------------------------------------------------------+
|id |words                                     |ngrams                                                                          |
+---+------------------------------------------+--------------------------------------------------------------------------------+
|0  |[Hi, I, heard, about, Spark]              |[Hi I heard, I heard about, heard about Spark]                                  |
|1  |[I, wish, Java, could, use, case, classes]|[I wish Java, wish Java could, Java could use, could use case, use case classes]|
|2  |[Logistic, regression, models, are, neat] |[Logistic regression models, regression models are, models are neat]            |
+---+------------------------------------------+--------------------------------------------------------------------------------+



## Binarizer

In [123]:
from pyspark.ml.feature import Binarizer

In [124]:
# Continuous values around the 0.5 threshold used by the Binarizer.
continuousDataFrame = spark.createDataFrame(
    data=[
        (0, 0.1),
        (1, 0.8),
        (2, 0.2),
    ],
    schema=["id", "feature"],
)

In [125]:
# Values above the 0.5 threshold map to 1.0 and the rest to 0.0 (see output: only 0.8 -> 1.0).
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

In [126]:
binarizedDataFrame = binarizer.transform(continuousDataFrame)

In [127]:
print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

Binarizer output with Threshold = 0.500000
+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    0.1|              0.0|
|  1|    0.8|              1.0|
|  2|    0.2|              0.0|
+---+-------+-----------------+



## PCA

In [128]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

In [129]:
# Three 5-dimensional vectors, one sparse and two dense, each wrapped in a
# one-element tuple so that it becomes a single-column row.
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]
df = spark.createDataFrame(data=data, schema=["features"])

In [132]:
df.toPandas()

Unnamed: 0,features
0,"(0.0, 1.0, 0.0, 7.0, 0.0)"
1,"[2.0, 0.0, 3.0, 4.0, 5.0]"
2,"[4.0, 0.0, 0.0, 6.0, 7.0]"


In [133]:
# Project the 5-dimensional vectors onto the top k=3 principal components.
pca = PCA(
    k=3,
    inputCol="features",
    outputCol="pcaFeatures",
)

In [134]:
model=pca.fit(df)

In [135]:
result = model.transform(df)

In [140]:
list(result.toPandas()['pcaFeatures'])

[DenseVector([1.6486, -4.0133, -5.5245]),
 DenseVector([-4.6451, -1.1168, -5.5245]),
 DenseVector([-6.4289, -5.338, -5.5245])]

In [141]:
result.show(truncate=False)

+---------------------+-----------------------------------------------------------+
|features             |pcaFeatures                                                |
+---------------------+-----------------------------------------------------------+
|(5,[1,3],[1.0,7.0])  |[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[2.0,0.0,3.0,4.0,5.0]|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[4.0,0.0,0.0,6.0,7.0]|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+---------------------+-----------------------------------------------------------+



## PolynomialExpansion

In [142]:
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

In [143]:
# Two-dimensional input vectors for the polynomial expansion.
df = spark.createDataFrame(
    data=[
        (Vectors.dense([2.0, 1.0]),),
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([3.0, -1.0]),),
    ],
    schema=["features"],
)

In [147]:
# degree=2 expands (x, y) into (x, x*x, y, x*y, y*y), as the output below shows.
polyExpansion = PolynomialExpansion(
    degree=2,
    inputCol="features",
    outputCol="polyFeatures",
)

In [148]:
polyDF = polyExpansion.transform(df)

In [149]:
polyDF.show(truncate=False)

+----------+-----------------------+
|features  |polyFeatures           |
+----------+-----------------------+
|[2.0,1.0] |[2.0,4.0,1.0,2.0,1.0]  |
|[0.0,0.0] |[0.0,0.0,0.0,0.0,0.0]  |
|[3.0,-1.0]|[3.0,9.0,-1.0,-3.0,1.0]|
+----------+-----------------------+



## Discrete Cosine Transform

In [150]:
from pyspark.ml.feature import DCT

In [151]:
from pyspark.ml.linalg import Vectors

In [152]:
# Length-4 real-valued sequences to run through the discrete cosine transform.
df = spark.createDataFrame(
    data=[
        (Vectors.dense([0.0, 1.0, -2.0, 3.0]),),
        (Vectors.dense([-1.0, 2.0, 4.0, -7.0]),),
        (Vectors.dense([14.0, -2.0, -5.0, 1.0]),),
    ],
    schema=["features"],
)

In [160]:
df.show()

+--------------------+
|            features|
+--------------------+
|  [0.0,1.0,-2.0,3.0]|
| [-1.0,2.0,4.0,-7.0]|
|[14.0,-2.0,-5.0,1.0]|
+--------------------+



In [153]:
# inverse=False selects the forward transform.
dct = DCT(
    inverse=False,
    inputCol="features",
    outputCol="featuresDCT",
)

In [158]:
dctDf=dct.transform(df)

In [159]:
dctDf.select("featuresDCT").show(truncate=False)

+----------------------------------------------------------------+
|featuresDCT                                                     |
+----------------------------------------------------------------+
|[1.0,-1.1480502970952693,2.0000000000000004,-2.7716385975338604]|
|[-1.0,3.378492794482933,-7.000000000000001,2.9301512653149677]  |
|[4.0,9.304453421915744,11.000000000000002,1.5579302036357163]   |
+----------------------------------------------------------------+



## StringIndexer

In [161]:
from pyspark.ml.feature import StringIndexer

In [162]:
# Category labels with unequal frequencies: a x3, c x2, b x1.
df = spark.createDataFrame(
    data=[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    schema=["id", "category"],
)

In [163]:
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [164]:
# Labels are indexed by descending frequency: in the output below the most
# common label "a" gets 0.0, "c" gets 1.0 and the rarest "b" gets 2.0.
indexer = StringIndexer(
    inputCol="category",
    outputCol="categoryIndex",
)

In [165]:
indexed = indexer.fit(df).transform(df)

In [166]:
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



## IndexToString

In [167]:
from pyspark.ml.feature import IndexToString,StringIndexer

In [168]:
# Same category data as in the StringIndexer section, rebuilt for the round trip.
df = spark.createDataFrame(
    data=[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    schema=["id", "category"],
)

In [169]:
df.show()

+---+--------+
| id|category|
+---+--------+
|  0|       a|
|  1|       b|
|  2|       c|
|  3|       a|
|  4|       a|
|  5|       c|
+---+--------+



In [170]:
# Index the string labels first; the IndexToString step that follows maps
# "categoryIndex" back to the original strings.
# NOTE: `indexer` and `model` rebind names already used by earlier sections.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed = model.transform(df)

In [172]:
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



In [174]:
# No explicit labels are passed, so the mapping back to strings has to come from
# the metadata that StringIndexer attached to the "categoryIndex" column.
converter = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
)

In [175]:
converted=converter.transform(indexed)

In [176]:
converted.show()

+---+--------+-------------+----------------+
| id|category|categoryIndex|originalCategory|
+---+--------+-------------+----------------+
|  0|       a|          0.0|               a|
|  1|       b|          2.0|               b|
|  2|       c|          1.0|               c|
|  3|       a|          0.0|               a|
|  4|       a|          0.0|               a|
|  5|       c|          1.0|               c|
+---+--------+-------------+----------------+



## one-hot

In [177]:
# Spark 3.0 removed the legacy OneHotEncoder transformer and renamed
# OneHotEncoderEstimator to OneHotEncoder. Fall back to the new name so this
# cell runs unchanged on Spark 2.3/2.4 and on 3.x.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# Two columns of already-indexed categories (values 0.0, 1.0, 2.0).
df = spark.createDataFrame([
    (0.0, 1.0),
    (1.0, 0.0),
    (2.0, 1.0),
    (0.0, 2.0),
    (0.0, 1.0),
    (2.0, 0.0)
], ["categoryIndex1", "categoryIndex2"])

# Multi-column estimator: fit() learns the number of categories per column.
# With the default settings the last category is dropped, so index 2.0 shows up
# as the all-zero vector (2,[],[]) in the output below.
encoder = OneHotEncoderEstimator(inputCols=["categoryIndex1", "categoryIndex2"],
                                 outputCols=["categoryVec1", "categoryVec2"])
model = encoder.fit(df)
encoded = model.transform(df)
encoded.show()

+--------------+--------------+-------------+-------------+
|categoryIndex1|categoryIndex2| categoryVec1| categoryVec2|
+--------------+--------------+-------------+-------------+
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           1.0|           0.0|(2,[1],[1.0])|(2,[0],[1.0])|
|           2.0|           1.0|    (2,[],[])|(2,[1],[1.0])|
|           0.0|           2.0|(2,[0],[1.0])|    (2,[],[])|
|           0.0|           1.0|(2,[0],[1.0])|(2,[1],[1.0])|
|           2.0|           0.0|    (2,[],[])|(2,[0],[1.0])|
+--------------+--------------+-------------+-------------+



In [178]:
encoded.toPandas()

Unnamed: 0,categoryIndex1,categoryIndex2,categoryVec1,categoryVec2
0,0.0,1.0,"(1.0, 0.0)","(0.0, 1.0)"
1,1.0,0.0,"(0.0, 1.0)","(1.0, 0.0)"
2,2.0,1.0,"(0.0, 0.0)","(0.0, 1.0)"
3,0.0,2.0,"(1.0, 0.0)","(0.0, 0.0)"
4,0.0,1.0,"(1.0, 0.0)","(0.0, 1.0)"
5,2.0,0.0,"(0.0, 0.0)","(1.0, 0.0)"


## VectorIndexer

In [179]:
from pyspark.ml.feature import VectorIndexer

In [181]:
# Load a LIBSVM file into a DataFrame with "label" and "features" columns.
# NOTE(review): the path is relative to the kernel's working directory --
# presumably the notebook was started from the Spark installation directory,
# where data/mllib/ lives; confirm, and note the same path is repeated in the
# StandardScaler section.
data=spark.read.format('libsvm').load('data/mllib/sample_libsvm_data.txt')

In [186]:
# data.toPandas()

In [182]:
# maxCategories=10: a vector slot with at most 10 distinct values is treated as
# categorical and re-indexed; slots with more distinct values stay continuous.
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)

In [183]:
indexerModel=indexer.fit(data)

In [193]:
categoricalFeatures = indexerModel.categoryMaps

In [194]:
# Report how many vector slots were judged categorical and list their indices
# (iterating the mapping yields its keys).
print("Chose %d categorical features: %s" % (
    len(categoricalFeatures),
    ", ".join(map(str, categoricalFeatures)),
))

Chose 351 categorical features: 645, 69, 365, 138, 101, 479, 333, 249, 0, 555, 666, 88, 170, 115, 276, 308, 5, 449, 120, 247, 614, 677, 202, 10, 56, 533, 142, 500, 340, 670, 174, 42, 417, 24, 37, 25, 257, 389, 52, 14, 504, 110, 587, 619, 196, 559, 638, 20, 421, 46, 93, 284, 228, 448, 57, 78, 29, 475, 164, 591, 646, 253, 106, 121, 84, 480, 147, 280, 61, 221, 396, 89, 133, 116, 1, 507, 312, 74, 307, 452, 6, 248, 60, 117, 678, 529, 85, 201, 220, 366, 534, 102, 334, 28, 38, 561, 392, 70, 424, 192, 21, 137, 165, 33, 92, 229, 252, 197, 361, 65, 97, 665, 583, 285, 224, 650, 615, 9, 53, 169, 593, 141, 610, 420, 109, 256, 225, 339, 77, 193, 669, 476, 642, 637, 590, 679, 96, 393, 647, 173, 13, 41, 503, 134, 73, 105, 2, 508, 311, 558, 674, 530, 586, 618, 166, 32, 34, 148, 45, 161, 279, 64, 689, 17, 149, 584, 562, 176, 423, 191, 22, 44, 59, 118, 281, 27, 641, 71, 391, 12, 445, 54, 313, 611, 144, 49, 335, 86, 672, 172, 113, 681, 219, 419, 81, 230, 362, 451, 76, 7, 39, 649, 98, 616, 477, 367, 535, 1

In [195]:
indexedData = indexerModel.transform(data)
indexedData.show()

+-----+--------------------+--------------------+
|label|            features|             indexed|
+-----+--------------------+--------------------+
|  0.0|(692,[127,128,129...|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|(692,[124,125,126...|


## Normalizer

In [196]:
from pyspark.ml.feature import Normalizer

In [197]:
from pyspark.ml.linalg import Vectors

In [209]:
# Three dense 3-dimensional vectors to be normalised row by row.
dataFrame = spark.createDataFrame(
    data=[
        (0, Vectors.dense([1.0, 0.5, -1.0]),),
        (1, Vectors.dense([2.0, 1.0, 1.0]),),
        (2, Vectors.dense([4.0, 10.0, 2.0]),),
    ],
    schema=["id", "features"],
)

In [213]:
# p=1.0 selects the L1 norm: each row is divided by the sum of its absolute
# values, e.g. [1.0, 0.5, -1.0] / 2.5 = [0.4, 0.2, -0.4] in the output below.
normalizer = Normalizer(
    inputCol="features",
    outputCol="normFeatures",
    p=1.0,
)

In [214]:
l1NormData=normalizer.transform(dataFrame)

In [215]:
l1NormData.show()

+---+--------------+------------------+
| id|      features|      normFeatures|
+---+--------------+------------------+
|  0|[1.0,0.5,-1.0]|    [0.4,0.2,-0.4]|
|  1| [2.0,1.0,1.0]|   [0.5,0.25,0.25]|
|  2|[4.0,10.0,2.0]|[0.25,0.625,0.125]|
+---+--------------+------------------+



## StandardScaler

In [216]:
from pyspark.ml.feature import StandardScaler

In [217]:
# Re-load the same LIBSVM sample used in the VectorIndexer section.
# NOTE(review): relative path -- depends on the kernel's working directory.
dataFrame = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

In [220]:
# Scale every feature to unit standard deviation without centring it
# (withMean=False), so the sparse vectors stay sparse, as the output below shows.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

In [221]:
scalerModel=scaler.fit(dataFrame)

In [222]:
scaledData=scalerModel.transform(dataFrame)

In [229]:
dff=scaledData.toPandas()

In [234]:
list(dff['features'].head(1))

[SparseVector(692, {127: 51.0, 128: 159.0, 129: 253.0, 130: 159.0, 131: 50.0, 154: 48.0, 155: 238.0, 156: 252.0, 157: 252.0, 158: 252.0, 159: 237.0, 181: 54.0, 182: 227.0, 183: 253.0, 184: 252.0, 185: 239.0, 186: 233.0, 187: 252.0, 188: 57.0, 189: 6.0, 207: 10.0, 208: 60.0, 209: 224.0, 210: 252.0, 211: 253.0, 212: 252.0, 213: 202.0, 214: 84.0, 215: 252.0, 216: 253.0, 217: 122.0, 235: 163.0, 236: 252.0, 237: 252.0, 238: 252.0, 239: 253.0, 240: 252.0, 241: 252.0, 242: 96.0, 243: 189.0, 244: 253.0, 245: 167.0, 262: 51.0, 263: 238.0, 264: 253.0, 265: 253.0, 266: 190.0, 267: 114.0, 268: 253.0, 269: 228.0, 270: 47.0, 271: 79.0, 272: 255.0, 273: 168.0, 289: 48.0, 290: 238.0, 291: 252.0, 292: 252.0, 293: 179.0, 294: 12.0, 295: 75.0, 296: 121.0, 297: 21.0, 300: 253.0, 301: 243.0, 302: 50.0, 316: 38.0, 317: 165.0, 318: 253.0, 319: 233.0, 320: 208.0, 321: 84.0, 328: 253.0, 329: 252.0, 330: 165.0, 343: 7.0, 344: 178.0, 345: 252.0, 346: 240.0, 347: 71.0, 348: 19.0, 349: 28.0, 356: 253.0, 357: 252.0

In [233]:
list(dff['scaledFeatures'].head(1))

[SparseVector(692, {127: 0.5468, 128: 1.5923, 129: 2.4354, 130: 1.7081, 131: 0.7335, 154: 0.4346, 155: 2.0985, 156: 2.2563, 157: 2.2368, 158: 2.2269, 159: 2.2555, 181: 0.4713, 182: 2.0575, 183: 2.3318, 184: 2.3761, 185: 2.1237, 186: 2.0452, 187: 2.2657, 188: 0.6339, 189: 0.1022, 207: 0.1056, 208: 0.5395, 209: 1.9268, 210: 2.2383, 211: 2.3018, 212: 2.3568, 213: 1.8002, 214: 0.7116, 215: 2.2256, 216: 2.4032, 217: 1.5931, 235: 1.5394, 236: 2.188, 237: 2.1493, 238: 2.2924, 239: 2.3889, 240: 2.3155, 241: 2.2653, 242: 0.8445, 243: 1.7094, 244: 2.2496, 245: 1.8613, 262: 0.5062, 263: 2.0796, 264: 2.2201, 265: 2.199, 266: 1.7299, 267: 1.083, 268: 2.1786, 269: 2.0345, 270: 0.4392, 271: 0.7218, 272: 2.2177, 273: 1.6764, 289: 0.4794, 290: 2.214, 291: 2.3569, 292: 2.2283, 293: 1.6322, 294: 0.1087, 295: 0.6833, 296: 1.0411, 297: 0.1941, 300: 2.277, 301: 2.3083, 302: 0.5395, 316: 0.3967, 317: 1.6059, 318: 2.3539, 319: 2.1535, 320: 1.9834, 321: 0.8017, 328: 2.4941, 329: 2.3661, 330: 1.7473, 343: 0.076

## MinMaxScaler

In [235]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

In [236]:
# Small dense dataset whose per-feature minimum and maximum are easy to read off.
dataFrame = spark.createDataFrame(
    data=[
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),),
    ],
    schema=["id", "features"],
)

In [237]:
dataFrame.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  2|[3.0,10.1,3.0]|
+---+--------------+



In [238]:
# Rescale each feature linearly; with the default bounds the per-feature minimum
# maps to 0.0 and the maximum to 1.0 (see output below).
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [239]:
scalerModel=scaler.fit(dataFrame)

In [240]:
scaledData=scalerModel.transform(dataFrame)

In [241]:
scaledData.show()

+---+--------------+--------------+
| id|      features|scaledFeatures|
+---+--------------+--------------+
|  0|[1.0,0.1,-1.0]| [0.0,0.0,0.0]|
|  1| [2.0,1.1,1.0]| [0.5,0.1,0.5]|
|  2|[3.0,10.1,3.0]| [1.0,1.0,1.0]|
+---+--------------+--------------+



## MaxAbsScaler

In [242]:
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

In [243]:
# Dense vectors with mixed signs; the largest absolute values per feature are 4, 10 and 8.
dataFrame = spark.createDataFrame(
    data=[
        (0, Vectors.dense([1.0, 0.1, -8.0]),),
        (1, Vectors.dense([2.0, 1.0, -4.0]),),
        (2, Vectors.dense([4.0, 10.0, 8.0]),),
    ],
    schema=["id", "features"],
)

In [244]:
# Divide each feature by its largest absolute value, mapping it into [-1, 1]
# without shifting it (signs are preserved in the output below).
scaler = MaxAbsScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

In [245]:
scalerModel=scaler.fit(dataFrame)

In [246]:
scaledData = scalerModel.transform(dataFrame)

scaledData.select("features", "scaledFeatures").show()

+--------------+----------------+
|      features|  scaledFeatures|
+--------------+----------------+
|[1.0,0.1,-8.0]|[0.25,0.01,-1.0]|
|[2.0,1.0,-4.0]|  [0.5,0.1,-0.5]|
|[4.0,10.0,8.0]|   [1.0,1.0,1.0]|
+--------------+----------------+



## Bucketizer

In [247]:
from pyspark.ml.feature import Bucketizer

# Bucket boundaries; the infinities at both ends make the outer buckets open-ended.
splits = [float("-inf"), -0.5, 0.0, 0.5, float("inf")]

# One-column rows covering every bucket, including values that sit exactly on a boundary.
data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data=data, schema=["features"])

bucketizer = Bucketizer(
    splits=splits,
    inputCol="features",
    outputCol="bucketedFeatures",
)

# Replace every value by the index of the bucket it falls into; a value equal
# to a boundary lands in the bucket that starts there (-0.5 -> 1.0, 0.0 -> 2.0).
bucketedData = bucketizer.transform(dataFrame)

# n split points delimit n - 1 buckets.
print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



## ElementwiseProduct

In [248]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

In [249]:
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]

In [251]:
print(data)

[(DenseVector([1.0, 2.0, 3.0]),), (DenseVector([4.0, 5.0, 6.0]),)]


In [252]:
df = spark.createDataFrame(data, ["vector"])

In [253]:
df.show()

+-------------+
|       vector|
+-------------+
|[1.0,2.0,3.0]|
|[4.0,5.0,6.0]|
+-------------+



In [254]:
# Multiply each input vector component-wise by scalingVec; the 0.0 in the first
# slot zeroes that component in every row (see output below).
# NOTE: `transformer` was bound to scikit-learn's TfidfTransformer earlier in the notebook.
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector", outputCol="transformedVector")

In [255]:
transformer.transform(df).show()

+-------------+-----------------+
|       vector|transformedVector|
+-------------+-----------------+
|[1.0,2.0,3.0]|    [0.0,2.0,6.0]|
|[4.0,5.0,6.0]|   [0.0,5.0,12.0]|
+-------------+-----------------+



## SQLTransformer

In [256]:
from pyspark.ml.feature import SQLTransformer

In [257]:
# Two numeric columns (v1, v2) that the SQL statement combines.
df = spark.createDataFrame(
    data=[
        (0, 1.0, 3.0),
        (2, 2.0, 5.0),
    ],
    schema=["id", "v1", "v2"],
)

In [258]:
df.show()

+---+---+---+
| id| v1| v2|
+---+---+---+
|  0|1.0|3.0|
|  2|2.0|5.0|
+---+---+---+



In [259]:
# __THIS__ is the placeholder for the DataFrame passed to transform(); the
# statement keeps all input columns and appends v3 = v1 + v2 and v4 = v1 * v2.
sqlTrans = SQLTransformer(
    statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")


In [261]:
sqlTrans.transform(df).show()

+---+---+---+---+----+
| id| v1| v2| v3|  v4|
+---+---+---+---+----+
|  0|1.0|3.0|4.0| 3.0|
|  2|2.0|5.0|7.0|10.0|
+---+---+---+---+----+



## VectorAssembler

In [262]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [263]:
# A single row mixing scalar columns with a vector column.
dataset = spark.createDataFrame(
    data=[
        (0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
    ],
    schema=["id", "hour", "mobile", "userFeatures", "clicked"],
)

In [264]:
dataset.show()

+---+----+------+--------------+-------+
| id|hour|mobile|  userFeatures|clicked|
+---+----+------+--------------+-------+
|  0|  18|   1.0|[0.0,10.0,0.5]|    1.0|
+---+----+------+--------------+-------+



In [265]:
# Concatenate the listed columns, in this order, into one vector column; the
# vector-valued "userFeatures" is flattened into the result (see output below).
assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

In [266]:
output = assembler.transform(dataset)

In [267]:
output.show()

+---+----+------+--------------+-------+--------------------+
| id|hour|mobile|  userFeatures|clicked|            features|
+---+----+------+--------------+-------+--------------------+
|  0|  18|   1.0|[0.0,10.0,0.5]|    1.0|[18.0,1.0,0.0,10....|
+---+----+------+--------------+-------+--------------------+



## QuantileDiscretizer

In [268]:
from pyspark.ml.feature import QuantileDiscretizer

In [269]:
# Rows as (id, hour) pairs.
data = [
    (0, 18.0),
    (1, 19.0),
    (2, 8.0),
    (3, 5.0),
    (4, 2.2),
]

In [270]:
df = spark.createDataFrame(data, ["id", "hour"])

In [271]:
df.show()

+---+----+
| id|hour|
+---+----+
|  0|18.0|
|  1|19.0|
|  2| 8.0|
|  3| 5.0|
|  4| 2.2|
+---+----+



In [272]:
# Split "hour" into 3 quantile-based buckets; the output column holds the bucket
# index (0.0, 1.0 or 2.0 in the output below).
discretizer = QuantileDiscretizer(numBuckets=3, inputCol="hour", outputCol="result")

In [273]:
result = discretizer.fit(df).transform(df)

In [274]:
result.show()

+---+----+------+
| id|hour|result|
+---+----+------+
|  0|18.0|   2.0|
|  1|19.0|   2.0|
|  2| 8.0|   1.0|
|  3| 5.0|   1.0|
|  4| 2.2|   0.0|
+---+----+------+



## Imputer

In [275]:
from pyspark.ml.feature import Imputer

In [276]:
# Two float columns in which NaN marks the missing entries to be imputed.
df = spark.createDataFrame(
    data=[
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0),
    ],
    schema=["a", "b"],
)

In [277]:
# No strategy is given, so the default (column mean) applies: in the output
# below NaN in "a" becomes 3.0 (mean of 1, 2, 4, 5) and NaN in "b" becomes 4.0.
imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])

In [278]:
model = imputer.fit(df)

In [279]:
model.transform(df).show()

+---+---+-----+-----+
|  a|  b|out_a|out_b|
+---+---+-----+-----+
|1.0|NaN|  1.0|  4.0|
|2.0|NaN|  2.0|  4.0|
|NaN|3.0|  3.0|  3.0|
|4.0|4.0|  4.0|  4.0|
|5.0|5.0|  5.0|  5.0|
+---+---+-----+-----+



# Feature Selectors

## VectorSlicer

In [283]:
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

In [284]:
# The same 3-dimensional vector stored once as sparse and once as dense.
df = spark.createDataFrame(
    data=[
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0])),
    ],
)

In [286]:
df.toPandas()

Unnamed: 0,userFeatures
0,"(-2.0, 2.3, 0.0)"
1,"[-2.0, 2.3, 0.0]"


In [291]:
# Keep only positions 1 and 2 of each vector; the sparse input stays sparse and
# the dense input stays dense in the output below.
slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1,2])

In [292]:
output = slicer.transform(df)

In [293]:
output.toPandas()

Unnamed: 0,userFeatures,features
0,"(-2.0, 2.3, 0.0)","(2.3, 0.0)"
1,"[-2.0, 2.3, 0.0]","[2.3, 0.0]"


In [294]:
output.show()

+--------------------+-------------+
|        userFeatures|     features|
+--------------------+-------------+
|(3,[0,1],[-2.0,2.3])|(2,[0],[2.3])|
|      [-2.0,2.3,0.0]|    [2.3,0.0]|
+--------------------+-------------+



## ChiSqSelector

In [295]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

In [296]:
# Rows of (id, 4-dimensional feature vector, binary "clicked" label).
df = spark.createDataFrame(
    data=[
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,),
    ],
    schema=["id", "features", "clicked"],
)

In [300]:
# Keep the 2 features ranked highest by a chi-squared test against the "clicked"
# label; in the output below these are the last two vector components.
selector = ChiSqSelector(numTopFeatures=2, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

In [301]:
result = selector.fit(df).transform(df)

In [302]:
result.show()

+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|      [18.0,1.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|      [12.0,0.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|      [15.0,0.1]|
+---+------------------+-------+----------------+

