# Spark - Machine Learning

# Machine Learning

In [1]:
from pyspark.sql import SparkSession 
spark = SparkSession\
        .builder\
        .appName("Python")\
        .getOrCreate()

In [2]:
from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)

In [3]:
denseVec.values

array([1., 2., 3.])

In [4]:
sparseVec.toArray()

array([0., 2., 3.])

In [5]:
df = spark.read.json("data/simple-ml")
df.orderBy("value2").show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|  red|good|    35|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|  red| bad|     2|14.386294994851129|
| blue| bad|     8|14.386294994851129|
|  red| bad|    16|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")

In [7]:
fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show(5)

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
+-----+----+------+------------------+--------------------+-----+
only showing top 5 rows



In [8]:
train, test = preparedDF.randomSplit([0.7, 0.3])

In [9]:
train.show(5)

+-----+---+------+------------------+--------------------+-----+
|color|lab|value1|            value2|            features|label|
+-----+---+------+------------------+--------------------+-----+
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue|bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
+-----+---+------+------------------+--------------------+-----+
only showing top 5 rows



In [10]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="label",featuresCol="features")

In [11]:
type(train['features'])

pyspark.sql.column.Column

In [None]:
print (lr.explainParams())

In [12]:
fittedLR = lr.fit(train)

In [13]:
train, test = df.randomSplit([0.7, 0.3])

In [14]:
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

In [15]:
from pyspark.ml import Pipeline
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

In [16]:
from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
  .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"])\
  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")

In [18]:
from pyspark.ml.tuning import TrainValidationSplit
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)

In [19]:
tvsFitted = tvs.fit(train) # cria o modelo de Machine Lerning

In [20]:
evaluator.evaluate(tvsFitted.transform(test))

0.9583333333333333

## Preprocessing and Feature Engeneering

In [21]:
sales = spark.read.format("csv")\
  .option("header", "true")\
  .option("inferSchema", "true")\
  .load("data/retail-data/by-day/*.csv")\
  .coalesce(5)\
  .where("Description IS NOT NULL")
fakeIntDF = spark.read.parquet("../data/simple-ml-integers")
simpleDF = spark.read.json("../data/simple-ml")
scaleDF = spark.read.parquet("../data/simple-ml-scaling")

In [22]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [23]:
sales.count()

540455

In [24]:
simpleDF.show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green|good|    15| 38.97187133755819|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



In [25]:
fakeIntDF.show(5)

+----+----+----+
|int1|int2|int3|
+----+----+----+
|   1|   2|   3|
|   7|   8|   9|
|   4|   5|   6|
+----+----+----+



In [26]:
scaleDF.show(5)

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  1|[3.0,10.1,3.0]|
+---+--------------+



In [27]:
from pyspark.ml.feature import RFormula

supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")
supervised.fit(simpleDF).transform(simpleDF).show(5)

+-----+----+------+------------------+--------------------+-----+
|color| lab|value1|            value2|            features|label|
+-----+----+------+------------------+--------------------+-----+
|green|good|     1|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
| blue| bad|     8|14.386294994851129|(10,[2,3,6,9],[8....|  0.0|
| blue| bad|    12|14.386294994851129|(10,[2,3,6,9],[12...|  0.0|
|green|good|    15| 38.97187133755819|(10,[1,2,3,5,8],[...|  1.0|
|green|good|    12|14.386294994851129|(10,[1,2,3,5,8],[...|  1.0|
+-----+----+------+------------------+--------------------+-----+
only showing top 5 rows



In [28]:
from pyspark.ml.feature import SQLTransformer

basicTransformation = SQLTransformer()\
  .setStatement("""
    SELECT sum(Quantity), count(*), CustomerID
    FROM __THIS__
    GROUP BY CustomerID
  """)

basicTransformation.transform(sales).show(5)

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|          119|      62|   14452.0|
|          440|     143|   16916.0|
|          630|      72|   17633.0|
|           34|       6|   14768.0|
|         1542|      30|   13094.0|
+-------------+--------+----------+
only showing top 5 rows



In [29]:
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["int1", "int2", "int3"])
va.transform(fakeIntDF).show()

+----+----+----+--------------------------------------------+
|int1|int2|int3|VectorAssembler_4807ac08488b444b2cc4__output|
+----+----+----+--------------------------------------------+
|   1|   2|   3|                               [1.0,2.0,3.0]|
|   7|   8|   9|                               [7.0,8.0,9.0]|
|   4|   5|   6|                               [4.0,5.0,6.0]|
+----+----+----+--------------------------------------------+



In [30]:
contDF = spark.range(20).selectExpr("cast(id as double)")
contDF.show(5)

+---+
| id|
+---+
|0.0|
|1.0|
|2.0|
|3.0|
|4.0|
+---+
only showing top 5 rows



In [31]:
from pyspark.ml.feature import Bucketizer
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer().setSplits(bucketBorders).setInputCol("id")
bucketer.transform(contDF).show()

+----+---------------------------------------+
|  id|Bucketizer_4e7fa983a35a09c51334__output|
+----+---------------------------------------+
| 0.0|                                    0.0|
| 1.0|                                    0.0|
| 2.0|                                    0.0|
| 3.0|                                    0.0|
| 4.0|                                    0.0|
| 5.0|                                    1.0|
| 6.0|                                    1.0|
| 7.0|                                    1.0|
| 8.0|                                    1.0|
| 9.0|                                    1.0|
|10.0|                                    2.0|
|11.0|                                    2.0|
|12.0|                                    2.0|
|13.0|                                    2.0|
|14.0|                                    2.0|
|15.0|                                    2.0|
|16.0|                                    2.0|
|17.0|                                    2.0|
|18.0|       

In [32]:
from pyspark.ml.feature import QuantileDiscretizer
bucketer = QuantileDiscretizer().setNumBuckets(5).setInputCol("id")
fittedBucketer = bucketer.fit(contDF)
fittedBucketer.transform(contDF).show()

+----+------------------------------------------------+
|  id|QuantileDiscretizer_4d1fb96190b2a520bd4c__output|
+----+------------------------------------------------+
| 0.0|                                             0.0|
| 1.0|                                             0.0|
| 2.0|                                             0.0|
| 3.0|                                             1.0|
| 4.0|                                             1.0|
| 5.0|                                             1.0|
| 6.0|                                             1.0|
| 7.0|                                             2.0|
| 8.0|                                             2.0|
| 9.0|                                             2.0|
|10.0|                                             2.0|
|11.0|                                             2.0|
|12.0|                                             3.0|
|13.0|                                             3.0|
|14.0|                                          

In [33]:
scaleDF.show(5)

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  1|[3.0,10.1,3.0]|
+---+--------------+



In [34]:
from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(scaleDF).transform(scaleDF).show()

+---+--------------+-------------------------------------------+
| id|      features|StandardScaler_4c3fbca61cb6ef3694b8__output|
+---+--------------+-------------------------------------------+
|  0|[1.0,0.1,-1.0]|                       [1.19522860933439...|
|  1| [2.0,1.1,1.0]|                       [2.39045721866878...|
|  0|[1.0,0.1,-1.0]|                       [1.19522860933439...|
|  1| [2.0,1.1,1.0]|                       [2.39045721866878...|
|  1|[3.0,10.1,3.0]|                       [3.58568582800318...|
+---+--------------+-------------------------------------------+



In [35]:
from pyspark.ml.feature import MinMaxScaler
minMax = MinMaxScaler().setMin(5).setMax(10).setInputCol("features")
fittedminMax = minMax.fit(scaleDF)
fittedminMax.transform(scaleDF).show()

+---+--------------+-----------------------------------------+
| id|      features|MinMaxScaler_44289355c35bb3be91d8__output|
+---+--------------+-----------------------------------------+
|  0|[1.0,0.1,-1.0]|                            [5.0,5.0,5.0]|
|  1| [2.0,1.1,1.0]|                            [7.5,5.5,7.5]|
|  0|[1.0,0.1,-1.0]|                            [5.0,5.0,5.0]|
|  1| [2.0,1.1,1.0]|                            [7.5,5.5,7.5]|
|  1|[3.0,10.1,3.0]|                         [10.0,10.0,10.0]|
+---+--------------+-----------------------------------------+



In [36]:
from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

+---+--------------+-----------------------------------------+
| id|      features|MaxAbsScaler_4669bdbda04504bacac2__output|
+---+--------------+-----------------------------------------+
|  0|[1.0,0.1,-1.0]|                     [0.33333333333333...|
|  1| [2.0,1.1,1.0]|                     [0.66666666666666...|
|  0|[1.0,0.1,-1.0]|                     [0.33333333333333...|
|  1| [2.0,1.1,1.0]|                     [0.66666666666666...|
|  1|[3.0,10.1,3.0]|                            [1.0,1.0,1.0]|
+---+--------------+-----------------------------------------+



In [37]:
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

+---+--------------+-----------------------------------------------+
| id|      features|ElementwiseProduct_485bb350325092783a70__output|
+---+--------------+-----------------------------------------------+
|  0|[1.0,0.1,-1.0]|                               [10.0,1.5,-20.0]|
|  1| [2.0,1.1,1.0]|                               [20.0,16.5,20.0]|
|  0|[1.0,0.1,-1.0]|                               [10.0,1.5,-20.0]|
|  1| [2.0,1.1,1.0]|                               [20.0,16.5,20.0]|
|  1|[3.0,10.1,3.0]|                              [30.0,151.5,60.0]|
+---+--------------+-----------------------------------------------+



In [40]:
from pyspark.ml.feature import Normalizer, StringIndexer
manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

+---+--------------+---------------------------------------+
| id|      features|Normalizer_42ef825fcb5f9158049b__output|
+---+--------------+---------------------------------------+
|  0|[1.0,0.1,-1.0]|                   [0.47619047619047...|
|  1| [2.0,1.1,1.0]|                   [0.48780487804878...|
|  0|[1.0,0.1,-1.0]|                   [0.47619047619047...|
|  1| [2.0,1.1,1.0]|                   [0.48780487804878...|
|  1|[3.0,10.1,3.0]|                   [0.18633540372670...|
+---+--------------+---------------------------------------+



In [41]:
valIndexer = StringIndexer().setInputCol("value1").setOutputCol("valueInd")
valIndexer.fit(simpleDF).transform(simpleDF).show()

+-----+----+------+------------------+--------+
|color| lab|value1|            value2|valueInd|
+-----+----+------+------------------+--------+
|green|good|     1|14.386294994851129|     2.0|
| blue| bad|     8|14.386294994851129|     4.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     5.0|
|green|good|    12|14.386294994851129|     0.0|
|green| bad|    16|14.386294994851129|     1.0|
|  red|good|    35|14.386294994851129|     6.0|
|  red| bad|     1| 38.97187133755819|     2.0|
|  red| bad|     2|14.386294994851129|     7.0|
|  red| bad|    16|14.386294994851129|     1.0|
|  red|good|    45| 38.97187133755819|     3.0|
|green|good|     1|14.386294994851129|     2.0|
| blue| bad|     8|14.386294994851129|     4.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     5.0|
|green|good|    12|14.386294994851129|     0.0|
|green| bad|    16|14.386294994851129|     1.0|
|  red|good|    35|14.386294994851129|  

In [42]:
from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
idxRes = lblIndxr.fit(simpleDF).transform(simpleDF)
idxRes.show()

+-----+----+------+------------------+--------+
|color| lab|value1|            value2|labelInd|
+-----+----+------+------------------+--------+
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|     1.0|
|  red| bad|     1| 38.97187133755819|     0.0|
|  red| bad|     2|14.386294994851129|     0.0|
|  red| bad|    16|14.386294994851129|     0.0|
|  red|good|    45| 38.97187133755819|     1.0|
|green|good|     1|14.386294994851129|     1.0|
| blue| bad|     8|14.386294994851129|     0.0|
| blue| bad|    12|14.386294994851129|     0.0|
|green|good|    15| 38.97187133755819|     1.0|
|green|good|    12|14.386294994851129|     1.0|
|green| bad|    16|14.386294994851129|     0.0|
|  red|good|    35|14.386294994851129|  

In [43]:
from pyspark.ml.feature import IndexToString
labelReverse = IndexToString().setInputCol("labelInd").setOutputCol("label_str")
labelReverse.transform(idxRes).show()

+-----+----+------+------------------+--------+---------+
|color| lab|value1|            value2|labelInd|label_str|
+-----+----+------+------------------+--------+---------+
|green|good|     1|14.386294994851129|     1.0|     good|
| blue| bad|     8|14.386294994851129|     0.0|      bad|
| blue| bad|    12|14.386294994851129|     0.0|      bad|
|green|good|    15| 38.97187133755819|     1.0|     good|
|green|good|    12|14.386294994851129|     1.0|     good|
|green| bad|    16|14.386294994851129|     0.0|      bad|
|  red|good|    35|14.386294994851129|     1.0|     good|
|  red| bad|     1| 38.97187133755819|     0.0|      bad|
|  red| bad|     2|14.386294994851129|     0.0|      bad|
|  red| bad|    16|14.386294994851129|     0.0|      bad|
|  red|good|    45| 38.97187133755819|     1.0|     good|
|green|good|     1|14.386294994851129|     1.0|     good|
| blue| bad|     8|14.386294994851129|     0.0|      bad|
| blue| bad|    12|14.386294994851129|     0.0|      bad|
|green|good|  

In [44]:
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors
idxIn = spark.createDataFrame([
  (Vectors.dense(1, 2, 3),1),
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]).toDF("features", "label")
idxIn.show()

+-------------+-----+
|     features|label|
+-------------+-----+
|[1.0,2.0,3.0]|    1|
|[2.0,5.0,6.0]|    2|
|[1.0,8.0,9.0]|    3|
+-------------+-----+



In [45]:
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

+-------------+-----+-------------+
|     features|label|        idxed|
+-------------+-----+-------------+
|[1.0,2.0,3.0]|    1|[0.0,2.0,3.0]|
|[2.0,5.0,6.0]|    2|[1.0,5.0,6.0]|
|[1.0,8.0,9.0]|    3|[0.0,8.0,9.0]|
+-------------+-----+-------------+



In [46]:
simpleDF.show(5)

+-----+----+------+------------------+
|color| lab|value1|            value2|
+-----+----+------+------------------+
|green|good|     1|14.386294994851129|
| blue| bad|     8|14.386294994851129|
| blue| bad|    12|14.386294994851129|
|green|good|    15| 38.97187133755819|
|green|good|    12|14.386294994851129|
+-----+----+------+------------------+
only showing top 5 rows



In [47]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
print(colorLab.show())
ohe = OneHotEncoder().setInputCol("colorInd")
print(ohe.transform(colorLab).show())

+-----+--------+
|color|colorInd|
+-----+--------+
|green|     1.0|
| blue|     2.0|
| blue|     2.0|
|green|     1.0|
|green|     1.0|
|green|     1.0|
|  red|     0.0|
|  red|     0.0|
|  red|     0.0|
|  red|     0.0|
|  red|     0.0|
|green|     1.0|
| blue|     2.0|
| blue|     2.0|
|green|     1.0|
|green|     1.0|
|green|     1.0|
|  red|     0.0|
|  red|     0.0|
|  red|     0.0|
+-----+--------+
only showing top 20 rows

None
+-----+--------+------------------------------------------+
|color|colorInd|OneHotEncoder_46f382a1c2d71f995fa1__output|
+-----+--------+------------------------------------------+
|green|     1.0|                             (2,[1],[1.0])|
| blue|     2.0|                                 (2,[],[])|
| blue|     2.0|                                 (2,[],[])|
|green|     1.0|                             (2,[1],[1.0])|
|green|     1.0|                             (2,[1],[1.0])|
|green|     1.0|                             (2,[1],[1.0])|
|  red|     0.0|     

In [48]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 5 rows



In [49]:
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(5, False)

+-------------------------------+-------------------------------------+
|Description                    |DescOut                              |
+-------------------------------+-------------------------------------+
|RABBIT NIGHT LIGHT             |[rabbit, night, light]               |
|DOUGHNUT LIP GLOSS             |[doughnut, lip, gloss]               |
|12 MESSAGE CARDS WITH ENVELOPES|[12, message, cards, with, envelopes]|
|BLUE HARMONICA IN BOX          |[blue, harmonica, in, box]           |
|GUMBALL COAT RACK              |[gumball, coat, rack]                |
+-------------------------------+-------------------------------------+
only showing top 5 rows



In [50]:
from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(5, False)

+-------------------------------+-------------------------------------+
|Description                    |DescOut                              |
+-------------------------------+-------------------------------------+
|RABBIT NIGHT LIGHT             |[rabbit, night, light]               |
|DOUGHNUT LIP GLOSS             |[doughnut, lip, gloss]               |
|12 MESSAGE CARDS WITH ENVELOPES|[12, message, cards, with, envelopes]|
|BLUE HARMONICA IN BOX          |[blue, harmonica, in, box]           |
|GUMBALL COAT RACK              |[gumball, coat, rack]                |
+-------------------------------+-------------------------------------+
only showing top 5 rows



In [51]:
from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, True)

+--------------------+------------------+
|         Description|           DescOut|
+--------------------+------------------+
|  RABBIT NIGHT LIGHT|            [ ,  ]|
| DOUGHNUT LIP GLOSS |         [ ,  ,  ]|
|12 MESSAGE CARDS ...|      [ ,  ,  ,  ]|
|BLUE HARMONICA IN...|      [ ,  ,  ,  ]|
|   GUMBALL COAT RACK|            [ ,  ]|
|SKULLS  WATER TRA...|   [ ,  ,  ,  ,  ]|
|FELTCRAFT GIRL AM...|         [ ,  ,  ]|
|CAMOUFLAGE LED TORCH|            [ ,  ]|
|WHITE SKULL HOT W...|   [ ,  ,  ,  ,  ]|
|ENGLISH ROSE HOT ...|      [ ,  ,  ,  ]|
|HOT WATER BOTTLE ...|      [ ,  ,  ,  ]|
|SCOTTIE DOG HOT W...|      [ ,  ,  ,  ]|
|ROSE CARAVAN DOOR...|            [ ,  ]|
|GINGHAM HEART  DO...|      [ ,  ,  ,  ]|
|STORAGE TIN VINTA...|         [ ,  ,  ]|
|SET OF 4 KNICK KN...|[ ,  ,  ,  ,  ,  ]|
|      POPCORN HOLDER|               [ ]|
|GROW A FLYTRAP OR...|[ ,  ,  ,  ,  ,  ]|
|AIRLINE BAG VINTA...|   [ ,  ,  ,  ,  ]|
|AIRLINE BAG VINTA...|   [ ,  ,  ,  ,  ]|
+--------------------+------------

In [52]:
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")\
  .setOutputCol("Output_no_stopwords")
stops.transform(tokenized).show()

+--------------------+--------------------+--------------------+
|         Description|             DescOut| Output_no_stopwords|
+--------------------+--------------------+--------------------+
|  RABBIT NIGHT LIGHT|[rabbit, night, l...|[rabbit, night, l...|
| DOUGHNUT LIP GLOSS |[doughnut, lip, g...|[doughnut, lip, g...|
|12 MESSAGE CARDS ...|[12, message, car...|[12, message, car...|
|BLUE HARMONICA IN...|[blue, harmonica,...|[blue, harmonica,...|
|   GUMBALL COAT RACK|[gumball, coat, r...|[gumball, coat, r...|
|SKULLS  WATER TRA...|[skulls, , water,...|[skulls, , water,...|
|FELTCRAFT GIRL AM...|[feltcraft, girl,...|[feltcraft, girl,...|
|CAMOUFLAGE LED TORCH|[camouflage, led,...|[camouflage, led,...|
|WHITE SKULL HOT W...|[white, skull, ho...|[white, skull, ho...|
|ENGLISH ROSE HOT ...|[english, rose, h...|[english, rose, h...|
|HOT WATER BOTTLE ...|[hot, water, bott...|[hot, water, bott...|
|SCOTTIE DOG HOT W...|[scottie, dog, ho...|[scottie, dog, ho...|
|ROSE CARAVAN DOOR...|[ro

In [53]:
from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(10,False)
bigram.transform(tokenized.select("DescOut")).show(10,False)

+-------------------------------------+-------------------------------------+
|DescOut                              |NGram_486f8411644dc52295d8__output   |
+-------------------------------------+-------------------------------------+
|[rabbit, night, light]               |[rabbit, night, light]               |
|[doughnut, lip, gloss]               |[doughnut, lip, gloss]               |
|[12, message, cards, with, envelopes]|[12, message, cards, with, envelopes]|
|[blue, harmonica, in, box]           |[blue, harmonica, in, box]           |
|[gumball, coat, rack]                |[gumball, coat, rack]                |
|[skulls, , water, transfer, tattoos] |[skulls, , water, transfer, tattoos] |
|[feltcraft, girl, amelie, kit]       |[feltcraft, girl, amelie, kit]       |
|[camouflage, led, torch]             |[camouflage, led, torch]             |
|[white, skull, hot, water, bottle]   |[white, skull, hot, water, bottle]   |
|[english, rose, hot, water, bottle]  |[english, rose, hot, wate

In [54]:
# Converts word into numerical representations
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
dfcv = fittedCV.transform(tokenized)
dfcv.show(5)

+--------------------+--------------------+--------------------+
|         Description|             DescOut|            countVec|
+--------------------+--------------------+--------------------+
|  RABBIT NIGHT LIGHT|[rabbit, night, l...|(500,[149,185,212...|
| DOUGHNUT LIP GLOSS |[doughnut, lip, g...|(500,[462,463,492...|
|12 MESSAGE CARDS ...|[12, message, car...|(500,[35,41,166],...|
|BLUE HARMONICA IN...|[blue, harmonica,...|(500,[10,16,36,35...|
|   GUMBALL COAT RACK|[gumball, coat, r...|(500,[228,280,408...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [56]:
type(dfcv)

pyspark.sql.dataframe.DataFrame

In [57]:
dfcv.select('Description').show(1)

+------------------+
|       Description|
+------------------+
|RABBIT NIGHT LIGHT|
+------------------+
only showing top 1 row



In [59]:
from pyspark.sql.functions import col
col = dfcv.select('countVec').take(1)
print(type(col))
print(col[0])
print((col[0][0]).size) # size of sparse vector
print((col[0][0]).indices) # dict indices
print((col[0][0]).values)
print(col[0][0][212])

<class 'list'>
Row(countVec=SparseVector(500, {149: 1.0, 185: 1.0, 212: 1.0}))
500
[149 185 212]
[1. 1. 1.]
1.0


In [60]:
tokenized.show(5)

+--------------------+--------------------+
|         Description|             DescOut|
+--------------------+--------------------+
|  RABBIT NIGHT LIGHT|[rabbit, night, l...|
| DOUGHNUT LIP GLOSS |[doughnut, lip, g...|
|12 MESSAGE CARDS ...|[12, message, car...|
|BLUE HARMONICA IN...|[blue, harmonica,...|
|   GUMBALL COAT RACK|[gumball, coat, r...|
+--------------------+--------------------+
only showing top 5 rows



In [61]:
tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)

+---------------------------------------+
|DescOut                                |
+---------------------------------------+
|[gingham, heart, , doorstop, red]      |
|[red, floral, feltcraft, shoulder, bag]|
|[alarm, clock, bakelike, red]          |
|[pin, cushion, babushka, red]          |
|[red, retrospot, mini, cases]          |
|[red, kitchen, scales]                 |
|[gingham, heart, , doorstop, red]      |
|[large, red, babushka, notebook]       |
|[red, retrospot, oven, glove]          |
|[red, retrospot, plate]                |
+---------------------------------------+



In [62]:
from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)

In [63]:
idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)

+---------------------------------------+--------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------+
|DescOut                                |TFOut                                                   |IDFOut                                                                                                              |
+---------------------------------------+--------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------+
|[gingham, heart, , doorstop, red]      |(10000,[3372,4291,4370,6594,9160],[1.0,1.0,1.0,1.0,1.0])|(10000,[3372,4291,4370,6594,9160],[1.2992829841302609,0.0,1.2992829841302609,1.2992829841302609,1.2992829841302609])|
|[red, floral, feltcraft, shoulder, bag]|(10000,[155,1152,4291,5981,6756],[1.0,1.0,1.0,1.0,1.0]) |(10000,[155,1152,4291,5981,6756],[0.0,

In [64]:
from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text",
  outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.012779767811298371,-0.09340975657105446,-0.10830843970179559]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.07612769335641392,0.03451743721961975,-0.04290600613291774]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.06759414225816728,0.045298346877098085,0.05302179120481015]



# Feature Manipulation
## PCA

In [67]:
scaleDF.show()

+---+--------------+
| id|      features|
+---+--------------+
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  0|[1.0,0.1,-1.0]|
|  1| [2.0,1.1,1.0]|
|  1|[3.0,10.1,3.0]|
+---+--------------+



In [66]:
from pyspark.ml.feature import PCA
pca = PCA().setInputCol("features").setK(2)
pca.fit(scaleDF).transform(scaleDF).show(20, False)

+---+--------------+------------------------------------------+
|id |features      |PCA_4c96952c0cfacdb535a4__output          |
+---+--------------+------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|0  |[1.0,0.1,-1.0]|[0.07137194992484153,-0.45266548881478463]|
|1  |[2.0,1.1,1.0] |[-1.6804946984073725,1.2593401322219144]  |
|1  |[3.0,10.1,3.0]|[-10.872398139848944,0.030962697060149758]|
+---+--------------+------------------------------------------+



In [72]:
fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("fittedPCA")

In [73]:
from pyspark.ml.feature import PCAModel
loadedPCA = PCAModel.load("fittedPCA")
loadedPCA.transform(scaleDF).show()

+---+--------------+--------------------------------+
| id|      features|PCA_4c96952c0cfacdb535a4__output|
+---+--------------+--------------------------------+
|  0|[1.0,0.1,-1.0]|            [0.07137194992484...|
|  1| [2.0,1.1,1.0]|            [-1.6804946984073...|
|  0|[1.0,0.1,-1.0]|            [0.07137194992484...|
|  1| [2.0,1.1,1.0]|            [-1.6804946984073...|
|  1|[3.0,10.1,3.0]|            [-10.872398139848...|
+---+--------------+--------------------------------+



## Classification

In [74]:
from pyspark.sql import SparkSession 
spark = SparkSession\
        .builder\
        .appName("Python")\
        .getOrCreate()

In [76]:
bInput = spark.read.format("parquet").load("data/binary-classification")\
  .selectExpr("features", "cast(label as double) as label")
bInput.show()

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,1.1,1.0]|  1.0|
| [2.0,1.1,1.0]|  1.0|
+--------------+-----+



In [77]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
#print (lr.explainParams() ) # see all parameters
lrModel = lr.fit(bInput)

In [78]:
print (lrModel.coefficients)
print (lrModel.intercept)

[6.848741326854998,0.3535658901019746,14.814900276915903]
-10.225695864481022


In [79]:
summary = lrModel.summary
print (summary.areaUnderROC)
print(summary.roc.show())

1.0
+---+------------------+
|FPR|               TPR|
+---+------------------+
|0.0|               0.0|
|0.0|0.3333333333333333|
|0.0|               1.0|
|1.0|               1.0|
|1.0|               1.0|
+---+------------------+

None


In [80]:
summary.pr.show()

+------------------+---------+
|            recall|precision|
+------------------+---------+
|               0.0|      1.0|
|0.3333333333333333|      1.0|
|               1.0|      1.0|
|               1.0|      0.6|
+------------------+---------+



In [81]:
summary.objectiveHistory

[0.6730116670092565,
 0.5042829330409727,
 0.36356862066874396,
 0.1252407018038337,
 0.08532556611276214,
 0.03550487641573045,
 0.01819649450857126,
 0.008817369922959133,
 0.004413673785392145,
 0.002194038351234711,
 0.0010965641148080857,
 0.000547657551985314,
 0.00027376237951490083,
 0.00013684652236574735,
 6.841809037070565e-05,
 3.420707791038474e-05,
 1.710317666423187e-05,
 8.551470106426846e-06,
 4.275703677941403e-06,
 2.1378240117781205e-06,
 1.0688564054651793e-06,
 5.342600202575221e-07,
 2.6681351058971897e-07,
 1.32046278653146e-07,
 6.768401481683304e-08,
 3.3145477184846346e-08,
 1.615143883749056e-08,
 8.309350118268702e-09]

In [82]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
#print (dt.explainParams())
dtModel = dt.fit(bInput)

In [83]:
type(dtModel)

pyspark.ml.classification.DecisionTreeClassificationModel

In [None]:
# 

In [84]:
from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
#print (rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

In [86]:
from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
#print (gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)

In [87]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
#print (nb.explainParams() )
trainedModel = nb.fit(bInput.where("label != 0"))

## Evaluate Models

In [89]:
bInput.show()

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,1.1,1.0]|  1.0|
| [2.0,1.1,1.0]|  1.0|
+--------------+-----+



In [92]:
# Evaluator
model = dtModel # decision tree model
from pyspark.mllib.evaluation import BinaryClassificationMetrics
out = model.transform(bInput)\
  .select("prediction", "label")\
  .rdd.map(lambda x: (float(x[0]), float(x[1])))
out.collect()

[(1.0, 1.0), (0.0, 0.0), (0.0, 0.0), (1.0, 1.0), (1.0, 1.0)]

In [93]:
metrics = BinaryClassificationMetrics(out)

In [95]:
print (metrics.areaUnderPR)
print (metrics.areaUnderROC)

1.0
1.0


## Regression

In [97]:
df = spark.read.load("data/regression")
df.show()

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  2.0|
| [2.0,1.1,1.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,4.1,1.0]|  2.0|
+--------------+-----+



In [98]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
#print (lr.explainParams())
lrModel = lr.fit(df)

In [99]:
summary = lrModel.summary
summary.residuals.show()
print (summary.totalIterations)
print (summary.objectiveHistory)
print (summary.rootMeanSquaredError)
print (summary.r2)

+--------------------+
|           residuals|
+--------------------+
|  0.1280504658561019|
|-0.14468269261572098|
| -0.4190383262242056|
| -0.4190383262242056|
|  0.8547088792080308|
+--------------------+

6
[0.5000000000000001, 0.4315295810362787, 0.3132335933881021, 0.312256926665541, 0.3091506081983029, 0.3091505893348027]
0.47308424392175985
0.720239122691221


In [100]:
lrModel.predictionCol

Param(parent='LinearRegression_4bab927054396d9080fd', name='predictionCol', doc='prediction column name')

In [102]:
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression()\
  .setFamily("gaussian")\
  .setLink("identity")\
  .setMaxIter(10)\
  .setRegParam(0.3)\
  .setLinkPredictionCol("linkOut")
#print (glr.explainParams())
glrModel = glr.fit(df)

In [103]:
from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
#print (dtr.explainParams())
dtrModel = dtr.fit(df)

In [104]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf =  RandomForestRegressor()
#print rf.explainParams()
rfModel = rf.fit(df)
gbt = GBTRegressor()
#print gbt.explainParams()
gbtModel = gbt.fit(df)

In [105]:
df.show()

+--------------+-----+
|      features|label|
+--------------+-----+
|[3.0,10.1,3.0]|  2.0|
| [2.0,1.1,1.0]|  1.0|
|[1.0,0.1,-1.0]|  0.0|
|[1.0,0.1,-1.0]|  0.0|
| [2.0,4.1,1.0]|  2.0|
+--------------+-----+



In [109]:
df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)



In [112]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])
params = ParamGridBuilder().addGrid(glr.regParam, [0.0, 0.5, 1.0]).build()
evaluator = RegressionEvaluator()\
  .setMetricName("rmse")\
  .setPredictionCol("prediction")\
  .setLabelCol("label")
cv = CrossValidator()\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)\
  .setEstimatorParamMaps(params)\
  .setNumFolds(2) # should always be 3 or more but this dataset is small
model = cv.fit(df)

In [113]:
from pyspark.mllib.evaluation import RegressionMetrics
model = model
out = model.transform(df)\
  .select("prediction", "label").rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(out)
print ( "MSE: " + str(metrics.meanSquaredError))
print ( "RMSE: " + str(metrics.rootMeanSquaredError))
print ( "R-squared: " + str(metrics.r2))
print ( "MAE: " + str(metrics.meanAbsoluteError) )
print ( "Explained variance: " + str(metrics.explainedVariance) )

MSE: 0.15705521472392636
RMSE: 0.39630192369445594
R-squared: 0.803680981595092
MAE: 0.31411042944785267
Explained variance: 0.6429447852760728


## Recommendation Systems

In [115]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
ratings = spark.read.text("data/sample_movielens_ratings.txt")\
  .rdd.toDF()\
  .selectExpr("split(value , '::') as col")\
  .selectExpr(
    "cast(col[0] as int) as userId",
    "cast(col[1] as int) as movieId",
    "cast(col[2] as float) as rating",
    "cast(col[3] as long) as timestamp")
training, test = ratings.randomSplit([0.8, 0.2])
als = ALS()\
  .setMaxIter(5)\
  .setRegParam(0.01)\
  .setUserCol("userId")\
  .setItemCol("movieId")\
  .setRatingCol("rating")
#print (als.explainParams())
alsModel = als.fit(training)
predictions = alsModel.transform(test)

In [116]:
ratings.take(5)

[Row(userId=0, movieId=2, rating=3.0, timestamp=1424380312),
 Row(userId=0, movieId=3, rating=1.0, timestamp=1424380312),
 Row(userId=0, movieId=5, rating=2.0, timestamp=1424380312),
 Row(userId=0, movieId=9, rating=4.0, timestamp=1424380312),
 Row(userId=0, movieId=11, rating=1.0, timestamp=1424380312)]

In [117]:
type(predictions)
predictions.take(5)

[Row(userId=27, movieId=31, rating=1.0, timestamp=1424380312, prediction=1.0664252042770386),
 Row(userId=5, movieId=31, rating=1.0, timestamp=1424380312, prediction=-0.39794427156448364),
 Row(userId=29, movieId=31, rating=1.0, timestamp=1424380312, prediction=1.0999280214309692),
 Row(userId=5, movieId=85, rating=1.0, timestamp=1424380312, prediction=0.4406394064426422),
 Row(userId=8, movieId=85, rating=5.0, timestamp=1424380312, prediction=2.702888250350952)]

In [118]:
alsModel.recommendForAllUsers(10)\
  .selectExpr("userId", "explode(recommendations)").show()
alsModel.recommendForAllItems(10)\
  .selectExpr("movieId", "explode(recommendations)").show()

+------+---------------+
|userId|            col|
+------+---------------+
|    28|[92, 5.0727963]|
|    28|[81, 4.8877063]|
|    28|[12, 4.8776913]|
|    28|[89, 4.0290647]|
|    28| [2, 3.9999743]|
|    28| [49, 3.962494]|
|    28|[53, 3.9392726]|
|    28|[82, 3.7512054]|
|    28|[96, 3.4206526]|
|    28|[46, 3.3429942]|
|    26|[19, 7.0987396]|
|    26|[47, 5.6084614]|
|    26|[34, 5.5124083]|
|    26|[39, 5.2580194]|
|    26| [24, 5.198512]|
|    26|[12, 5.0430365]|
|    26|  [9, 4.975503]|
|    26|  [7, 4.956293]|
|    26| [23, 4.932685]|
|    26|   [88, 4.8356]|
+------+---------------+
only showing top 20 rows

+-------+---------------+
|movieId|            col|
+-------+---------------+
|     31|[21, 4.1652923]|
|     31| [12, 3.802142]|
|     31| [14, 3.046466]|
|     31| [9, 2.8910606]|
|     31|[10, 2.8666377]|
|     31|  [8, 2.765773]|
|     31| [7, 2.7095041]|
|     31|[15, 2.6663263]|
|     31| [3, 2.4565802]|
|     31|[25, 2.3014078]|
|     85|[16, 5.1440945]|
|     85| 

In [119]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator()\
  .setMetricName("r2")\
  .setLabelCol("rating")\
  .setPredictionCol("prediction")
r2 = evaluator.evaluate(predictions)
print("R2 = %f" % r2)

R2 = -0.860416


In [126]:
from pyspark.mllib.evaluation import RegressionMetrics
regComparison = predictions.select("rating", "prediction")\
  .rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(regComparison)
print("RMSE = %s" % metrics.rootMeanSquaredError)
# R-squared
print("R-squared = %s" % metrics.r2)

RMSE = 1.6025909846682076
R-squared = -0.30107443747006335


In [127]:
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
from pyspark.sql.functions import col, expr
perUserActual = predictions\
  .where("rating > 2.5")\
  .groupBy("userId")\
  .agg(expr("collect_set(movieId) as movies"))

In [128]:
perUserPredictions = predictions\
  .orderBy(col("userId"), expr("prediction DESC"))\
  .groupBy("userId")\
  .agg(expr("collect_list(movieId) as movies"))

In [129]:
perUserActualvPred = perUserActual.join(perUserPredictions, ["userId"]).rdd\
  .map(lambda row: (row[1], row[2][:15]))
ranks = RankingMetrics(perUserActualvPred)

In [130]:
ranks.meanAveragePrecision

0.22926870138408603

In [131]:
ranks.precisionAt(5)

0.476923076923077

## Unsupervised Learning

In [132]:
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler()\
  .setInputCols(["Quantity", "UnitPrice"])\
  .setOutputCol("features")

In [133]:
sales = va.transform(spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("../data/retail-data/by-day/*.csv")
  .limit(50)
  .coalesce(1)
  .where("Description IS NOT NULL"))

sales.cache().show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|   features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|[48.0,1.79]|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[20.0,1.25]|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|[24.0,1.65]|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[24.0,1.25]|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom| [6.0,2.55]|
|   580538|    21544|SKULLS  WATER TRA..

In [134]:
from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
#print (km.explainParams())
kmModel = km.fit(sales)

In [135]:
summary = kmModel.summary
print (summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

[10, 8, 29, 2, 1]
Cluster Centers: 
[23.2    0.956]
[ 2.5     11.24375]
[7.55172414 2.77172414]
[48.    1.32]
[36.    0.85]


In [136]:
from pyspark.ml.clustering import BisectingKMeans
bkm = BisectingKMeans().setK(5).setMaxIter(5)
bkmModel = bkm.fit(sales)

In [137]:
summary = bkmModel.summary
print (summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

[16, 8, 13, 10, 3]
Cluster Centers: 
[23.2    0.956]
[ 2.5     11.24375]
[7.55172414 2.77172414]
[48.    1.32]
[36.    0.85]


In [139]:
from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
#print (gmm.explainParams())
model = gmm.fit(sales)

In [140]:
summary = model.summary
print (model.weights)
print(model.gaussiansDF.show())

[0.16503937777770641, 0.35496420094056985, 0.06003637101912308, 0.1999636297743671, 0.21999642048823354]
+--------------------+--------------------+
|                mean|                 cov|
+--------------------+--------------------+
|[2.54180583818530...|0.785769315153778...|
|[5.07243095740621...|2.059950971034034...|
|[43.9877864408847...|32.22707068867282...|
|[23.1998836372414...|2.560279258630084...|
|[11.6364190345020...|1.322132750446848...|
+--------------------+--------------------+

None


In [141]:
summary.cluster.show()

+----------+
|prediction|
+----------+
|         2|
|         3|
|         3|
|         3|
|         1|
|         2|
|         4|
|         3|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         4|
|         1|
|         2|
|         3|
|         1|
|         1|
+----------+
only showing top 20 rows



In [142]:
summary.clusterSizes

[8, 18, 3, 10, 11]

In [143]:
summary.probability.show()

+--------------------+
|         probability|
+--------------------+
|[1.37632400885157...|
|[4.89041912245635...|
|[1.67299627008735...|
|[7.43321003719004...|
|[1.46369160111044...|
|[1.37558526306392...|
|[1.60356443149152...|
|[1.88134064420900...|
|[0.00607298994290...|
|[0.00607298994290...|
|[0.01533941819191...|
|[0.03609689525825...|
|[2.32052388159997...|
|[2.32052388159997...|
|[4.95219998798691...|
|[3.92383092236182...|
|[1.36205103659923...|
|[7.43321003719004...|
|[0.00607298994290...|
|[0.00607298994290...|
+--------------------+
only showing top 20 rows



## LDA - Topic Modeling topic modelling on text documents.

In [144]:
sales.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|   features|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+-----------+
|   580538|    23084|  RABBIT NIGHT LIGHT|      48|2011-12-05 08:38:00|     1.79|   14075.0|United Kingdom|[48.0,1.79]|
|   580538|    23077| DOUGHNUT LIP GLOSS |      20|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[20.0,1.25]|
|   580538|    22906|12 MESSAGE CARDS ...|      24|2011-12-05 08:38:00|     1.65|   14075.0|United Kingdom|[24.0,1.65]|
|   580538|    21914|BLUE HARMONICA IN...|      24|2011-12-05 08:38:00|     1.25|   14075.0|United Kingdom|[24.0,1.25]|
|   580538|    22467|   GUMBALL COAT RACK|       6|2011-12-05 08:38:00|     2.55|   14075.0|United Kingdom| [6.0,2.55]|
+---------+---------+-------------------

In [145]:
from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)

In [146]:
from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
#print (lda.explainParams())
model = lda.fit(prepped)

In [147]:
model.describeTopics(5).show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[137, 90, 49, 11,...|[0.00891024825887...|
|    1|[56, 29, 98, 16, 14]|[0.00916648435544...|
|    2|[15, 131, 45, 129...|[0.00897001776729...|
|    3|[6, 125, 78, 65, 27]|[0.00902238740730...|
|    4|[103, 55, 62, 108...|[0.00933169757372...|
|    5|[11, 23, 13, 5, 100]|[0.01462723703112...|
|    6|  [2, 7, 16, 94, 79]|[0.01779008733823...|
|    7|[30, 28, 73, 90, 10]|[0.01214763272764...|
|    8|   [0, 3, 1, 14, 35]|[0.01490962515356...|
|    9|[120, 94, 78, 131...|[0.00910675353890...|
+-----+--------------------+--------------------+



In [148]:
cvFitted.vocabulary[:10]

['water',
 'hot',
 'vintage',
 'bottle',
 'paperweight',
 '6',
 'home',
 'doormat',
 'landmark',
 'bicycle']