In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext,SparkConf
spark = SparkSession.builder.\
            master("local").\
            appName("amzone").\
            enableHiveSupport().\
            getOrCreate()
sc = spark.sparkContext
# inputPath="data/sample.json"
# rawdata.select("reviewerID","asin","reviewerName","overall","reviewTime").show()
inputPath = "data/reviews_Books_5.json"
rawdata = spark.read.json(inputPath)
ratingDf = rawdata.select(rawdata.reviewerID.alias("user"),\
               rawdata.asin.alias("item"),\
               rawdata.overall.alias("rating")\
               )
ratingDf.registerTempTable("Ratings")
sqlString="with t1 as (select count(distinct(user)) as users from Ratings),\
           t2 as (select count(distinct(item)) as items from Ratings), \
           t3 as (select count(distinct(rating)) as ratings from Ratings) \
            select * from t1 cross join t2 cross join t3"
# rawdata.select("reviewerID").distinct().count()
resultDf = spark.sql(sqlString)
resultDf.show()
trainingData,cvData,testData = ratingDf.randomSplit([0.6,0.2,0.2])
# resultDf.show()
from pyspark.ml.pipeline import Pipeline,PipelineModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *
userIndexer = StringIndexer(inputCol="user",outputCol="userid",handleInvalid="error").fit(ratingDf)
itemIndexer = StringIndexer(inputCol="item",outputCol="itemid").fit(ratingDf)
alsModel = ALS(rank=5,userCol="userid",itemCol="itemid",ratingCol="rating",maxIter=5,regParam=0.0001)
pipeAls = Pipeline(stages=[userIndexer,itemIndexer,alsModel])
pipeAlsModel = pipeAls.fit(trainingData)
predictionAls = pipeAlsModel.transform(trainingData)
predictionAls.sort(asc("userid")).limit(10).show()
prediction = pipeAlsModel.transform(testData)
prediction.sort(asc("userid")).limit(10).show()
evaluator = RegressionEvaluator()\
      .setMetricName("rmse")\
      .setLabelCol("rating")\
      .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictionRecommendation.select('rating','prediction').na.drop())
print "RMSE:",rmse
sc.stop()
spark.stop()

AnalysisException: u'Path does not exist: hdfs://master1:9002/user/yanbin/data/reviews_Books_5.json;'

In [58]:
%matplotlib inline
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = "yan bin"
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.pipeline import Pipeline,PipelineModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *  
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit,CrossValidator
from pandas import *
import matplotlib as mpl
import time
from numpy import arange,logspace
import sys
mpl.use("Agg")
import matplotlib.pyplot as plt

def myInt(x):
    try:
        return int(x)
    except:
        return np.nan
    
class spark:
    __spark = 0
    __sc = 0


    def __init__(self, master="local", appName="sparkDemo"):
        conf = (SparkConf().setAppName(appName)
                .setMaster(master))
        self.__spark = (SparkSession.builder
                .config(conf=conf)
                .enableHiveSupport()
                .getOrCreate())
        self.__sc = self.__spark.sparkContext


    def getContext(self):
        return (self.__spark, self.__sc)


    def getJson(self, inputPath):
        spark, sc = self.getContext()
        rawData = spark.read.json(inputPath)
        ratingDf = (rawData.select(rawData.reviewerID.alias("user"),
                                   rawData.asin.alias("item"),
                                   rawData.overall.alias("rating")))
        return ratingDf
    
    def getText(self, inputPath):
        spark, sc = self.getContext()
        rawData = sc.textFile(inputPath)
        rawData = (rawData.map(lambda x:x.split("::"))
                            .map(lambda x:(myInt(x[0]),myInt(x[1]),myInt(x[2]),myInt(x[3]))))
        ratingDf =spark.createDataFrame(rawData,("user","item","rating","timestamp"))
        return ratingDf.na.drop() 
                 

    def drawPic(self,df,picSavePath):
        df.registerTempTable('df')
        sqlString ='''with t1 as (select itemid,count(itemid) as itemcnt from df group by itemid),
            t2 as (select userid,count(userid) as usercnt from df group by userid),
            t3 as (select userid,df.itemid,itemcnt from df,t1 
            where t1.itemid=df.itemid sort by userid desc),
            t4 as (select userid,sum(itemcnt)/count(itemcnt) as avgitem from t3 group by userid) 
            select t2.usercnt,t4.avgitem from t4 left join t2 on t2.userid=t4.userid'''
        pDf = spark.sql(sqlString).toPandas()
        pDf.plot(x='usercnt',y='avgitem',kind='scatter')
        plt.axis([0,5000,0,5000])
        plt.savefig(picSavePath)
        print "pic saved in %s" %(picSavePath)
        
    def drawPic2(self,df,picSavePath):
        df.registerTempTable('df')
        sqlString ="with t1 as (select itemid,count(itemid) as itemcnt from df group by itemid) select itemcnt from t1 sort by itemcnt desc"
        pDf = spark.sql(sqlString).toPandas()
        pDf.plot(y='avgitem',kind='scatter')
        plt.axis([0,5000,0,5000])
        plt.savefig(picSavePath)
        print "pic saved in %s" %(picSavePath)
    
    def indexer(self,df):
        userIndexer = StringIndexer(inputCol="user",outputCol="userid",handleInvalid="error").fit(df)
        itemIndexer = StringIndexer(inputCol="item",outputCol="itemid").fit(df)
        pipeAls = Pipeline(stages=[userIndexer,itemIndexer])
        pipeAlsModel = pipeAls.fit(df)
        return pipeAlsModel.transform(df)
    
    def onceAls(self,reg,inputDf,outputDf,rank=5,maxIter=5):
        als = ALS(rank=rank,userCol="userid",itemCol="itemid",ratingCol="rating",maxIter=maxIter,regParam=reg)
        alsModel = als.fit(inputDf)
        predictions =alsModel.transform(outputDf)
        evaluator = (RegressionEvaluator().setMetricName("rmse")
                    .setLabelCol("rating").setPredictionCol("prediction"))
        predictions = predictions.select('userid','itemid','rating','prediction').na.drop()
        rmse = evaluator.evaluate(predictions)
        return alsModel,rmse
    
    def regOpt(self,df,regParam=[0.0001],ranks=[5],maxIters=[5]):
        trainingData,cvData,testData = ratingDf.randomSplit(seed=40,weights=[0.6,0.2,0.2])
        trainingData.cache()
        rsme = sys.maxint
        best = [0,0,0]
        t=[]
        for i in range(len(regParam)):
            for j in range(len(maxIters)):
                for n in range(len(ranks)):
                    start = time.time()
                    tmprsme = self.onceAls(regParam[i],trainingData,cvData,maxIter=maxIters[j],rank=ranks[n])[1]
                    if tmprsme < rsme:
                        rsme = tmprsme
                        best = [i,j,n]
                    end = time.time()
                    t.append(end -start)
                    print "The reg:%f,the rsmes:%f,the maxIter:%d,the ranks:%d, time: %f s" \
                    %(regParam[i],rsme,ranks[n],maxIters[j],t[i])
        bestReg = regParam[best[0]]
        bestRank = ranks[best[1]]
        bestMaxIter = maxIters[best[2]]
        print "The best regParam is %f,the cv rsmes is %f,avg time:%f s" %(bestReg,rsme,__builtin__.sum(t)/len(t))
        print "The best Rank is %d,the best maxIter is %d" %(bestRank,bestMaxIter)
        alsModel,rmse = self.onceAls(bestReg,trainingData,testData)
        alsModel.transform(testData).limit(10).show()
        print "The als model rmse is %f" %rmse
        return alsModel
    
    def distince(self,df):
        ratingDf.registerTempTable("Ratings")
        sqlString="""with t1 as (select count(distinct(user)) as users from Ratings),
           t2 as (select count(distinct(item)) as items from Ratings), 
           t3 as (select count(distinct(rating)) as ratings from Ratings) 
           select * from t1 cross join t2 cross join t3"""
        self.__spark.sql(sqlString).show()
        

    def trainValidation(self,df):
        train,test = df.randomSplit([8.0,2.0],40)
        userIndexer = StringIndexer(inputCol="user",outputCol="userid",handleInvalid="error").fit(df)
        itemIndexer = StringIndexer(inputCol="item",outputCol="itemid").fit(df)
        als = ALS(rank=5,userCol="userid",
                       itemCol="itemid",ratingCol="rating",maxIter=5,regParam=0.0001)
        pipeAls = Pipeline(stages=[userIndexer,itemIndexer,als])
        evaluator = RegressionEvaluator()\
              .setMetricName("rmse")\
              .setLabelCol("rating")\
              .setPredictionCol("prediction")

        paramGrid = ParamGridBuilder() \
            .addGrid(als.maxIter,[5]) \
            .addGrid(als.regParam,logspace(0,0,1,base=3)*0.0001)\
            .build()
    
        tvs = TrainValidationSplit(estimator=pipeAls,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=evaluator,
                                   # 80% of the data will be used for training, 20% for validation.
                                   trainRatio=0.8)
        model = tvs.fit(train)
        model.transform(test).show()
        return model
    
    
    def crossValidation(self,df):
        train,test = df.randomSplit([8.0,2.0])
        userIndexer = StringIndexer(inputCol="user",outputCol="userid",handleInvalid="error").fit(df)
        itemIndexer = StringIndexer(inputCol="item",outputCol="itemid").fit(df)
        als = ALS(rank=5,userCol="userid",
                       itemCol="itemid",ratingCol="rating",maxIter=5,regParam=0.0001)
        pipeAls = Pipeline(stages=[userIndexer,itemIndexer,als])
        evaluator = (RegressionEvaluator()
              .setMetricName("rmse")
              .setLabelCol("rating")
              .setPredictionCol("prediction"))
        paramGrid = (ParamGridBuilder() 
            .addGrid(als.maxIter,[5]) 
            .addGrid(als.regParam,logspace(0,4,5,base=3)*0.0001)
            .build())
        crossval = (CrossValidator(estimator=pipeAls,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2))  # use 3+ folds in practice

        cvModel = crossval.fit(train)
        cvModel.transform(test).show()
        return cvModel
        
        
        
if __name__ == "__main__":
    start = time.time()
    s = spark(appName="test")
    spark, sc = s.getContext()
    sc.setLogLevel('WARN')
    #   inputPath = "data/sample.json"
#     inputPath = "data/reviews_Books_5.json"
#     ratingDf = s.getJson(inputPath)
    inputPath = "data/ml-10M/ratings.dat"
#     inputPath = "data/ml-100k/u.data"
    ratingDf = s.getText(inputPath)
    model = s.trainValidation(ratingDf)
#     model = s.crossValidation(ratingDf)

#   s.drawPic(ratingDf)
#     regem = logspace(0,0,1,base=3)*0.0001
#     regem = arange(0.5,0.7,0.01)
#     ranks =[5,6,7,8,9,10]
#     maxIters=[5,8,10]
#     alsModel = s.regOpt(ratingDf,regem,ranks,maxIters)
    end = time.time()
    print "total:%f s" %(end-start)

    


+-----+----+------+----------+-------+------+----------+
| user|item|rating| timestamp| userid|itemid|prediction|
+-----+----+------+----------+-------+------+----------+
|65765| 368|     4| 867232385| 7880.0| 148.0| 3.6743104|
|31087| 368|     3| 992398730|10206.0| 148.0| 3.7230816|
|30601| 368|     4| 833483364|19204.0| 148.0|  2.986613|
|44672| 368|     5| 834232092|23015.0| 148.0|  4.562707|
|16466| 368|     1| 962925993|29719.0| 148.0| 2.4018764|
| 1835| 368|     5| 839110710|32414.0| 148.0| 3.9861476|
|14792| 368|     2| 840033082|35820.0| 148.0|  3.710929|
|26084| 368|     3| 845950586|36538.0| 148.0| 2.9671047|
| 9778| 368|     4|1054375063|57984.0| 148.0| 3.7690463|
|62611| 368|     3|1111328098|69042.0| 148.0| 3.0541353|
|27972| 368|     4| 953789985| 4161.0| 148.0| 3.7326388|
|23892| 368|     3| 944251866| 6393.0| 148.0| 3.4003396|
| 3679| 368|     4| 833024510| 6623.0| 148.0|  2.626106|
|24083| 368|     1| 851452982| 7417.0| 148.0|  2.907879|
|19517| 368|     3|1114057617|1

In [64]:
# ratingDf = s.getText(inputPath)
trainingDf,testDf=ratingDf.randomSplit([8.0,2.0],40)
trainPre = model.transform(trainingDf)
preDf = model.transform(testDf)
evaluator = (RegressionEvaluator().setMetricName("rmse")
                    .setLabelCol("rating").setPredictionCol("prediction"))
rmse1 = evaluator.evaluate(trainPre.select('userid','itemid','rating','prediction').na.drop())
rmse2 = evaluator.evaluate(preDf.select('userid','itemid','rating','prediction').na.drop())
print "训练集rmse:",rmse1
print "测试集rmse:",rmse2

训练集rmse: 0.782101560676
测试集rmse: 0.951588950666


In [70]:
ratingDf.registerTempTable('df')
preDf.registerTempTable('predf')
avgDf=spark.sql("select user,avg(rating) as avgrating from df group by user")
avgDf.registerTempTable('avgDf')
preDf2=spark.sql("select predf.user,predf.item,rating,case isNaN(predf.prediction) when True then avgDf.avgrating \
            else predf.prediction end as prediction from predf left join avgDf on predf.user=avgDf.user")
print evaluator.evaluate(preDf2)

0.951587813021


In [71]:
preDf.filter(isnan("prediction")).count()

40

In [None]:
pre

In [122]:
estimater=model.getEstimator()
alsModel=model.bestModel.stages[2]
print alsModel.extractParamMap()
stage = estimater.getStages()[2]
# stage.getParam("regParam")
print stage.extractParamMap()
# alsModel = model.bestModel.stages[2]
# alsModel.getMaxIter()

{}
{Param(parent=u'ALS_46639a276e1cee18354c', name='userCol', doc='column name for user ids. Ids must be within the integer value range.'): 'userid', Param(parent=u'ALS_46639a276e1cee18354c', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.'): 10, Param(parent=u'ALS_46639a276e1cee18354c', name='implicitPrefs', doc='whether to use implicit preference'): False, Param(parent=u'ALS_46639a276e1cee18354c', name='seed', doc='random seed.'): 593367982098446717, Param(parent=u'ALS_46639a276e1cee18354c', name='rank', doc='rank of the factorization'): 5, Param(parent=u'ALS_46639a276e1cee18354c', name='maxIter', doc='max number of iterations (>= 0).'): 5, Param(parent=u'ALS_46639a276e1cee18354c', name='regParam', doc='regularization parameter (>= 0).'): 0.0001, Param(parent=u'ALS_46639a276e1cee18354c', name='ratingCol', doc='column name for ratings'): 'rating', Param(parent=u'ALS_46639

+------+------+------+
|userid|itemid|rating|
+------+------+------+
|   2.0|   0.0|   5.0|
|  41.0|   0.0|   5.0|
|  46.0|   0.0|   5.0|
|   5.0|   0.0|   5.0|
|  34.0|   0.0|   5.0|
|  28.0|   0.0|   5.0|
|  74.0|   0.0|   5.0|
|  24.0|   0.0|   5.0|
|  14.0|   0.0|   5.0|
|  92.0|   0.0|   5.0|
|  80.0|   0.0|   5.0|
|  49.0|   0.0|   5.0|
|  67.0|   0.0|   5.0|
|  17.0|   0.0|   5.0|
|  10.0|   0.0|   5.0|
|   1.0|   0.0|   5.0|
|  12.0|   0.0|   5.0|
|  21.0|   0.0|   5.0|
|  68.0|   0.0|   5.0|
|  27.0|   0.0|   5.0|
+------+------+------+
only showing top 20 rows



In [3]:
# resultDf.show()
from pyspark.ml.pipeline import Pipeline,PipelineModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.sql.functions import *
inputPath="data/sample.json"
# rawdata.select("reviewerID","asin","reviewerName","overall","reviewTime").show()
inputPath = "data/reviews_Books_5.json"
rawdata = spark.read.json(inputPath)
ratingDf = rawdata.select(rawdata.reviewerID.alias("user"),\
               rawdata.asin.alias("item"),\
               rawdata.overall.alias("rating"))
ratingDf = ratingDf.select(ratingDf.user,ratingDf.item,ratingDf.rating.cast(FloatType()))
ratingDf.registerTempTable("Ratings")
sqlString="with t1 as (select count(distinct(user)) as users from Ratings),\
           t2 as (select count(distinct(item)) as items from Ratings), \
           t3 as (select count(distinct(rating)) as ratings from Ratings) \
            select * from t1 cross join t2 cross join t3"
# rawdata.select("reviewerID").distinct().count()
resultDf = spark.sql(sqlString)

resultDf.show()
predictionAls.sort(asc("userid")).limit(10).show()
predictionRecommendation.sort(asc("userid")).limit(10).show()
rmse = evaluator.evaluate(predictionRecommendation.select('rating','prediction').na.drop())
print "RMSE:",rmse

+-----+-----+-------+
|users|items|ratings|
+-----+-----+-------+
|  100|    1|      5|
+-----+-----+-------+

+--------------+----------+------+------+------+----------+
|          user|      item|rating|userid|itemid|prediction|
+--------------+----------+------+------+------+----------+
| A1FQPOYRBTTK1|000100039X|   5.0|   0.0|   0.0|  4.999896|
|A1NPNGWBVD9AK3|000100039X|   5.0|   3.0|   0.0|  4.999896|
|A1MOSTXNIO5MPJ|000100039X|   5.0|   5.0|   0.0|  4.999896|
| AA6C78DHRK962|000100039X|   5.0|   8.0|   0.0|  4.999896|
| AMRZ5G7HF7I03|000100039X|   5.0|   9.0|   0.0|  4.999896|
|A2X6GEC6LCDN4S|000100039X|   5.0|  11.0|   0.0|  4.999896|
|A1TT4CY55WLHAR|000100039X|   5.0|  12.0|   0.0|  4.999896|
|A3H65DAAV98C8F|000100039X|   5.0|  13.0|   0.0|  4.999896|
| AAFLZI7MX9UIG|000100039X|   5.0|  15.0|   0.0|  4.999896|
|A2ZB1G1KUE6OS6|000100039X|   2.0|  19.0|   0.0| 1.9999582|
+--------------+----------+------+------+------+----------+

+--------------------+----------+------+------+-

Py4JJavaError: An error occurred while calling o179.evaluate.
: org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1072)
	at org.apache.spark.sql.catalyst.expressions.EqualNullSafe.eval(predicates.scala:470)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$.org$apache$spark$sql$catalyst$optimizer$EliminateOuterJoin$$canFilterOutNull(joins.scala:116)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$$anonfun$7.apply(joins.scala:125)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$$anonfun$7.apply(joins.scala:125)
	at scala.collection.LinearSeqOptimized$class.exists(LinearSeqOptimized.scala:93)
	at scala.collection.immutable.List.exists(List.scala:84)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$.org$apache$spark$sql$catalyst$optimizer$EliminateOuterJoin$$buildNewJoinType(joins.scala:125)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$$anonfun$apply$2.applyOrElse(joins.scala:140)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$$anonfun$apply$2.applyOrElse(joins.scala:138)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:288)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:287)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5.apply(TreeNode.scala:331)
	at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:188)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformChildren(TreeNode.scala:329)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:293)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:277)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$.apply(joins.scala:138)
	at org.apache.spark.sql.catalyst.optimizer.EliminateOuterJoin$.apply(joins.scala:105)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:85)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:82)
	at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
	at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:82)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:74)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:73)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:75)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:84)
	at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:84)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:87)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:87)
	at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:2547)
	at org.apache.spark.sql.Dataset.rdd(Dataset.scala:2544)
	at org.apache.spark.ml.evaluation.RegressionEvaluator.evaluate(RegressionEvaluator.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Unseen label: null.
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:170)
	at org.apache.spark.ml.feature.StringIndexerModel$$anonfun$4.apply(StringIndexer.scala:166)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:89)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF$$anonfun$2.apply(ScalaUDF.scala:88)
	at org.apache.spark.sql.catalyst.expressions.ScalaUDF.eval(ScalaUDF.scala:1069)
	... 59 more


In [5]:
trainingData,cvData,testData = ratingDf.randomSplit([0.6,0.2,0.2])
userIndexer = StringIndexer(inputCol="user",outputCol="userid",handleInvalid="error").fit(ratingDf)
itemIndexer = StringIndexer(inputCol="item",outputCol="itemid").fit(ratingDf)
alsModel = ALS(rank=5,userCol="userid",itemCol="itemid",ratingCol="rating",maxIter=5,regParam=0.0001)
pipeAls = Pipeline(stages=[userIndexer,itemIndexer,alsModel])
pipeAlsModel = pipeAls.fit(trainingData)
predictionAls = pipeAlsModel.transform(trainingData)
predictionRecommendation= pipeAlsModel.transform(testData)
evaluator = RegressionEvaluator()\
      .setMetricName("rmse")\
      .setLabelCol("rating")\
      .setPredictionCol("prediction")

IllegalArgumentException: u'requirement failed: Nothing has been added to this summarizer.'

In [6]:
predictionRecommendation.select('itemid','userid','rating','prediction').na.drop().show()

+------+------+------+----------+
|itemid|userid|rating|prediction|
+------+------+------+----------+
+------+------+------+----------+



In [10]:
ratingDf.show()

+--------------------+----------+------+
|              userid|    itemid|rating|
+--------------------+----------+------+
|A10000012B7CGYKOM...|000100039X|   5.0|
|      A2S166WSCFIFP5|000100039X|   5.0|
|      A1BM81XB4QHOA3|000100039X|   5.0|
|      A1MOSTXNIO5MPJ|000100039X|   5.0|
|      A2XQ5LZHTD4AFT|000100039X|   5.0|
|      A3V1MKC2BVWY48|000100039X|   5.0|
|      A12387207U8U24|000100039X|   5.0|
|      A29TRDMK51GKZR|000100039X|   5.0|
|      A3FI0744PG1WYG|000100039X|   5.0|
|      A2LBBQHYLEHM7P|000100039X|   5.0|
|      A1340OFLZBW5NG|000100039X|   5.0|
|      A2KU9IU07LOJS1|000100039X|   5.0|
|      A2WVHIRDMLM82E|000100039X|   5.0|
|      A2I35JB67U20C0|000100039X|   5.0|
|      A19N3FCQCLJYUA|000100039X|   5.0|
|      A3FFNE1DR5SI1W|000100039X|   5.0|
|      A1TT4CY55WLHAR|000100039X|   5.0|
|      A2X4HE21JTAL98|000100039X|   5.0|
|       ARDQ9KNB8K22N|000100039X|   5.0|
|       A27ZH1AQORJ1L|000100039X|   5.0|
+--------------------+----------+------+
only showing top

In [38]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Prepare training documents, which are labeled.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0)
], ["id", "text", "label"])

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    print(row)

Row(id=4, text=u'spark i j k', probability=DenseVector([0.2661, 0.7339]), prediction=1.0)
Row(id=5, text=u'l m n', probability=DenseVector([0.9209, 0.0791]), prediction=0.0)
Row(id=6, text=u'mapreduce spark', probability=DenseVector([0.4429, 0.5571]), prediction=1.0)
Row(id=7, text=u'apache hadoop', probability=DenseVector([0.8584, 0.1416]), prediction=0.0)


In [39]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Prepare training and test data.
data = spark.read.format("libsvm")\
    .load("file:///home/yanbin/hadoop-2.6/spark/data/mllib/sample_linear_regression_data.txt")
train, test = data.randomSplit([0.9, 0.1], seed=12345)

lr = LinearRegression(maxIter=10)

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(train)

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
model.transform(test)\
    .select("features", "label", "prediction")\
    .show()

+--------------------+--------------------+--------------------+
|            features|               label|          prediction|
+--------------------+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|  -23.51088409032297| -1.6659388625179559|
|(10,[0,1,2,3,4,5,...| -21.432387764165806|  0.3400877302576284|
|(10,[0,1,2,3,4,5,...| -12.977848725392104|-0.02335359093652395|
|(10,[0,1,2,3,4,5,...| -11.827072996392571|  2.5642684021108417|
|(10,[0,1,2,3,4,5,...| -10.945919657782932| -0.1631314487734783|
|(10,[0,1,2,3,4,5,...|  -10.58331129986813|   2.517790654691453|
|(10,[0,1,2,3,4,5,...| -10.288657252388708| -0.9443474180536754|
|(10,[0,1,2,3,4,5,...|  -8.822357870425154|  0.6872889429113783|
|(10,[0,1,2,3,4,5,...|  -8.772667465932606|  -1.485408580416465|
|(10,[0,1,2,3,4,5,...|  -8.605713514762092|   1.110272909026478|
|(10,[0,1,2,3,4,5,...|  -6.544633229269576|  3.0454559778611285|
|(10,[0,1,2,3,4,5,...|  -5.055293333055445|  0.6441174575094268|
|(10,[0,1,2,3,4,5,...|  -

In [52]:
inputPath="data/ratings_Electronics_sample.csv"
rawdata = spark.read.csv(inputPath)
df = rawdata.select(rawdata._c0.alias("user"),\
                       rawdata._c1.alias("item"),\
                       rawdata._c2.astype("float").alias("rating"),\
                       rawdata._c3.alias("timestamp"))
trainingData,cvData,testData = df.randomSplit([0.6,0.2,0.2])

In [54]:
from pyspark.ml.pipeline import Pipeline,PipelineModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import *



+--------------+----------+------+----------+------+------+----------+
|          user|      item|rating| timestamp|userid|itemid|prediction|
+--------------+----------+------+----------+------+------+----------+
|A2SRW2W1GXYKQZ|0594481813|   5.0|1403827200|   1.0|   1.0|  4.999883|
|A2OC1UK7VMBLOT|0594481813|   4.0|1398643200|   2.0|   1.0| 3.9999063|
| A6DI4IE8MKFYR|0972683275|   5.0|1363219200|   3.0|   0.0|  4.999068|
|A106YUCY4SVX1D|0972683275|   5.0|1310083200|   4.0|   0.0|  4.999068|
|A3OYE7X2O08LNT|0972683275|   4.0|1360195200|   5.0|   0.0| 3.9992542|
|A2ULCET06LOPBB|0972683275|   5.0|1369785600|   6.0|   0.0|  4.999068|
|A3M122DYN9L5N8|0594451647|   1.0|1388707200|   8.0|   4.0| 0.9999989|
|A3IIGCFLKVFW8M|0972683275|   5.0|1393459200|   9.0|   0.0|  4.999068|
|A39Z4OU2C7ENWH|0972683275|   3.0|1328572800|  10.0|   0.0| 2.9994407|
|A3T3XKC7H5ACI1|0594033934|   5.0|1401235200|  12.0|  16.0|   4.99998|
|A1RPEK98P97J7W|0972683275|   5.0|1359504000|  14.0|   0.0|  4.999068|
|A3S0R

In [49]:
from pyspark.sql.functions import *
prediction.select("prediction").distinct().count()

1