## KMeans Clustering

### 建立KMeans 模型

In [None]:
%pyspark
# Read file:/tmp/iris.csv as an RDD of text lines and preview the first three.
raw_data = sc.textFile('file:/tmp/iris.csv')
raw_data.take(3)

# Treat the first line as the header and keep only the lines that differ from it.
header = raw_data.first()
skip_data = raw_data.filter(lambda row: row != header)
skip_data.take(3)

import numpy 
def _to_feature_vector(line):
    """Return the first four comma-separated fields of `line` as a float array."""
    fields = line.split(',')[:4]
    return numpy.array([float(field) for field in fields])


# One numeric vector per data row; preview the first three.
parsedData = skip_data.map(_to_feature_vector)
parsedData.take(3)

from pyspark.mllib.clustering import KMeans
# Fit a 4-cluster KMeans model on the parsed vectors with random initial centers.
# `runs` is intentionally not passed: it has had no effect since Spark 2.0 and
# newer PySpark releases no longer accept it (passing it raises TypeError).
clusters = KMeans.train(parsedData, 4, maxIterations=10,
                        initializationMode="random")

# Predict the cluster index of a single vector (the first parsed row).
iris1 = parsedData.first()
iris1
clusters.predict(iris1)

# Predict the cluster index of every vector and bring the results to the driver.
prediction = clusters.predict(parsedData)
prediction.collect()

### 計算 Within Cluster Sum of Squared Errors

In [None]:
%pyspark

from math import sqrt
def error(point):
    """Return the Euclidean distance from `point` to its assigned cluster center.

    The center is looked up in the module-level `clusters` model using the
    cluster index that `clusters.predict` returns for `point`.

    NOTE(review): the result is the square root of the summed squared
    differences, so adding these values up yields a total distance rather
    than a sum of squared errors -- confirm this is the intended metric.
    """
    assigned = clusters.predict(point)
    offset = point - clusters.centers[assigned]
    return sqrt(sum(component ** 2 for component in offset))


In [None]:
%pyspark
# Peek at a few parsed vectors before aggregating.
parsedData.take(3)
# Add up error(point) over every vector in parsedData.
point_errors = parsedData.map(error)
WSSSE = point_errors.reduce(lambda total, err: total + err)
WSSSE

## 客戶分群

In [None]:
# Imported here so this cell does not depend on an earlier cell having run.
import numpy

# Read file:/tmp/customers.csv as an RDD of text lines and preview it.
raw_data = sc.textFile('file:/tmp/customers.csv')
raw_data.take(3)

# Treat the first line as the header and keep only the lines that differ from it.
header = raw_data.first()
skip_data = raw_data.filter(lambda line: line != header)

# Use every comma-separated field from index 3 onward as a feature, parsed to
# float explicitly (as the iris cell does) instead of handing string arrays to
# KMeans and relying on an implicit conversion.
parsed_data = skip_data.map(
    lambda e: numpy.array([float(ele) for ele in e.split(',')[3:]]))

from pyspark.mllib.clustering import KMeans

# Fit a 5-cluster model with random initial centers.
# `runs` is intentionally not passed: it has had no effect since Spark 2.0 and
# newer PySpark releases no longer accept it (passing it raises TypeError).
clusters = KMeans.train(parsed_data, 5, maxIterations=10,
                        initializationMode="random")

# Cluster index for every customer row, collected to the driver.
predictions = clusters.predict(parsed_data).collect()
predictions

## ALS 推薦

### 載入資料

In [None]:
%pyspark
# Read /tmp/u.data as an RDD of text lines and show the first line.
rawData = sc.textFile("/tmp/u.data")
rawData.first()

# Split every line on whitespace into a list of string fields.
rawRatings = rawData.map(lambda line: line.split())
rawRatings.take(3)

### 資料轉換

In [None]:
%pyspark
from pyspark.sql import Row
def _to_rating_row(fields):
    """Build a Row from one split line: userId, movieId, rating, timestamp."""
    return Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )


# Load the split fields into the ratingsRDD object as typed Row records.
ratingsRDD = rawRatings.map(_to_rating_row)

ratingsRDD.take(3)

In [None]:
%pyspark
# Build a DataFrame from the Row RDD, then split it randomly with 0.8 / 0.2 weights.
ratings = spark.createDataFrame(ratingsRDD)
split_weights = [0.8, 0.2]
training, test = ratings.randomSplit(split_weights)


### 產生模型

In [None]:
%pyspark
from pyspark.ml.recommendation import ALS
# Estimator settings gathered in one place, then passed as keyword arguments.
als_params = dict(
    rank=50,
    maxIter=10,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
)
als = ALS(**als_params)

# Fit on the training split and display the resulting model object.
model = als.fit(training)
model

### 檢視模型

In [None]:
%pyspark
# Optional exploration, left disabled: list the model's attributes and peek at
# one row of the item factors.
#dir(model)
#model.itemFactors.take(1)
# Count the rows in the fitted model's item factors and user factors.
model.itemFactors.count()
model.userFactors.count()

### 產生 Top-K 推薦

In [None]:
%pyspark
# Number of recommendations requested per user and per item.
top_k = 10

# Recommendations for all users; show the 'recommendations' column of one row.
userRecs = model.recommendForAllUsers(top_k)
userRecs.select('recommendations').take(1)

# Recommendations for all items; show the 'recommendations' column of one row.
movieRecs = model.recommendForAllItems(top_k)
movieRecs.select('recommendations').take(1)
