In [1]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, countDistinct
spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option","some-value").getOrCreate()
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option","some-value").getOrCreate()
df = spark.read.format("csv").option("header", "false").load('/Users/Joyce/Desktop/Project AMZ/ratings_Electronics.csv')
df = df.select(col("_c0").alias("userId"), col("_c1").alias("itemId"), col("_c2").alias("rating"), col("_c3").alias("timestamp"))

In [2]:
spark

In [3]:
def calculate_sparsity(itemlimit, userlimit, df):
    product = df.groupBy("itemId").count()
    product_filter = product.filter(product['count'] > itemlimit)
    Data = df.join(product_filter, ['itemId'], 'leftsemi')
    
    user = Data.groupBy("userId").count()
    user_filter = user.filter(user['count'] > userlimit)
    DF = Data.join(user_filter, ['userId'], 'leftsemi')
    
    available = DF.count()
    product_total = DF.select("itemId").distinct().count()
    user_total = DF.select("userId").distinct().count()
    
    print("available rating: " + str(available))
    print("distinct product: " + str(product_total))
    print("distinct user: " + str(user_total))
    
    result = (float(available)/(float(product_total) * float(user_total)))*100
    print(result)
    
    return DF

In [4]:
df=calculate_sparsity(2, 2, df)

available rating: 3374805
distinct product: 208371
distinct user: 615996
0.002629259887833436


In [5]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql import SQLContext
stringIndexer = StringIndexer(inputCol="itemId", outputCol="ProductIndex")
user = stringIndexer.fit(df)
indexed = user.transform(df)

In [6]:
indexed_distinct=indexed.select("userId").distinct()
from pyspark.sql.types import *
User_id = sqlContext.createDataFrame(indexed_distinct.rdd.map(lambda x: x[0]).zipWithIndex(), StructType([StructField("userId", StringType(), True),StructField("User_ID", IntegerType(), True)]))
join=indexed.join(User_id,indexed.userId == User_id.userId)
from pyspark.sql.functions import udf, col, regexp_replace
def inte(f):
    return int(f)
inte_udf = udf(inte)
rating_data = join.withColumn('Product_ID', inte_udf(col("ProductIndex")))

from pyspark.sql.types import FloatType, IntegerType
rating_data = rating_data.withColumn("Product_ID", rating_data["Product_ID"].cast(IntegerType()))
rating_data = rating_data.withColumn("rating", rating_data["rating"].cast(FloatType()))
rating_cleaned = rating_data.select('User_ID', 'Product_ID', 'rating')
rating_final=rating_cleaned.rdd

In [7]:
train_rdd, valid_rdd, test_rdd = rating_final.randomSplit(weights=[6, 2, 2], seed=0)
train_df = train_rdd.toDF().cache()
valid_df = valid_rdd.toDF().cache()
test_df = test_rdd.toDF().cache()

In [8]:
train_rdd.count()

2023607

In [9]:
items = train_rdd.map(lambda x: x[1]).distinct()
#items.take(10)
maxIndex = items.count()

In [10]:
maxIndex

186959

In [11]:
train_df.show()

+-------+----------+------+
|User_ID|Product_ID|rating|
+-------+----------+------+
|      0|     13093|   4.0|
|      0|     10620|   5.0|
|      1|     38247|   5.0|
|      2|     14873|   5.0|
|      3|     20443|   5.0|
|      3|     17759|   5.0|
|      3|       264|   5.0|
|      5|     26388|   5.0|
|      5|        30|   5.0|
|      6|     29881|   5.0|
|      6|     30026|   5.0|
|      6|      4215|   3.0|
|      6|    102304|   5.0|
|      6|     48013|   5.0|
|      6|      6807|   4.0|
|      6|    164028|   4.0|
|      6|    107023|   4.0|
|      6|      8387|   4.0|
|      6|      4411|   5.0|
|      7|     37380|   3.0|
+-------+----------+------+
only showing top 20 rows



In [12]:
train_rdd = train_rdd.map(lambda x: (x.User_ID, [(x.Product_ID, x['rating'])]))
train_rdd.take(6)

[(0, [(13093, 4.0)]),
 (0, [(10620, 5.0)]),
 (1, [(38247, 5.0)]),
 (2, [(14873, 5.0)]),
 (3, [(20443, 5.0)]),
 (3, [(17759, 5.0)])]

In [13]:
train_rdd = train_rdd.reduceByKey(lambda a, b: a + b)
train_rdd.take(6)

[(0, [(13093, 4.0), (10620, 5.0)]),
 (512000, [(641, 4.0), (10429, 1.0)]),
 (8200, [(44888, 4.0)]),
 (16400, [(2887, 5.0)]),
 (378200,
  [(126412, 5.0), (32209, 4.0), (131053, 3.0), (151806, 3.0), (6769, 3.0)]),
 (434200, [(289, 5.0)])]

In [14]:
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

train_new = train_rdd.map(lambda x: (x[0], Vectors.sparse(maxIndex*2, x[1])))

train_new.take(5)

[(0, SparseVector(373918, {10620: 5.0, 13093: 4.0})),
 (512000, SparseVector(373918, {641: 4.0, 10429: 1.0})),
 (8200, SparseVector(373918, {44888: 4.0})),
 (16400, SparseVector(373918, {2887: 5.0})),
 (378200,
  SparseVector(373918, {6769: 3.0, 32209: 4.0, 126412: 5.0, 131053: 3.0, 151806: 3.0}))]

In [15]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col


dataA = train_new
dfA = spark.createDataFrame(dataA, ["id", "features"])

dataB = train_new
dfB = spark.createDataFrame(dataB, ["id", "features"])

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)

model = mh.fit(dfA)

print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

print("Approximately joining dfA and dfB on distance smaller than 0.9:")
model.approxSimilarityJoin(dfA, dfA, 0.9, distCol="JaccardDistance")\
    .select(col("datasetA.id").alias("idA"),
            col("datasetB.id").alias("idB"),
            col("JaccardDistance")).show()


The hashed dataset where hashed values are stored in the column 'hashes':
+------+--------------------+--------------------+
|    id|            features|              hashes|
+------+--------------------+--------------------+
|     0|(373918,[10620,13...|[[-1.267356126E9]...|
|512000|(373918,[641,1042...|[[-3.43236794E8],...|
|  8200|(373918,[44888],[...|[[1.155334799E9],...|
| 16400|(373918,[2887],[5...|[[-1.343943744E9]...|
|378200|(373918,[6769,322...|[[-1.328428554E9]...|
|434200|(373918,[289],[5.0])|[[-8.78646106E8],...|
|175400|(373918,[50642],[...|[[-1.293627691E9]...|
|237600|(373918,[9370,128...|[[-1.877861557E9]...|
|449200|(373918,[335,669]...|[[-2.004435784E9]...|
|264600|(373918,[1823],[5...|[[-3.2679196E8], ...|
| 41000|(373918,[1305,746...|[[5.66116488E8], ...|
| 93400|(373918,[2948],[5...|[[-4.09248541E8],...|
| 83200|(373918,[1060,120...|[[-1.143997308E9]...|
|254000|(373918,[1540,203...|[[-1.96104244E8],...|
|417800|(373918,[3430,656...|[[-6.27828543E8],...|
|261600|

In [16]:
key = Vectors.sparse(512000, [641, 10429], [4.0, 1.0])
#key = Vectors.sparse(0, [10620, 13093], [5.0, 4.0])
model_df = model.approxNearestNeighbors(dfA, key, 50)

In [17]:
model_df.show()

+------+--------------------+--------------------+------------------+
|    id|            features|              hashes|           distCol|
+------+--------------------+--------------------+------------------+
|512000|(373918,[641,1042...|[[-3.43236794E8],...|               0.0|
|310373|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|290078|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|201958|(373918,[641],[4.0])|[[-3.43236794E8],...|               0.5|
| 33607|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|248038|(373918,[641],[3.0])|[[-3.43236794E8],...|               0.5|
|558228|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|100215|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|479884|(373918,[641],[4.0])|[[-3.43236794E8],...|               0.5|
|551804|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|396375|(373918,[641],[5.0])|[[-3.43236794E8],...|               0.5|
|591221|(373918,[641

In [18]:
import numpy as np
import scipy.sparse as sps
from pyspark.ml.linalg import Vectors
train_predict = train_rdd.map(lambda x: (x[0],Vectors.sparse(5000000, x[1])))#(Vectors.sparse(x[0], x[1])))
train_predict_id = train_rdd.map(lambda x: ((x[0])))
train_predict.take(10)

[(0, SparseVector(5000000, {10620: 5.0, 13093: 4.0})),
 (512000, SparseVector(5000000, {641: 4.0, 10429: 1.0})),
 (8200, SparseVector(5000000, {44888: 4.0})),
 (16400, SparseVector(5000000, {2887: 5.0})),
 (378200,
  SparseVector(5000000, {6769: 3.0, 32209: 4.0, 126412: 5.0, 131053: 3.0, 151806: 3.0})),
 (434200, SparseVector(5000000, {289: 5.0})),
 (175400, SparseVector(5000000, {50642: 5.0})),
 (237600, SparseVector(5000000, {9370: 3.0, 12828: 5.0, 41621: 5.0})),
 (449200, SparseVector(5000000, {335: 5.0, 669: 5.0})),
 (264600, SparseVector(5000000, {1823: 5.0}))]

In [99]:
def recommend(recommended):
    recommended_list = []
    self = recommended[0][0].__str__().replace("[",'').replace(']','').replace(")","").split(",")[1:]
    self = self[:int(len(self)/2)]
    for i in recommended:
        lst = i[0].__str__().replace("[",'').replace(']','').replace(")","").split(",")[1:]
        recommended_list.extend(lst[:int(len(lst)/2)])
        recommended_list = list(set(recommended_list))
    recommendation = list(set(recommended_list)-set(self))    
    return(recommendation)

In [None]:
train_to_predict = train_predict.take(5)
for i in range(len(train_to_predict)):
    model_predicted = model.approxNearestNeighbors(dfA,train_to_predict[i][1], 5)
    recommended = model_predicted.select('features').take(model_predicted.count())
    print((train_to_predict[i][0]), recommend(recommended))
    

0 ['32236', '188233']
512000 []
8200 ['87062', '33566', '11953']
