In [None]:
from spark_evaluation import SparkRankingEvaluation

In [None]:
import pyspark
from pysarplus import SARPlus
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql import functions
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import *

# Memory budget applied to both the executor and the driver settings below.
MAX_MEMORY = '15G'

# Spark configuration consumed later when the session is built.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    # NOTE(review): the two numeric values below carry no unit suffix, so Spark
    # applies each key's own default unit — confirm these are the intended durations.
    .set('spark.executor.heartbeatInterval', 10000)
    .set('spark.network.timeout', 10000)
    .set("spark.core.connection.ack.wait.timeout", "3600")
    .set("spark.executor.memory", MAX_MEMORY)
    .set("spark.driver.memory", MAX_MEMORY)
    # Needed for the legacy date pattern used when parsing TransactionTime.
    .set("spark.sql.legacy.timeParserPolicy", "LEGACY")
)
        
def init_spark():
    """Return the notebook's SparkSession, creating it if none exists yet.

    The session is named "transaction_data" and is configured from the
    module-level ``conf`` object.
    """
    builder = SparkSession.builder.appName("transaction_data").config(conf=conf)
    return builder.getOrCreate()

# Session handle shared by every cell below (data loading, SAR+ model, evaluation).
spark = init_spark()


In [None]:
# Load the transactions and reshape them into the
# user / item / rating / timestamp layout used by the model and evaluation cells.
# NOTE(review): the CSV is read without a schema, so columns other than
# `timestamp` keep whatever type the reader assigns — confirm `rating` is usable as a number.
raw_df = spark.read.csv('./clean_data.csv', header=True)
clean_df = (
    raw_df
    .select('UserId', 'ItemCode', 'NumberOfItemsPurchased', 'TransactionTime')
    # Parse the textual transaction time and store it as a long (epoch seconds).
    .withColumn('timestamp', to_timestamp(col('TransactionTime'), 'E MMM dd HH:mm:ss zzz yyyy').cast('long'))
    .withColumnRenamed("UserId", "userID")
    .withColumnRenamed("ItemCode", "itemID")
    .withColumnRenamed("NumberOfItemsPurchased", "rating")
    .drop("TransactionTime")
)

# Chronologically ordered view, kept under the original name for any cell that uses it.
df = clean_df.sort("timestamp")

split_ratio = 0.7

# Chronological split.
#
# The previous `df.limit(n)` + `df.subtract(train_df)` approach had two problems:
#   * `subtract` is EXCEPT DISTINCT: it de-duplicates the test set and removes every
#     test row that happens to equal a train row, silently losing transactions.
#   * `sort("timestamp")` does not define an order among equal timestamps and the
#     frame was re-evaluated for each action, so the rows picked by `limit` were
#     not guaranteed to be the same ones that were subtracted.
#
# Instead, give every row a deterministic position (timestamp first, remaining
# columns as tie-breakers), cache that once, and cut on the position.
# The un-partitioned window pulls all rows into a single partition; that is the
# price of an exact global ordering and is acceptable for a single-file dataset.
chronological = Window.orderBy('timestamp', 'userID', 'itemID', 'rating')
indexed_df = clean_df.withColumn('_row_idx', functions.row_number().over(chronological)).cache()

total_rows = indexed_df.count()
split_index = int(total_rows * split_ratio)

# row_number() starts at 1, so the first `split_index` rows form the training set.
train_df = indexed_df.filter(col('_row_idx') <= split_index).drop('_row_idx')
test_df = indexed_df.filter(col('_row_idx') > split_index).drop('_row_idx')

train_total = train_df.count()
test_total = test_df.count()
# Guard against the row loss/duplication the old split could produce.
assert train_total + test_total == total_rows, "train/test split lost or duplicated rows"

print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=train_total,
    train_users=train_df.select('userID').distinct().count(),
    train_items=train_df.select('itemID').distinct().count(),
    test_total=test_total,
    test_users=test_df.select('userID').distinct().count(),
    test_items=test_df.select('itemID').distinct().count(),
))

In [None]:
# Build the SAR+ recommender on the shared Spark session and train it on the
# training split. Column names match the renamed columns of the prepared frame.
model = SARPlus(
    spark,
    similarity_type="jaccard",
    cache_path="cache",
    col_user="userID", col_item="itemID",
    col_rating="rating", col_timestamp="timestamp",
)
model.fit(train_df)

In [None]:
# Number of recommendations requested per user; also used in the metric labels.
k = 10

# Ask the fitted model for the top-k items, passing the test split as input.
# NOTE(review): remove_seen=False presumably keeps items the user already
# interacted with eligible for recommendation — confirm against the pysarplus docs.
pred_df = model.recommend_k_items(
    test_df,
    top_k=k,
    remove_seen=False,
)

In [None]:
# Compare the held-out interactions with the model's recommendations.
# The model's 'score' column is renamed to 'prediction' before it is handed over.
# NOTE(review): `k` is not passed to the evaluator, so the "@k" labels printed
# later assume the evaluator's own default cutoff equals `k` — verify.
evaluation = SparkRankingEvaluation(
    rating_true=test_df,
    rating_pred=pred_df.withColumnRenamed('score', 'prediction'),
)

In [None]:
# Report the ranking metrics one per line, each computed just before it is printed.
for label, metric in (
    (f"map@{k}\t\t", evaluation.map_at_k),
    (f"ndcg@{k}\t\t", evaluation.ndcg_at_k),
    (f"precision@{k}\t", evaluation.precision_at_k),
    (f"recall@{k}\t", evaluation.recall_at_k),
):
    print(label, metric())