In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import pyspark.ml as M
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql.window as W

In [2]:
import psutil
NUM_WORKER = psutil.cpu_count(logical = False)

In [3]:
conf_spark = SparkConf().set("spark.driver.host", "127.0.0.1")
sc = SparkContext(conf = conf_spark)
sc.setLogLevel("ERROR")
spark = SparkSession(sc)
spark.conf.set("spark.sql.shuffle.partitions", NUM_WORKER)
spark

In [4]:
ratings = spark.read.csv("./ml-latest/ratings.csv",header=True)
ratings.head(3)

[Row(userId='1', movieId='307', rating='3.5', timestamp='1256677221'),
 Row(userId='1', movieId='481', rating='3.5', timestamp='1256677456'),
 Row(userId='1', movieId='1091', rating='1.5', timestamp='1256677471')]

In [5]:
# mock some random predictions
with_pred = ratings.withColumn("pred",F.rand()*5)
with_pred.show(5)

+------+-------+------+----------+------------------+
|userId|movieId|rating| timestamp|              pred|
+------+-------+------+----------+------------------+
|     1|    307|   3.5|1256677221| 4.822186159036702|
|     1|    481|   3.5|1256677456|2.7970743309415975|
|     1|   1091|   1.5|1256677471|2.3067363534948515|
|     1|   1257|   4.5|1256677460|1.6325673188131002|
|     1|   1449|   4.5|1256677264|1.5790215341001845|
+------+-------+------+----------+------------------+
only showing top 5 rows



In [6]:
def Rmse(with_pred_df, rating_col_name = "rating", pred_col_name = "pred"):
    return with_pred_df.select(F.sqrt(F.sum((F.col(rating_col_name) - F.col(pred_col_name))**2)/F.count(rating_col_name))).collect()[0][0]

In [7]:
Rmse(with_pred)

2.0694806918559077

In [21]:
def Acc(with_pred_df, rating_col_name = "rating", pred_col_name = "pred"):
    TP = ((F.col(rating_col_name) >= 3.5) & (F.col(pred_col_name) >= 3.5))
    TN = ((F.col(rating_col_name) < 3.5) & (F.col(pred_col_name) < 3.5))
    correct = with_pred_df.filter(TP | TN)
    return correct.count() / with_pred_df.count()

In [22]:
Acc(with_pred)

0.4514466024468891

In [34]:
def Coverage_k(with_pred_df, id_col_name, rating_col_name = "rating", pred_col_name = "pred", k=2):
    TP = ((F.col(rating_col_name) >= 3.5) & (F.col(pred_col_name) >= 3.5))
    num_covered = with_pred_df.select(id_col_name, rating_col_name, pred_col_name).filter(TP).groupBy(id_col_name).count()
    num_covered_bigger_than_k = num_covered.filter(f"count >= {k}")
    return num_covered_bigger_than_k.count() / num_covered.count()

In [41]:
Coverage_k(with_pred, id_col_name="movieId")

0.7237929946355317

In [36]:
Coverage_k(with_pred, id_col_name="userId")

0.8885445922212164

In [47]:
Coverage_k(with_pred, id_col_name="movieId", k = 5)

0.49643420637425056