In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
import psutil
NUM_WORKER = psutil.cpu_count(logical = False)

def Spark():
    """[summary]
    Returns:
        [type]: [description]
    """    
    conf_spark = SparkConf().set("spark.driver.host", "127.0.0.1")
    sc = SparkContext(conf = conf_spark)
    spark = SparkSession(sc)
    spark.conf.set("spark.sql.shuffle.partitions", NUM_WORKER)
    print('Spark UI address {}'.format(spark.sparkContext.uiWebUrl))
    return spark

sess = Spark()

Spark UI address http://127.0.0.1:4040


In [2]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [3]:
# read in ratings and movies
ratings_fp = "./ml-latest/ratings.csv"
movies_fp = "./ml-latest/movies.csv"
ratings = sess.read.csv(ratings_fp,header=True)
movies = sess.read.csv(movies_fp,header=True)

In [4]:
# 1. randomly sample the ratings  (frac=0.001 would get approx. 2w+ unique user and 5,000+ unique movies)
frac = 0.001 # fraction of data to sample
frac_incr = 5 # if not enough data, frac multiply this multiplier to resample
random_seed = 0 
sample = ratings.sample(fraction=frac, seed = random_seed)
min_user = 20000 # minimum required number of users
# 2. keep users by user's num of rating >= 5
cnt_user = 0
while cnt_user < min_user:
    frac *= frac_incr
    sample = ratings.sample(fraction=frac, seed = random_seed)
    id_with_cnt = sample.groupBy("userId").agg(F.count("movieId").alias("cnt"))
    filtered = id_with_cnt.filter(id_with_cnt.cnt >=5)
    cnt_user = filtered.select(F.count("userId")).collect()[0][0]
    print(f'there are {cnt_user} users rated at least 5 movies with frac={frac}')
sample = sample.join(filtered,sample.userId == filtered.userId).drop(filtered.userId).drop(filtered.cnt)
# 3. check movies by genre
movies = movies.withColumn("genre_lst",F.split("genres","\|").alias("genre_lst")) # add a new col of genres.split("|")
with_genre = movies.join(sample,movies.movieId == sample.movieId).drop(sample.movieId)
movie_genre = with_genre.select("movieId",F.explode("genre_lst").alias("genre"))
gp_movie_genre = movie_genre.groupBy("genre").agg(F.countDistinct("movieId").alias("cnt"))

there are 4778 users rated at least 5 movies with frac=0.005
there are 41603 users rated at least 5 movies with frac=0.025


In [5]:
print(f"After sampling there are {sample.select(F.countDistinct(sample.userId)).collect()[0][0]} unique userId")
print(f"After sampling there are {sample.select(F.countDistinct(sample.movieId)).collect()[0][0]} unique movieId")
print("# of movies in each genre: ")
display(gp_movie_genre.toPandas())

After sampling there are 41603 unique userId
After sampling there are 18641 unique movieId
# of movies in each genre: 


Unnamed: 0,genre,cnt
0,Children,1042
1,Crime,2073
2,Fantasy,1161
3,Horror,2006
4,Sci-Fi,1511
5,Adventure,1806
6,Drama,8653
7,IMAX,175
8,Musical,594
9,Western,368
