**1. Setup**

In [1]:
import os

# Đặt biến môi trường cho Spark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/lib/spark3.5/"

In [2]:
import sys
print(sys.executable)

/usr/bin/python3


In [3]:
os.environ["SPARK_HOME"] 

'/usr/lib/spark3.5/'

In [1]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions
from pyspark import SparkConf
from pyspark.context import SparkContext

spark = SparkSession.builder \
    .appName("Project") \
    .master("spark://10.10.28.60:7077") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.memory", "2g") \
    .config("spark.local.dir", "/tmp/spark-temp") \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
    # .config("spark.driver.memory", "2g") \
    # .config("spark.executor.memory", "2g") \
    
print(spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/27 09:16:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/27 09:16:40 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


3.5.1




In [2]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF
from operator import add
from pyspark.ml import Pipeline, PipelineModel


from pyspark.sql.functions import *

from pyspark.sql.types import *

import pandas as pd

**2. Data Preparation**




In [3]:
movies = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/movies.parquet')
ratings = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/ratings.parquet')
tags = spark.read.parquet('hdfs://master5:9000/user/dis/movielens/tags.parquet')

                                                                                

In [4]:
df_movies = movies
df_ratings = ratings
df_tags = tags
df_movies.createOrReplaceTempView("movies")
df_ratings.createOrReplaceTempView("ratings")
df_tags.createOrReplaceTempView("tags")

In [5]:
tags_text = spark.sql("SELECT movieId, CONCAT(REPLACE(TRIM(tag), ' ', ''),' ') as tag FROM tags")

#Group by tag by movieId and sort
tags_text_rdd = tags_text.rdd
tags_by_movie_rdd = tags_text_rdd.map(tuple).reduceByKey(add)
tags_by_movie_df = spark.createDataFrame(tags_by_movie_rdd).orderBy("_1", ascending = True)

tags_by_movie_df = tags_by_movie_df  \
                            .withColumnRenamed('_1', 'movie_id') \
                            .withColumnRenamed('_2', 'tag')


                                                                                

In [6]:
model_path = 'hdfs://master5:9000/user/dis/output-3'
output_path = 'hdfs://master5:9000/user/dis/output-11'

In [13]:
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'tag', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
word2Vec = Word2Vec(vectorSize = 50, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

pipeline_mdl = pipeline.fit(tags_by_movie_df)

#save the pipeline model
pipeline_mdl.write().overwrite().save(model_path + 'pipe_txt')

24/05/24 02:08:39 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
24/05/24 02:08:55 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/24 02:09:31 WARN TaskSetManager: Stage 48 contains a task of very large size (3468 KiB). The maximum recommended task size is 1000 KiB.
24/05/24 02:09:32 WARN TaskSetManager: Stage 52 contains a task of very large size (2229 KiB). The maximum recommended task size is 1000 KiB.
24/05/24 02:09:34 WARN TaskSetManager: Stage 56 contains a task of very large size (9191 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [7]:
pipeline_mdl = PipelineModel.load(model_path + 'pipe_txt')
tags_by_movie_trf_df = pipeline_mdl.transform(tags_by_movie_df)

                                                                                

In [8]:
import numpy as np

def CosineSim(vec1, vec2):
    
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0

    return dot_product / (norm_vec1 * norm_vec2)

In [9]:
all_movieId_vecs = tags_by_movie_trf_df.select('movie_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
# all_movieId_vecs[0]

                                                                                

In [13]:
all_movieId_vecs[0]

(1,
 DenseVector([0.1728, 0.1982, 0.0864, 0.215, -0.1336, 0.0547, -0.0585, 0.0741, 0.1941, 0.0093, 0.3442, 0.1297, -0.1513, -0.1411, -0.1685, -0.0186, 0.153, -0.1117, 0.0932, -0.0377, 0.2163, 0.3923, -0.0634, -0.2124, -0.2984, 0.0937, -0.3134, -0.1444, -0.0995, 0.0354, 0.1659, 0.1489, 0.0175, -0.1534, -0.0635, 0.0625, -0.1313, 0.1057, 0.0681, -0.0485, -0.1255, 0.0664, -0.095, -0.1981, -0.0304, -0.0293, 0.0883, -0.0898, 0.1826, 0.1435]))

In [10]:
import time
def getSimilarMovies(m_id, sim_mos_limit=5):
    schema = StructType([
                            StructField("movie_id", IntegerType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_movie_id", StringType(), True)
                        ])

    similar_movies_df = spark.createDataFrame([], schema)
    print(m_id)
    m_id = int(m_id)
    input_vec = tags_by_movie_trf_df.select('word_vec')\
                .filter(tags_by_movie_trf_df['movie_id'] == m_id)\
                .collect()[0][0]
    start_time = time.time()
    similar_movie_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_movieId_vecs)
    print ('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))
    similar_movie_df = spark.createDataFrame(similar_movie_rdd) \
            .withColumnRenamed('_1', 'movie_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending = False)

    similar_movie_df = similar_movie_df.filter(col("movie_id") != m_id).limit(sim_mos_limit)
    similar_movie_df = similar_movie_df.withColumn('input_movie_id', lit(m_id))

    similar_movies_df = similar_movies_df \
                                .union(similar_movie_df)

    return similar_movies_df

In [11]:
def getMovieDetails(in_mos):

    a = in_mos.alias("a")
    b = df_movies.alias("b")

    return a.join(b, col("a.movie_id") == col("b.movieId"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.title'),col('b.genres')])


In [12]:
random_10_movies = tags_by_movie_trf_df.select("movie_id").limit(5).collect()
for row in random_10_movies:
    movie_id = row.movie_id
    print(movie_id)

1
2
3
4
5


In [14]:
mids = []
for row in random_10_movies:
    mids.append(str(row.movie_id))
print(mids)
for i in range(0,len(mids)):
    print('\ninput movies details:')
    df_movies.select('movieId', 'title', 'genres') \
        .filter(df_movies.movieId == mids[i]).show(truncate=False)
    try:
        sims = getMovieDetails(getSimilarMovies(mids[i]))
        print(f'Top 5 similar movies for {mids[i]} each input movies are:"')
        display(sims.select('input_movie_id', 'movie_id', 'title', 'score').toPandas())
    except IndexError as e:
        print(f"Error processing movie ID {mids[i]}: {e}")

['1', '2', '3', '4', '5']

input movies details:


ConnectionRefusedError: [Errno 111] Connection refused

In [25]:
def getContentRecoms(u_id, sim_mos_limit=5):

    # select movies having rating >= 3
    query = """
    SELECT distinct movieId as movie_id FROM ratings
    where rating >= 3.0
    and userId = "{}"
    """.format(u_id)

    usr_rev_mos = sqlContext.sql(query)

    # from these get sample of 5 movies
    usr_rev_mos = usr_rev_mos.sample(False, 0.5).limit(5)

    usr_rev_mos_det = getMovieDetails(usr_rev_mos)

    # show the sample details
    print('\nMovies previously reviewed by user {}'.format(u_id))
    usr_rev_mos_det.select(['movie_id', 'title', 'genres']).show(truncate = False)

    mos_list = [i.movie_id for i in usr_rev_mos.collect()]

    # get movies similar to a list
    sim_mos_dfs = []
    for i in mos_list:
     sim_mos_df = getSimilarMovies(i, sim_mos_limit)
     sim_mos_dfs.append(sim_mos_df)

     # Change to a DataFrame  
    sim_mos_df = sim_mos_dfs[0]
    for i in range(1, len(sim_mos_dfs)):
     sim_mos_df = sim_mos_df.union(sim_mos_dfs[i])

    # filter out those have been reviewd before by the user
    a = sim_mos_df.alias("a")
    b = usr_rev_mos.alias("b")
    c = a.join(b, col("a.movie_id") == col("b.movie_id"), 'left_outer') \
         .where(col("b.movie_id").isNull()) \
         .select([col('a.movie_id'),col('a.score')]).orderBy("a.score", ascending = False)

    x = c.limit(sim_mos_limit)

    return getMovieDetails(x)

In [26]:
getContentRecoms(3).toPandas()


Movies previously reviewed by user 3
+--------+---------------------------------------------------+----------------------------------+
|movie_id|title                                              |genres                            |
+--------+---------------------------------------------------+----------------------------------+
|296     |Pulp Fiction (1994)                                |Comedy|Crime|Drama|Thriller       |
|858     |Godfather, The (1972)                              |Crime|Drama                       |
|68954   |Up (2009)                                          |Adventure|Animation|Children|Drama|
|81834   |Harry Potter and the Deathly Hallows: Part 1 (2010)|Action|Adventure|Fantasy|IMAX     |
|187593  |Deadpool 2 (2018)                                  |Action|Comedy|Sci-Fi              |
+--------+---------------------------------------------------+----------------------------------+

[858, 296, 68954, 187593, 81834]


TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'