# 잠재요인기반 추천시스템 ALS 모델 만들기 

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session; the executor and the driver get the same memory setting.
MAX_MEMORY = "5g"
spark = (
    SparkSession.builder
    .appName("241212_01_MLlib_als")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

24/12/16 11:46:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# data 적재 

In [2]:
# Load the ratings CSV: the first row is the header and column types are inferred.
rating_df = spark.read.csv('data/ratings.csv', header=True, inferSchema=True)

                                                                                

In [11]:
# Print the inferred column names and types to check how the CSV was parsed.
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [12]:
# Preview the first rows of the ratings data.
rating_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



# 전처리 
- timestamp 제거 

In [3]:
# Keep only the three columns used for ALS; `timestamp` is left out.
rating_df = rating_df.select("userId", "movieId", "rating")

# 데이터 세트 분할

In [4]:
# 80/20 train/test split of the ratings, using a fixed seed.
train_ratio, test_ratio = 0.8, 0.2
train_df, test_df = rating_df.randomSplit(weights=[train_ratio, test_ratio], seed=42)

# ALS 모델 객체 생성 

In [5]:
from pyspark.ml.recommendation import ALS

# Configure the ALS estimator; it is fitted in a later cell.
als = ALS(
    # Columns of the ratings DataFrame that ALS reads.
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    # Optimisation settings: iteration cap and regularisation strength.
    maxIter=5,
    regParam=0.1,
    # Rows whose prediction would be NaN are dropped from transform() output.
    coldStartStrategy="drop",
)

# 모델의 학습 

In [6]:
# Fit the ALS estimator on the training split; the result is the trained model.
als_model= als.fit(train_df)

24/12/16 11:48:02 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/16 11:48:02 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/16 11:48:03 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/16 11:48:03 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

In [None]:
#  예측 확인 

In [7]:

# (userId, movieId) -> predicted rating: transform() adds a `prediction` column
# next to the actual `rating` of each test row.
predictions= als_model.transform(test_df)
predictions.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 1.9323642|
|151614|    148|   1.0| 2.7285917|
| 28229|    148|   1.0| 2.4264994|
|  6491|    148|   4.0|  2.457274|
| 14831|    148|   3.0| 2.6998045|
|145182|    148|   3.0| 2.5584338|
| 69123|    148|   4.5|  2.265552|
|104825|    148|   4.0| 3.0939217|
| 41703|    148|   2.0| 2.8506374|
| 29213|    148|   5.0|  2.224452|
| 75209|    148|   2.0| 2.5537493|
|115912|    148|   3.0| 2.4819057|
|118261|    148|   3.0| 3.0280035|
|138552|    148|   4.0|   3.39528|
| 70733|    148|   1.0|  2.884383|
|115095|    148|   4.0| 3.3555832|
|  7223|    148|   3.0| 2.5141237|
| 65981|    148|   3.5|  2.942538|
| 74794|    148|   3.0| 2.3509142|
| 33145|    148|   5.0| 2.9147615|
+------+-------+------+----------+
only showing top 20 rows



In [8]:
# Summary statistics of actual vs. predicted ratings. Predictions are not clipped
# to the rating scale: the recorded run has values below 0.5 and above 5.0.
predictions.select("rating", "prediction").describe().show()



+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998109|           4998109|
|   mean|3.5341648211353536|3.3989586474738536|
| stddev|1.0609230261741123|0.6359828621868081|
|    min|               0.5|        -2.1170578|
|    max|               5.0|         6.4117255|
+-------+------------------+------------------+



                                                                                

# ALS 모델의 하이퍼파라미터 조정
- coldStartStrategy = 'drop' > 학습 데이터에 없는 사용자/영화라서 예측값이 NaN이 되는 행은 예측 결과에서 제거('drop')

- 추정기(als)의 속성을 변경해도 이미 학습된 모델(als_model)에는 반영되지 않음 > 다시 예측을 해도 결과가 동일함 > 변경한 설정을 반영하려면 학습부터 다시 실행

In [9]:
# NOTE: this sets coldStartStrategy on the estimator `als`, not on the
# already-fitted `als_model`; 'drop' is also the value `als` was created with.
als.setColdStartStrategy('drop')

ALS_3bf19907491e

In [10]:
# (userId, movieId) -> predicted rating: run the prediction again with the same
# fitted model and show the first 5 rows.
predictions = als_model.transform(test_df)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 1.9323642|
|151614|    148|   1.0| 2.7285917|
| 28229|    148|   1.0| 2.4264994|
|  6491|    148|   4.0|  2.457274|
| 14831|    148|   3.0| 2.6998045|
+------+-------+------+----------+
only showing top 5 rows



                                                                                

In [11]:
# Summary statistics of actual vs. predicted ratings for the re-run prediction.
predictions.select("rating", "prediction").describe().show()



+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998109|           4998109|
|   mean|3.5341648211353536|3.3989586474738536|
| stddev|1.0609230261741123|0.6359828621868081|
|    min|               0.5|        -2.1170578|
|    max|               5.0|         6.4117255|
+-------+------------------+------------------+



                                                                                

# 평가 

RMSE 측정 

In [12]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the actual `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)

In [13]:
# Compute the RMSE over the test-set predictions; the bare name displays the value.
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

0.8132955975149425

#  활용

- 1. 사용자 id > 추천 영화 목록
- 2. 영화 id > 추천 사용자 목록

In [59]:
# Top-3 movie recommendations for every user. The ALSModel method is
# `recommendForAllUsers`; the previously used name does not exist on the model.
als_model.recommendForAllUsers(3)

In [None]:
# For every movie, pick the 3 users it should be recommended to.
# The ALSModel method is spelled `recommendForAllItems`.
als_model.recommendForAllItems(3)

In [47]:
from pyspark.sql.types import IntegerType

In [60]:
# Build a single-column DataFrame holding the user ids to get recommendations for.
user_list = [65, 78, 81]
# IntegerType is passed as the schema for the plain ints; toDF names the column "userId".
user_df = spark.createDataFrame(user_list, IntegerType()).toDF("userId")
user_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [49]:
# Top-3 movie recommendations restricted to the users listed in `user_df`.
user_recommend_movies = als_model.recommendForUserSubset(user_df, numItems=3)
user_recommend_movies.show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{194434, 7.21672...|
|    78|[{203086, 7.02569...|
|    81|[{197433, 4.82499...|
+------+--------------------+



In [None]:
# movie-list load

In [61]:
# Load the movie metadata CSV (header row present, column types inferred).
movie_file = 'data/movies.csv'
movies_df = spark.read.csv(path=movie_file, header=True, inferSchema=True)

In [62]:
# Preview the first 3 rows of the movie metadata.
movies_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [None]:
#65번 사용자의 추천영화목록 

In [71]:
# Select user 65's row explicitly. The row order of a collected DataFrame is
# not guaranteed, so `collect()[0]` could return another user's recommendations.
target_user_id = 65
user_row = user_recommend_movies.where(
    user_recommend_movies.userId == target_user_id
).first()
if user_row is None:
    # first() returns None when no row matches the filter.
    raise ValueError(f"no recommendations available for user {target_user_id}")
# List of Row(movieId, rating) entries recommended for that user.
movies_list = user_row.recommendations
movies_list



[Row(movieId=194434, rating=7.216722011566162),
 Row(movieId=205277, rating=6.863764762878418),
 Row(movieId=177209, rating=6.7588629722595215)]

In [72]:
# Turn the list of (movieId, rating) Rows into a DataFrame so it can be joined
# with the movie metadata.
rec_df= spark.createDataFrame(movies_list)
rec_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 194434| 7.216722011566162|
| 205277| 6.863764762878418|
| 177209|6.7588629722595215|
+-------+------------------+



In [73]:
# 영화정보와 추천영화 목록을 조인 

In [74]:
# Register both DataFrames as temp views so they can be referenced by name in SQL.
rec_df.createOrReplaceTempView('recommend')
movies_df.createOrReplaceTempView('movies')

In [78]:
# SQL that joins the movie metadata to the recommendations on movieId and sorts
# by the recommendation score, highest first.
# NOTE: `select *` returns movieId from both views, so the result has two
# movieId columns.
query= '''
select *
from movies
join recommend on movies.movieId = recommend.movieId
ORDER by rating desc
'''

In [79]:
# Run the join query against the registered temp views.
recommend_movies = spark.sql(query)

In [80]:
# Display the recommended movies together with their titles and genres.
recommend_movies.show()

+-------+-----------------+--------------------+-------+------------------+
|movieId|            title|              genres|movieId|            rating|
+-------+-----------------+--------------------+-------+------------------+
| 194434|Adrenaline (1990)|  (no genres listed)| 194434| 7.216722011566162|
| 205277|Inside Out (1991)|Comedy|Drama|Romance| 205277| 6.863764762878418|
| 177209|   Acı Aşk (2009)|               Drama| 177209|6.7588629722595215|
+-------+-----------------+--------------------+-------+------------------+



In [None]:
# 택시비 예측 모델 만들기 

In [81]:
# Stop the Spark session at the end of the notebook.
spark.stop()