In [1]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS

Initializing Scala interpreter ...

Spark Web UI available at http://localhost:4041
SparkContext available as 'sc' (version = 2.4.6, master = local[*], app id = local-1600666027119)
SparkSession available as 'spark'


import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS


In [2]:
// Load the raw text file: space-separated, no header row, column types inferred by Spark.
val rawUserArtistData = spark.read.
    format("csv").
    options(Map(
        "inferSchema" -> "true",
        "sep" -> " ",
        "header" -> "false")).
    load("Data/Practice3/user_artist_data.txt")

rawUserArtistData: org.apache.spark.sql.DataFrame = [_c0: int, _c1: int ... 1 more field]


In [3]:
// Build the working DataFrame by giving the positional columns of the raw data readable names.
val userArtistDF = rawUserArtistData.toDF(Seq("user", "artist", "count"): _*)

userArtistDF: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 1 more field]


In [4]:
// Down-sample to a 0.001 fraction of the rows (without replacement) to keep the experiment small.
// A fixed seed makes the sample repeatable across notebook runs.
val smallUserArtistDF = userArtistDF.sample(withReplacement = false, fraction = 0.001, seed = 42L)

smallUserArtistDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user: int, artist: int ... 1 more field]


In [5]:
// Preview the sampled rows; 20 is the row count the no-argument show() uses.
smallUserArtistDF.show(numRows = 20)

+-------+-------+-----+
|   user| artist|count|
+-------+-------+-----+
|1000019|1003387|    1|
|1000033|    228|    3|
|1000039|1002748|    6|
|1000060|   1270|    6|
|1000072|1005015|    1|
|1000077|1004014|    3|
|1000079|   1158|    1|
|1000083|1003447|    1|
|1000084|1004360|    5|
|1000096|1058890|    3|
|1000098|1002469|    4|
|1000100|   5665|    2|
|1000104|1007119|    1|
|1000107|     58|    1|
|1000109|1018173|    1|
|1000111|1056491|    2|
|1000112|1007790|   18|
|1000116|1079768|   13|
|1000117|1002291|   57|
|1000127|1018361|    1|
+-------+-------+-----+
only showing top 20 rows



In [6]:
// Print the column names and types of the sampled DataFrame.
smallUserArtistDF.printSchema()

root
 |-- user: integer (nullable = true)
 |-- artist: integer (nullable = true)
 |-- count: integer (nullable = true)



In [7]:
// Split the sample into training and test sets with 0.8 / 0.2 weights.
// A fixed seed keeps the split repeatable across notebook runs.
val Array(training, test) = smallUserArtistDF.randomSplit(Array(0.8, 0.2), seed = 42L)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user: int, artist: int ... 1 more field]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user: int, artist: int ... 1 more field]


In [8]:
// Number of rows that ended up in the training split.
training.count()

res2: Long = 19482


In [9]:
// Number of rows that ended up in the test split.
test.count()

res3: Long = 4672


In [10]:
// The training and test data share the same schema.
training.printSchema()

root
 |-- user: integer (nullable = true)
 |-- artist: integer (nullable = true)
 |-- count: integer (nullable = true)



In [11]:
// Schema of the test split, for comparison with the training split's schema.
test.printSchema()

root
 |-- user: integer (nullable = true)
 |-- artist: integer (nullable = true)
 |-- count: integer (nullable = true)



In [12]:
// Configure the ALS (alternating least squares) estimator.
// "user" and "artist" are the id columns; "count" is used as the rating column.
// A fixed seed is set so the trained model is repeatable between runs.
// NOTE(review): "count" looks like implicit feedback (e.g. play counts) rather than an explicit
// rating -- consider setImplicitPrefs(true); confirm against how predictions are evaluated.
val als = new ALS().
            setMaxIter(10).
            setRegParam(0.01).
            setUserCol("user").
            setItemCol("artist").
            setRatingCol("count").
            setSeed(42L)

als: org.apache.spark.ml.recommendation.ALS = als_2a31560cca5c


In [13]:
// Train the model by fitting the ALS estimator on the training split.
val model = als.fit(training)

model: org.apache.spark.ml.recommendation.ALSModel = als_2a31560cca5c


In [14]:
// Set the cold-start strategy to "drop" so that NaN predictions do not appear in the results.
model.setColdStartStrategy("drop")

res6: model.type = als_2a31560cca5c


In [15]:
// Validate the trained model on the test split; transform adds a prediction column.
val predictions = model.transform(test)

predictions: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 2 more fields]


In [16]:
// Test results: display up to 1000 predictions ordered by user id.
predictions.orderBy("user").show(numRows = 1000)

+-------+--------+-----+-------------+
|   user|  artist|count|   prediction|
+-------+--------+-----+-------------+
|    629|     581|   21|     9.285477|
|   7275| 1037527|    1|   0.16125391|
|   8981|    1967|   12|    -5.245662|
|1000166| 1006867|   24|    1.5905558|
|1000611| 1052770|    6|   -0.3266813|
|1001168| 1003557|    2|  -0.24235153|
|1001843|     606|    1|   -13.615395|
|1001919|    4225|    1|     1.395536|
|1001919|    2430|   61|     1.563112|
|1001948| 1025477|    1|  0.015595555|
|1002213| 1008286|    2|   0.05957881|
|1002213| 1085462|    1| -0.045264512|
|1002624| 1006229|   95|   0.28858152|
|1002702| 1001529|    5|    -13.34993|
|1003006| 1003673|    1|   -2.6450448|
|1003147|    1312|    9|   -1.1817982|
|1003183|    1400|    2|    15.144583|
|1003303| 1080742|   30|   -6.5099373|
|1003451|    1319|   74|    11.812155|
|1003642| 1003356|    4|   -3.9695845|
|1003798| 1003430|    2|    -4.971599|
|1004073| 1000428|    1|   0.43636173|
|1004341| 2003588|    1| 

|2174191| 1298659|    8|   -0.9376649|
|2174491| 1207868|    1|  -0.08675782|
|2175419|    4257|    1|    -4.116563|
|2176461| 1004623|    3|   -3.9308667|
|2177837| 1015580|    6|  0.017440302|
|2177924| 1000557|    9|  -0.42004892|
|2178573|    1278|    1|    1.9631386|
|2180290| 1001909|    2|     39.02305|
|2180622| 1001300|    2|   0.10501088|
|2180784| 2104089|    1|   0.15778723|
|2185222| 1004028|    1|   -1.9450617|
|2185222| 1007614|    1| -0.050370023|
|2185222| 1263154|    1|  -0.03967783|
|2185378| 1008688|   66|   0.09190286|
|2187673| 1006160|    6|    -25.56381|
|2188103| 1000123|   25|    5.6505055|
|2188925| 1000487|    2|   -16.041414|
|2191534| 1000931|    1|   -2.4588478|
|2191736| 6730533|    1|  -0.20029432|
|2193433| 1041765|    5|   -3.5261073|
|2193487| 1004372|    3|   -1.1591077|
|2195400| 1233610|    1|  -0.04757216|
|2195640| 1014632|   16|  0.024607299|
|2196333| 1007398|   11|   0.17472205|
|2198256| 1000591|    1|    15.889714|
|2199314| 1024202|    1| 

|2394275|     221|    1|    4.1676583|
|2395156| 1004122|    1|    -5.938015|
|2396123| 1037556|    2|     6.740632|
|2396813| 2003588|    4|   -2.0280666|
|2396865| 1101276|   17|     9.563113|
|2399799| 1000031|    1|   -4.9913287|
|2403099| 1002621|    1| -0.099237435|
|2405052|    1410|    1|   0.72573984|
|2405492| 1015116|    2|     4.309177|
|2405716| 1001129|    8|     3.036231|
|2409247|     223|   23|    13.617811|
|2413827|    5810|   17|    0.5297937|
|2416462| 1236878|    1|    0.8107065|
|2417421|    1312|    7|    0.2344199|
|2417571| 1014421|   11|     7.791129|
|2420301| 1103075|    2|    25.810171|
|2421208| 1238128|    5|  0.026873887|
|2421298| 1001339|    1| -0.036271498|
+-------+--------+-----+-------------+

