# EXPLORATION_SBA 4.  Movielens 영화 추천 시스템 만들기

이번에 활용할 데이터셋은 추천시스템의 MNIST라고 부를만한 Movielens 데이터입니다.
---
* 유저가 영화에 대해 평점을 매긴 데이터가 데이터 크기 별로 있습니다. MovieLens 1M Dataset 사용을 권장합니다.
* 별점 데이터는 대표적인 explicit 데이터입니다. 하지만 implicit 데이터로 간주하고 테스트해볼 수 있습니다.
* 별점을 시청횟수로 해석해서 생각하겠습니다.
* 또한 유저가 3점 미만으로 준 데이터는 선호하지 않는다고 가정하고 제외하겠습니다.

In [None]:
# $ pip install beautifulsoup4
# $ pip install newspaper3k
# $ pip install konlpy

# Import Library (step.01)

In [2]:
import pandas as pd
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.display.max_rows=150
%matplotlib inline
import os

# Load Data (step.02)

In [3]:
# Load the MovieLens 1M ratings file from the user's home directory.
# os.path.expanduser('~/...') replaces os.getenv('HOME') + '...': when HOME is
# not set, os.getenv returns None and the string concatenation raises TypeError.
rating_file_path = os.path.expanduser('~/aiffel/recommendata_iu/data/ml-1m/ratings.dat')
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# The file has no header row, so column names are supplied explicitly.
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
# Row count before any filtering; the filtering cell reads this name again,
# which is why the 'orginal' spelling is kept.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


# Data Processing (step.03)

In [4]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings.loc[ratings['rating'] >= 3]
filtered_data_size = ratings.shape[0]

# Report how many rows survived the filter, as counts and as a percentage.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# Rename the 'rating' column to 'play_count'; the notebook treats the star
# rating as a play count from here on.
ratings = ratings.rename(columns={'rating': 'play_count'})
print(ratings.shape)
ratings.head()

(836478, 4)


Unnamed: 0,user_id,movie_id,play_count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Load the movie metadata so that titles can be shown alongside ratings.
# os.path.expanduser('~/...') replaces os.getenv('HOME') + '...': when HOME is
# not set, os.getenv returns None and the string concatenation raises TypeError.
movie_file_path = os.path.expanduser('~/aiffel/recommendata_iu/data/ml-1m/movies.dat')
cols = ['movie_id', 'title', 'genre']
# NOTE(review): no encoding is passed, so pandas decodes with its default;
# confirm this matches how the local movies.dat is encoded.
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
print(movies.shape)
movies.head()

(3883, 3)


Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Join the ratings with the movie metadata on their shared 'movie_id' column.
df = ratings.merge(movies, on='movie_id')
print(df.shape)
df.head()

(836478, 6)


Unnamed: 0,user_id,movie_id,play_count,timestamp,title,genre
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama


# EDA (step.02)

In [8]:
# Data Profiling
#profile = df.profile_report()
#profile

### Columns 분석 (step. 2-1)

* user_id: 각 유저의 고유한 id 값입니다. 총 6,039명입니다.
* movie_id: 각 영화의 고유한 id 값입니다. 총 3,628편입니다.
* play_count: 각 영화에 대한 유저의 별점입니다. 이번 프로젝트에서는 재생 횟수로 해석하여 사용합니다. 3회 미만의 값은 제외했으며,   
  4(41.7% / 348,971번), 3(31.2% / 261,196번), 5(27.1% / 226,310번) 순으로 분포되어 있습니다.
* timestamp: 1970년 1월 1일(Universal Time (UTC) of January 1, 1970) 이후 경과한 시간 정보입니다.
* title: 각 영화의 제목
* genre: 각 영화의 장르

### 가장 인기있는 영화 순위 (step. 2-2)

In [55]:
# Per-title total and mean of play_count.
# The aggregations are named by string ('sum', 'mean') rather than passed as
# np.sum / np.mean: recent pandas releases warn about NumPy callables here,
# while the string names give the same 'sum' / 'mean' column labels.
top_mv_df = df.pivot_table(index='title', values='play_count', aggfunc=['sum', 'mean'])
# Most popular first: sort by the ('sum', 'play_count') column, descending.
top_mv_df = top_mv_df.sort_values(by=('sum', 'play_count'), ascending=False)
top_mv_df.head(100)

Unnamed: 0_level_0,sum,mean
Unnamed: 0_level_1,play_count,play_count
title,Unnamed: 1_level_2,Unnamed: 2_level_2
American Beauty (1999),14449,4.499844
Star Wars: Episode IV - A New Hope (1977),13178,4.528522
Star Wars: Episode V - The Empire Strikes Back (1980),12648,4.384055
Saving Private Ryan (1998),11353,4.431304
Star Wars: Episode VI - Return of the Jedi (1983),11303,4.161635
Raiders of the Lost Ark (1981),11179,4.520421
"Silence of the Lambs, The (1991)",11096,4.441954
"Matrix, The (1999)",10903,4.479458
"Sixth Sense, The (1999)",10703,4.487631
Terminator 2: Judgment Day (1991),10513,4.190116


## Build Recommender System (step. 3)

### 임의의 유저 데이터 만들기 (step. 3-1)

In [16]:
# Pick five favourite movies by movie_id.
my_favorite = [2028, 318, 356, 1704, 1721]

# For every id, collect the distinct titles that df holds for it
# (one list per movie, in the same order as my_favorite).
all_movie_name = [
    list(df.loc[df['movie_id'] == movie_id, 'title'].unique())
    for movie_id in my_favorite
]
all_movie_name

[['Saving Private Ryan (1998)'],
 ['Shawshank Redemption, The (1994)'],
 ['Forrest Gump (1994)'],
 ['Good Will Hunting (1997)'],
 ['Titanic (1997)']]

In [56]:
# Pick five favourite movies, this time by title.
my_favorite = [
    'Saving Private Ryan (1998)',
    'Shawshank Redemption, The (1994)',
    'Forrest Gump (1994)',
    'Good Will Hunting (1997)',
    'Titanic (1997)',
]

# Assume a user with user_id 'yg' watched each of these movies 5 times.
# The scalar user_id and play_count are broadcast to one row per title, so the
# frame stays consistent if titles are added to or removed from my_favorite
# (the previous ['yg']*5 / [5]*5 had to be kept in sync with the list by hand).
my_movielist = pd.DataFrame({
    'user_id': 'yg',
    'title': my_favorite,
    'play_count': 5,
})

my_movielist

Unnamed: 0,user_id,title,play_count
0,yg,Saving Private Ryan (1998),5
1,yg,"Shawshank Redemption, The (1994)",5
2,yg,Forrest Gump (1994),5
3,yg,Good Will Hunting (1997),5
4,yg,Titanic (1997),5


In [57]:
# Add the hand-made 'yg' rows to df, but only when no 'yg' row exists yet,
# so that re-running this cell does not add the same rows again.
if not df['user_id'].isin(['yg']).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # ignore_index=True renumbers the rows so the new rows get fresh labels
    # instead of repeating labels 0-4 that already exist in df.
    # Columns missing from my_movielist (movie_id, timestamp, genre) become NaN.
    df = pd.concat([df, my_movielist], ignore_index=True)

print(df.shape)
df[df['user_id'] == 'yg']

(836488, 6)


Unnamed: 0,user_id,movie_id,play_count,timestamp,title,genre
0,yg,,5,,Saving Private Ryan (1998),
1,yg,,5,,"Shawshank Redemption, The (1994)",
2,yg,,5,,Forrest Gump (1994),
3,yg,,5,,Good Will Hunting (1997),
4,yg,,5,,Titanic (1997),


### CSR matrix (step. 3-2)

In [61]:
# Author's note: the raw user_id / movie_id values have gaps (some movies never
# appear in the filtered ratings), and building the CSR matrix from them failed
# with 'row index exceeds matrix dimensions'. Users and titles are therefore
# re-numbered with consecutive integers starting at 0.
# Despite their names, both dicts map an original value to its new index.
user_idx_to_unique = {user: idx for idx, user in enumerate(df['user_id'].unique())}
movie_idx_to_unique = {title: idx for idx, title in enumerate(df['title'].unique())}

In [63]:
# Translate every user_id and title into its integer index.
# dict.get returns None for an unknown key; dropna() discards such entries.
temp_user_data = df['user_id'].map(user_idx_to_unique.get).dropna()
temp_movie_data = df['title'].map(movie_idx_to_unique.get).dropna()

In [64]:
# Overwrite the original user ids and titles with their integer indices.
df['user_id'] = temp_user_data
df['title'] = temp_movie_data

In [65]:
from scipy.sparse import csr_matrix
# Matrix dimensions: one row per distinct user index, one column per distinct title index.
num_user, num_movie = df['user_id'].nunique(), df['title'].nunique()

# user x movie sparse matrix whose stored values are the play counts.
csr_data = csr_matrix(
    (df['play_count'], (df['user_id'], df['title'])),
    shape=(num_user, num_movie),
)
csr_data

<6041x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836488 stored elements in Compressed Sparse Row format>

### MF model (step. 3-3)

In [73]:
from implicit.als import AlternatingLeastSquares

# Set the threading-related environment variables for this process.
# NOTE(review): presumably meant to keep OpenBLAS/MKL single-threaded while the
# ALS model trains; they are set after numpy and implicit were imported, so
# confirm they still take effect at this point.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [74]:
# Declare the implicit AlternatingLeastSquares model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    use_gpu=False,
    iterations=15,
    dtype=np.float32,
)

# Author's note: the ALS model takes an item x user matrix as input, so the
# user x item matrix is transposed first.
# NOTE(review): this depends on the installed implicit version - confirm.
csr_data_transpose = csr_data.T

# Train the model.
als_model.fit(csr_data_transpose)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [80]:
# Look at the preference the trained model predicts for 'yg' on
# 'Saving Private Ryan (1998)': first resolve both to their matrix indices...
yg = user_idx_to_unique['yg']
saving_private_ryan = movie_idx_to_unique['Saving Private Ryan (1998)']
# ...then pull the matching user and item factor vectors out of the model.
yg_vector = als_model.user_factors[yg]
saving_private_ryan_vector = als_model.item_factors[saving_private_ryan]

In [81]:
# Display the factor vector the model learned for user 'yg'.
yg_vector

array([ 0.53412   ,  1.1368464 ,  0.10657179, -0.766816  ,  0.20408298,
        1.0564585 ,  0.7689555 , -0.20662464,  0.1823856 ,  0.07579252,
       -0.2027361 ,  0.30861524, -0.93632376,  0.36955142, -0.65638715,
        0.24887165,  0.4260783 , -0.66236025,  1.475812  ,  0.64696556,
       -0.2607024 , -0.30843422,  0.33657253,  0.5658129 ,  0.3162023 ,
        1.3804235 , -0.96889585, -0.13392273, -0.28375   ,  0.55276686,
       -0.9339614 ,  0.32801872, -0.43483424, -0.12975946,  1.2847053 ,
       -1.0301179 ,  0.4193737 , -0.03725054, -0.33671993,  0.35980877,
       -0.35501775,  0.23515633, -0.01259791, -0.0086758 ,  0.32115313,
       -0.63770556, -0.31199044,  0.37301475,  0.53259695, -0.14632635,
        0.6919062 , -0.88451254,  0.31434467, -0.26409042, -0.6735379 ,
        0.3527047 ,  0.02572737, -0.0114514 ,  0.20213203,  0.14085113,
       -0.55689454,  0.03178289,  0.44960317,  0.49888787,  0.02639334,
        0.54208535, -0.66248226, -0.3852162 , -0.28618234,  0.48

In [82]:
# Display the factor vector the model learned for 'Saving Private Ryan (1998)'.
saving_private_ryan_vector

array([-0.00571037,  0.04444493,  0.00577713, -0.00481409,  0.00610755,
        0.05653097,  0.02375943,  0.00217689,  0.00796623,  0.02410106,
       -0.00657601,  0.00428743, -0.02649089,  0.01818253, -0.02119873,
        0.01920876,  0.00852095, -0.02069071,  0.05020994,  0.02597107,
       -0.02428959, -0.00016849,  0.00205183,  0.01351323, -0.00721883,
        0.0186558 , -0.02366879,  0.01508594,  0.00280656,  0.0450938 ,
       -0.00838229,  0.01707692, -0.01662128, -0.00099133,  0.05508611,
       -0.01478952,  0.00911156,  0.02187053, -0.01444149,  0.0043776 ,
        0.00387314,  0.01207224, -0.00379883,  0.00450209,  0.02336314,
        0.01004437, -0.01226184,  0.02602094,  0.05315115, -0.00028927,
        0.03946438, -0.01937563, -0.00640536,  0.00574657,  0.00333908,
        0.00542469, -0.02101476,  0.02713138,  0.00160019,  0.03016699,
        0.00303837,  0.01541413, -0.02361482, -0.00558335, -0.0020444 ,
       -0.00316823, -0.0320195 ,  0.01619278, -0.01843677,  0.02

In [83]:
# Dot product of the 'yg' user vector and the 'Saving Private Ryan (1998)'
# item vector; the run recorded in this notebook printed about 0.80.
yg_vector.dot(saving_private_ryan_vector)

0.80158806

In [84]:
# Look at the preference the trained model predicts for 'yg' on
# 'Toy Story (1995)', a movie that is not among the favourites.
yg = user_idx_to_unique['yg']
toy_story = movie_idx_to_unique['Toy Story (1995)']
yg_vector = als_model.user_factors[yg]
toy_story_vector = als_model.item_factors[toy_story]

In [85]:
# Dot product of the 'yg' user vector and the 'Toy Story (1995)' item vector;
# the run recorded in this notebook printed about 0.16.
yg_vector.dot(toy_story_vector)

0.15585774

### 선호하는 영화와 비슷한 영화 추천하기 (step. 3-4)

In [86]:
# Ask the model for the 15 items most similar to 'Saving Private Ryan (1998)'.
# favorite_movie is the movie's integer index, not its title.
favorite_movie = movie_idx_to_unique['Saving Private Ryan (1998)']
# In the recorded run the result is a list of (item index, score) pairs and
# the queried movie itself comes first with a score of about 1.0.
similar_movie = als_model.similar_items(favorite_movie, N=15)
similar_movie

[(48, 1.0000001),
 (23, 0.6729565),
 (87, 0.663535),
 (157, 0.50173163),
 (248, 0.44217122),
 (487, 0.42951497),
 (141, 0.42041236),
 (3499, 0.4197104),
 (121, 0.38417384),
 (99, 0.36131784),
 (238, 0.36055714),
 (160, 0.33884883),
 (3297, 0.3383508),
 (269, 0.33691177),
 (124, 0.3249224)]

In [88]:
# Invert the title -> index mapping so that indices can be turned back into titles.
idx_to_movie = {idx: title for title, idx in movie_idx_to_unique.items()}
# Titles of the similar movies, in the order the model returned them.
[idx_to_movie[movie_idx] for movie_idx, _score in similar_movie]

['Saving Private Ryan (1998)',
 "Schindler's List (1993)",
 'Braveheart (1995)',
 'Shawshank Redemption, The (1994)',
 'Good Will Hunting (1997)',
 'Boat, The (Das Boot) (1981)',
 'Fugitive, The (1993)',
 'Simon Sez (1999)',
 'Silence of the Lambs, The (1991)',
 'American Beauty (1999)',
 'Thin Red Line, The (1998)',
 'Forrest Gump (1994)',
 'Germinal (1993)',
 'GoodFellas (1990)',
 'Matrix, The (1999)']

### 'yg' 유저가 가장 좋아할 만한 영화들 추천 하기(step. 3-5)

In [89]:
user = user_idx_to_unique['yg']
# Author's note: recommend() takes the user x item CSR matrix.
# NOTE(review): the call signature and return shape depend on the installed
# implicit version - confirm before upgrading.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)

# Map each recommended item index back to its title.
idx_to_movie = {idx: title for title, idx in movie_idx_to_unique.items()}
[idx_to_movie[movie_idx] for movie_idx, _score in movie_recommended]

["Schindler's List (1993)",
 'Silence of the Lambs, The (1991)',
 'Braveheart (1995)',
 'Apollo 13 (1995)',
 'Jerry Maguire (1996)',
 'Pulp Fiction (1994)',
 'American Beauty (1999)',
 'GoodFellas (1990)',
 'Groundhog Day (1993)',
 'Rain Man (1988)',
 'Back to the Future (1985)',
 'Fargo (1996)',
 'Sixth Sense, The (1999)',
 'As Good As It Gets (1997)',
 'Dead Man Walking (1995)',
 'Truman Show, The (1998)',
 'Gone with the Wind (1939)',
 'Sling Blade (1996)',
 "You've Got Mail (1998)",
 'Dances with Wolves (1990)']

In [91]:
# Find out how much each of the user's movies contributed to the score the
# model gives 'Saving Private Ryan (1998)'.
explain = als_model.explain(user, csr_data, itemid=saving_private_ryan)
# The second element of the result holds (item index, contribution) pairs;
# show them with titles instead of indices.
[(idx_to_movie[movie_idx], contribution) for movie_idx, contribution in explain[1]]

[('Saving Private Ryan (1998)', 0.4585032710841496),
 ('Shawshank Redemption, The (1994)', 0.09838169730606845),
 ('Good Will Hunting (1997)', 0.09628851030521124),
 ('Forrest Gump (1994)', 0.06913824212297956),
 ('Titanic (1997)', 0.06328307803759764)]