원본: <a href='https://www.kaggle.com/morrisb/how-to-recommend-anything-deep-recommender'>캐글 노트북</a>

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os 
# Switch to the directory that contains the `netflix/` and `movie_dataset/`
# folders read by the cells below. The location can be overridden with the
# NETFLIX_DATA_DIR environment variable; when it is unset, the original
# author's local path is used so existing setups keep working unchanged.
os.chdir(os.environ.get('NETFLIX_DATA_DIR', '/Users/younghun/Desktop/gitrepo/data'))

# Load datasets

## Movie-data

In [22]:
# Netflix movie titles. The file has no header row, so header=None is required
# and the column names are supplied explicitly through `names`.
movie_titles = pd.read_csv(
    "./netflix/movie_titles.csv",
    encoding='ISO-8859-1',
    header=None,
    names=['Id', 'Year', 'Name'],
)
movie_titles = movie_titles.set_index('Id')
print("Shape of Netflix movie dataset :", movie_titles.shape)
# Look at five randomly sampled rows.
movie_titles.sample(5)

Shape of Netflix movie dataset : (17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
4443,1989.0,City Hunter: .357 Magnum
2193,1960.0,The Wasp Woman/Attack of the Giant Leeches
17733,2002.0,13th Child: Legend of the Jersey Devil
1201,1991.0,True Colors
2917,2002.0,The Ocean's 11 Story


In [17]:
# Print the column dtypes, non-null counts and memory usage of movie_titles.
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17770 entries, 1 to 17770
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Year    17763 non-null  float64
 1   Name    17770 non-null  object 
dtypes: float64(1), object(1)
memory usage: 416.5+ KB


In [23]:
# With low_memory=True the data is loaded using less memory, but dtypes are
# inferred piecewise per column, which can produce mixed-type columns
# (pandas then emits a DtypeWarning).
movie_metadata = pd.read_csv(
    './movie_dataset/movies_metadata.csv',
    low_memory=True,
)


Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.



In [32]:
# Keep only the movie title, its overview and vote_count.
# `usecols` makes the parser load just these three columns instead of reading
# every column and discarding the rest afterwards. The explicit column
# selection after set_index pins the column order regardless of the order in
# the file. dropna() runs after set_index, so only rows with a missing
# overview or vote_count are removed (the index itself is not checked).
movie_metadata = (
    pd.read_csv(
        './movie_dataset/movies_metadata.csv',
        usecols=['original_title', 'overview', 'vote_count'],
        low_memory=False,
    )
    .set_index('original_title')[['overview', 'vote_count']]
    .dropna()
)

In [33]:
# Remove the rarely rated long tail: keep only rows whose vote_count is
# greater than 10, then drop the vote_count column as well.
movie_metadata = (
    movie_metadata
    .loc[lambda frame: frame['vote_count'] > 10]
    .drop(columns='vote_count')
)
print("A shape of movie-metadata :", movie_metadata.shape)
movie_metadata.sample(5)

A shape of movie-metadata : (21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
Wizards,An ambiguous battle between good and evil wage...
Винни-Пух,"With a cheeky, down-to-earth charm that appeal..."
Spartacus,Spartacus is a 1960 American historical drama ...
The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?,"Beatnick Jerry takes his girl, Angie to the ca..."
How to Be a Man,When former comedian Mark McCarthy is faced wi...


## User-data

In [40]:
# usecols=<list>: the columns to load (column positions or column names both work).
df_raw = pd.read_csv(
    './netflix/combined_data_1.txt',
    header=None,
    names=['Movie', 'Rating', 'Date'],
    usecols=[0, 1, 2],
)
df_raw.head(2)

Unnamed: 0,Movie,Rating,Date
0,1488844,3.0,2005-09-06
1,822109,5.0,2005-05-13


In [41]:
# Find the rows whose Rating is missing; their 'Movie' field holds a movie
# label such as "2:". reset_index() keeps each row's original position in a
# new 'index' column.
tmp_movies = df_raw.loc[df_raw['Rating'].isna(), 'Movie'].reset_index()
tmp_movies.head(3)

Unnamed: 0,index,Movie
0,547,2:
1,693,3:
2,2706,4:


In [42]:
# Movie indices as [row position, movie id] pairs.
# The label is sliced with [:-1] to strip the trailing colon (:) before
# converting it to an int.
movie_indices = [
    [row_position, int(movie_label[:-1])]
    for row_position, movie_label in tmp_movies.values
]

In [44]:
from collections import deque

# Shift every entry one step to the left: the first [index, movie] pair is
# moved to the very end of the sequence.
shifted_movie_indices = deque(movie_indices[1:] + movie_indices[:1])