In [1]:
# ! pip install rake_nltk

import pandas as pd
import numpy as np
from rake_nltk import Rake #import rake algorithm 提取關鍵字

from sklearn.metrics.pairwise import cosine_similarity # 計算相似程度會用到
from sklearn.feature_extraction.text import CountVectorizer # 將特徵轉成向量，模型讀的是向量

from ast import literal_eval # 將str轉成 list

In [2]:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [3]:
credits.columns = ['id', 'title', 'cast', 'crew']

In [4]:
alldata = movies.merge(credits, on='id')

## alldata中需要的欄位取出來存成dataframe x

In [5]:
x = alldata[['id', 'original_title', 'genres', 'keywords', 'overview', 'original_language', 'cast', 'crew']]

In [6]:
x.head(1)

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",en,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# 一. 資料清洗

## 將全部的特徵欄位從str轉回原本的list型態
## 用apply方便

In [7]:
# 要觀察的特徵欄位
features = ['keywords', 'genres', 'cast', 'crew']
for feature in features:
    x[feature] = alldata[feature].apply(literal_eval)
    # 全部變成 list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[feature] = alldata[feature].apply(literal_eval)


## 省略錯誤訊息

In [8]:
import warnings
warnings.filterwarnings('ignore')

## Extract list of genres

In [9]:
def list_genres(x):
    l = [d["name"] for d in x]
    return(l)

x['genres'] = x['genres'].apply(list_genres)

## Extract top 3 cast members 演員
### 找前三位主要演員

In [10]:
def list_cast(x):
    l = [d["name"] for d in x]
    if len(l) > 3:
        l = l[0:3]
    return(l)

# test = ['1', '2', '3', '4', '5']
# test[0:3] 取3個 0 1 2
x['cast'] = x['cast'].apply(list_cast)

## Extract top 5 keywords
### 找出前5個關鍵字

In [11]:
def list_keywords(x):
    l = [d['name'] for d in x]
    if len(l)>5:
        l = l[0:5]
    return(l)
x['keywords'] = x['keywords'].apply(list_keywords)

## Extract director from crew

In [12]:
x['crew'][2]

[{'credit_id': '54805967c3a36829b5002c41',
  'department': 'Sound',
  'gender': 2,
  'id': 153,
  'job': 'Original Music Composer',
  'name': 'Thomas Newman'},
 {'credit_id': '52fe4d22c3a368484e1d8d77',
  'department': 'Directing',
  'gender': 2,
  'id': 39,
  'job': 'Director',
  'name': 'Sam Mendes'},
 {'credit_id': '5677cd99925141691a005333',
  'department': 'Art',
  'gender': 1,
  'id': 8384,
  'job': 'Set Decoration',
  'name': 'Anna Pinnock'},
 {'credit_id': '52fe4d22c3a368484e1d8d8d',
  'department': 'Writing',
  'gender': 2,
  'id': 932,
  'job': 'Screenplay',
  'name': 'John Logan'},
 {'credit_id': '562fc9bb92514129fe0006b4',
  'department': 'Writing',
  'gender': 2,
  'id': 932,
  'job': 'Story',
  'name': 'John Logan'},
 {'credit_id': '5654d41c92514145c3000373',
  'department': 'Art',
  'gender': 2,
  'id': 4248,
  'job': 'Production Design',
  'name': 'Dennis Gassner'},
 {'credit_id': '52fe4d22c3a368484e1d8d71',
  'department': 'Writing',
  'gender': 2,
  'id': 9856,
  'job

In [13]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
x['director'] = x['crew'].apply(get_director)

## drop the other column crew 

In [14]:
x = x.drop('crew', axis=1)

In [15]:
x.head(5)

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,director
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",en,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",en,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi6]",A cryptic message from Bond’s past sends him o...,en,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,en,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...",en,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton


## Clean features of spaces and lowercase all to ensure uniques


In [16]:
features = ['keywords', 'genres', 'cast', 'director']

In [17]:
def clean_feat(x):
    if isinstance(x, list):
        return [i.lower().replace(' ','') for i in x]
    else:
        if isinstance(x, str):
            return x.lower().replace(" ", "")
        else:
            return ''
        
for i in features:
    x[i] = x[i].apply(clean_feat)

In [18]:
x.head(1)

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,director
0,19995,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...",en,"[samworthington, zoesaldana, sigourneyweaver]",jamescameron


In [19]:
x['genres']

0       [action, adventure, fantasy, sciencefiction]
1                       [adventure, fantasy, action]
2                         [action, adventure, crime]
3                   [action, crime, drama, thriller]
4                [action, adventure, sciencefiction]
                            ...                     
4798                       [action, crime, thriller]
4799                               [comedy, romance]
4800               [comedy, drama, romance, tvmovie]
4801                                              []
4802                                   [documentary]
Name: genres, Length: 4803, dtype: object

In [20]:
x['genres']

0       [action, adventure, fantasy, sciencefiction]
1                       [adventure, fantasy, action]
2                         [action, adventure, crime]
3                   [action, crime, drama, thriller]
4                [action, adventure, sciencefiction]
                            ...                     
4798                       [action, crime, thriller]
4799                               [comedy, romance]
4800               [comedy, drama, romance, tvmovie]
4801                                              []
4802                                   [documentary]
Name: genres, Length: 4803, dtype: object

In [21]:
x['cast']

0        [samworthington, zoesaldana, sigourneyweaver]
1           [johnnydepp, orlandobloom, keiraknightley]
2            [danielcraig, christophwaltz, léaseydoux]
3            [christianbale, michaelcaine, garyoldman]
4          [taylorkitsch, lynncollins, samanthamorton]
                             ...                      
4798    [carlosgallardo, jaimedehoyos, petermarquardt]
4799         [edwardburns, kerrybishé, marshadietlein]
4800           [ericmabius, kristinbooth, crystallowe]
4801            [danielhenney, elizacoupe, billpaxton]
4802    [drewbarrymore, brianherzlinger, coreyfeldman]
Name: cast, Length: 4803, dtype: object

In [22]:
x['director']

0           jamescameron
1          goreverbinski
2              sammendes
3       christophernolan
4          andrewstanton
              ...       
4798     robertrodriguez
4799         edwardburns
4800          scottsmith
4801          danielhsia
4802     brianherzlinger
Name: director, Length: 4803, dtype: object

In [23]:
# aa = [{'credit_id': '54805967c3a36829b5002c41',
#   'department': 'Sound',
#   'gender': 2,
#   'id': 153,
#   'job': 'Original Music Composer',
#   'name': 'Thomas Newman'},
#  {'credit_id': '52fe4d22c3a368484e1d8d77',
#   'department': 'Directing',
#   'gender': 2,
#   'id': 39,
#   'job': 'Director',
#   'name': 'Sam Mendes'}]

In [24]:
# for x in aa:
#     print(x['job'])
#     print(type(x))
#     print('==')

## 處理遺漏值

In [25]:
missing = x.columns[x.isnull().any()]

In [26]:
#使用isnull方法計算每個欄位的缺值
#並且用sum方法來做每個欄位總缺值數量的計算
#此方式與info相反，info計算無缺值的數目，此方法計算缺值數目
x.isnull()

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,director
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
4798,False,False,False,False,False,False,False,False
4799,False,False,False,False,False,False,False,False
4800,False,False,False,False,False,False,False,False
4801,False,False,False,False,False,False,False,False


In [27]:
x.isnull().sum().to_frame()

Unnamed: 0,0
id,0
original_title,0
genres,0
keywords,0
overview,3
original_language,0
cast,0
director,0


In [28]:
x['overview'] = x['overview'].fillna('')
# 用空字串''填補遺漏值
# https://ithelp.ithome.com.tw/articles/10201106

In [29]:
x.isnull().sum().to_frame()

Unnamed: 0,0
id,0
original_title,0
genres,0
keywords,0
overview,0
original_language,0
cast,0
director,0


# 二. Rake Algorithm 提取關鍵字

In [30]:
# 創建一個空的plotwords欄位 
x['plotwords'] = ''

In [31]:
def get_keywords(x):
    plot = x
    rake = Rake()
    rake.extract_keywords_from_text(plot)
    scores = rake.get_word_degrees()
    return(list(scores.keys()))

x['plotwords'] = x['overview'].apply(get_keywords)

In [32]:
x['plotwords']

0       [moon, pandora, paraplegic, marine, following,...
1       [headed, quite, long, believed, come, back, el...
2       [secret, service, alive, layers, bond, ’, unco...
3       [batman, assumes, responsibility, encounters, ...
4       [epic, conflict, former, military, captain, su...
                              ...                        
4798    [carry, chase, el, mariachi, town, carries, gu...
4799    [newlywed, couple, honeymoon, upended, respect...
4800    [delivered, mail, detectives, determination, d...
4801    [save, career, legal, mess, help, wonders, bea...
4802    [drew, barrymore, old, aspiring, filmmaker, as...
Name: plotwords, Length: 4803, dtype: object

In [33]:
# scores 的樣子
# scores.keys() => moon pandora alien civilization ... mission

# {'dispatched': 1,
#              'moon': 2,
#              'pandora': 2,
#              'alien': 2,
#              'civilization': 2,
#              'paraplegic': 2,
#              'marine': 2,
#              '22nd': 2,
#              'century': 2,
#              'following': 2,
#              'orders': 2,
#              'becomes': 2,
#              'torn': 2,
#              'protecting': 1,
#              'unique': 2,
#              'mission': 2})


# plot = df['overview'][0]
# rake = Rake()
# rake.extract_keywords_from_text(plot)
# scores = rake.get_word_degrees()
# scores
# scores.keys()
# scores.values()

# https://ithelp.ithome.com.tw/articles/10205081 字典用法

In [34]:
x.head()

Unnamed: 0,id,original_title,genres,keywords,overview,original_language,cast,director,plotwords
0,19995,Avatar,"[action, adventure, fantasy, sciencefiction]","[cultureclash, future, spacewar, spacecolony, ...","In the 22nd century, a paraplegic Marine is di...",en,"[samworthington, zoesaldana, sigourneyweaver]",jamescameron,"[moon, pandora, paraplegic, marine, following,..."
1,285,Pirates of the Caribbean: At World's End,"[adventure, fantasy, action]","[ocean, drugabuse, exoticisland, eastindiatrad...","Captain Barbossa, long believed to be dead, ha...",en,"[johnnydepp, orlandobloom, keiraknightley]",goreverbinski,"[headed, quite, long, believed, come, back, el..."
2,206647,Spectre,"[action, adventure, crime]","[spy, basedonnovel, secretagent, sequel, mi6]",A cryptic message from Bond’s past sends him o...,en,"[danielcraig, christophwaltz, léaseydoux]",sammendes,"[secret, service, alive, layers, bond, ’, unco..."
3,49026,The Dark Knight Rises,"[action, crime, drama, thriller]","[dccomics, crimefighter, terrorist, secretiden...",Following the death of District Attorney Harve...,en,"[christianbale, michaelcaine, garyoldman]",christophernolan,"[batman, assumes, responsibility, encounters, ..."
4,49529,John Carter,"[action, adventure, sciencefiction]","[basedonnovel, mars, medallion, spacetravel, p...","John Carter is a war-weary, former military ca...",en,"[taylorkitsch, lynncollins, samanthamorton]",andrewstanton,"[epic, conflict, former, military, captain, su..."


## 合併特徵 join()
### https://www.gushiciku.cn/pl/plIa/zh-tw


In [35]:
# 建立一個空的資料表
df_keys = pd.DataFrame()

In [36]:
df_keys['title'] = x['original_title']

In [37]:
df_keys.head(1)

Unnamed: 0,title
0,Avatar


In [38]:
df_keys['keywords'] = ''

In [39]:
df_keys.head(1)

Unnamed: 0,title,keywords
0,Avatar,


In [40]:
def bag_words(x):
    return(' '.join(x['genres']) + ' '+ ' '.join(x['keywords']) 
           + ' '+' '.join(x['cast'])+ ' '+' '.join(x['director'])
           +' '+' '.join(x['plotwords']))
df_keys['keywords'] = x.apply(bag_words, axis = 1)

In [41]:
df_keys.head(5)

Unnamed: 0,title,keywords
0,Avatar,action adventure fantasy sciencefiction cultur...
1,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,Spectre,action adventure crime spy basedonnovel secret...
3,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,John Carter,action adventure sciencefiction basedonnovel m...


In [42]:
'''
 推薦模型只能讀取並比較向量，
 因此我們需要將 Bag_of_words 列使用 CountVectorizer 轉換為向量表示，
 這是一個簡單的頻率計數器。一旦我們得到了所有單詞的計數矩陣， 
 就可以利用 cosine_similarity 函式來比較電影的相似性。
'''

'\n 推薦模型只能讀取並比較向量，\n 因此我們需要將 Bag_of_words 列使用 CountVectorizer 轉換為向量表示，\n 這是一個簡單的頻率計數器。一旦我們得到了所有單詞的計數矩陣， \n 就可以利用 cosine_similarity 函式來比較電影的相似性。\n'

In [43]:
# create count matrix 
# 將文件的詞語轉換為詞頻矩陣
cv = CountVectorizer()

In [44]:
cv

CountVectorizer()

In [45]:
cv_mx = cv.fit_transform(df_keys['keywords'])

In [46]:
cv_mx

<4803x29762 sparse matrix of type '<class 'numpy.int64'>'
	with 178305 stored elements in Compressed Sparse Row format>

In [47]:
# create cosine similarity matrix
cosine_sim = cosine_similarity(cv_mx, cv_mx)

In [48]:
cosine_sim[0]

array([1.        , 0.10527936, 0.06299408, ..., 0.        , 0.        ,
       0.        ])

In [49]:
cosine_sim

array([[1.        , 0.10527936, 0.06299408, ..., 0.        , 0.        ,
        0.        ],
       [0.10527936, 1.        , 0.09284767, ..., 0.02438299, 0.        ,
        0.        ],
       [0.06299408, 0.09284767, 1.        , ..., 0.02188441, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02438299, 0.02188441, ..., 1.        , 0.05938557,
        0.03959038],
       [0.        , 0.        , 0.        , ..., 0.05938557, 1.        ,
        0.06818182],
       [0.        , 0.        , 0.        , ..., 0.03959038, 0.06818182,
        1.        ]])

In [50]:
# 把array轉換成dataframe
cosine_sim_df = pd.DataFrame(cosine_sim)
cosine_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802
0,1.000000,0.105279,0.062994,0.052414,0.116642,0.118056,0.024001,0.154303,0.070186,0.078621,...,0.000000,0.000000,0.023440,0.055728,0.000000,0.029514,0.000000,0.000000,0.000000,0.000000
1,0.105279,1.000000,0.092848,0.025751,0.085960,0.145004,0.023583,0.126350,0.068966,0.077254,...,0.000000,0.000000,0.023033,0.000000,0.000000,0.029001,0.000000,0.024383,0.000000,0.000000
2,0.062994,0.092848,1.000000,0.046225,0.077152,0.052058,0.021167,0.136083,0.061898,0.046225,...,0.034752,0.000000,0.000000,0.000000,0.022076,0.052058,0.000000,0.021884,0.000000,0.000000
3,0.052414,0.025751,0.046225,1.000000,0.042796,0.043315,0.052835,0.056614,0.025751,0.115385,...,0.028916,0.025751,0.017201,0.040893,0.018368,0.064972,0.000000,0.054627,0.041812,0.062718
4,0.116642,0.085960,0.077152,0.042796,1.000000,0.048196,0.039193,0.146986,0.028653,0.085592,...,0.032174,0.000000,0.000000,0.022751,0.000000,0.024098,0.000000,0.020261,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.029514,0.029001,0.052058,0.064972,0.024098,0.024390,0.039668,0.063758,0.058001,0.043315,...,0.000000,0.029001,0.000000,0.023027,0.041371,1.000000,0.000000,0.000000,0.023544,0.000000
4799,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.053606,0.000000,0.000000,0.000000,0.000000,1.000000,0.075810,0.043519,0.000000
4800,0.000000,0.024383,0.021884,0.054627,0.020261,0.020507,0.000000,0.000000,0.048766,0.036418,...,0.054759,0.024383,0.032573,0.038720,0.000000,0.000000,0.075810,1.000000,0.059386,0.039590
4801,0.000000,0.000000,0.000000,0.041812,0.000000,0.023544,0.019146,0.000000,0.055989,0.020906,...,0.000000,0.000000,0.000000,0.022228,0.000000,0.023544,0.043519,0.059386,1.000000,0.068182


## 三. recommend_movie

In [87]:
def recommend_movie(title,n = 10,cosine_sim = cosine_sim):
    movies = []
    if title not in indices.index:
        print('the movie not in our database')
        return
    else:
        idx = indices[title] #84
        
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_n_idx = list(scores.iloc[1:n].index)        
    return df_keys['title'].iloc[top_n_idx]

## 拆解recommend_movie

In [91]:
# n 預設是10部
# 找出Ocean's Twelve的前5部最相似電影
recommend_movie("Ocean's Twelve", n = 5)

433               RED 2
388      Ocean's Eleven
519      Now You See Me
398    Ocean's Thirteen
Name: title, dtype: object

In [51]:
# create list of indices for later matching
indices = pd.Series(df_keys.index, index = df_keys['title'])
indices

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [52]:
name = 'Avatar'
if name not in indices.index:
    print('movie not in database')
else:
    i = indices[name]
    print(i)

0


In [53]:
scores = pd.Series(cosine_sim[i]).sort_values(ascending = False)

In [54]:
scores

0       1.000000
4401    0.218218
466     0.210559
495     0.188982
2995    0.188982
          ...   
2459    0.000000
2460    0.000000
2461    0.000000
2462    0.000000
4802    0.000000
Length: 4803, dtype: float64

In [55]:
# list(scores.iloc[0:5].index)

In [56]:
# 第0個是自己 所以從1開始取
top_n_idx = list(scores.iloc[1:5].index)
# df_keys['title'].iloc[top_n_idx] # top_n_idx是一個陣列

In [57]:
df_keys['title'].iloc[top_n_idx]

4401                 The Helix... Loaded
466                     The Time Machine
495     Journey 2: The Mysterious Island
2995          Mad Max Beyond Thunderdome
Name: title, dtype: object