In [20]:
import gzip
import json
import pandas as pd
import requests
from io import BytesIO
#抓取今年的file
def make_category_id_url_suffix(category, extension='json'):
    year = str(pd.datetime.today().year)
    month = str(pd.datetime.today().month).zfill(2)
    day = str(pd.datetime.today().day - 1).zfill(2)
    return '_'.join([category, 'ids', month, day, year]) + '.' + extension

#Download ID list
def download_id_list_as_csv(category):
    
    print(f'Downloading list of ids for {category}')
    id_list_name = make_category_id_url_suffix(category)
    ID_LISTS_RAW_URL = 'http://files.tmdb.org/p/exports/{0}.gz'.format(id_list_name)
    with gzip.open(BytesIO(requests.get(ID_LISTS_RAW_URL).content), 'r') as f_open:
        id_list = f_open.readlines()
    # original 'json' is malformed, is actually one dict per line
    ids = pd.DataFrame([json.loads(x) for x in id_list])
    # some entries in the movie id list appear to be collections rather than movies
    if 'original_title' in ids.columns:
        ids.original_title = ids.original_title.apply(str)
        ids = ids[~ids.original_title.str.endswith(' Collection')].copy()
    # You have to drop adult films if you want to post any new data to Kaggle.
    if 'adult' in ids.columns:
        ids = ids[~ids['adult']].copy()
    ids.to_csv(category + '_ids.csv', index=False)

In [21]:
download_id_list_as_csv('movie')

Downloading list of ids for movie


  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [22]:
import requests
import pandas as pd
import re
import json
import csv
#Read all tmdbID
df = pd.read_csv('movie_ids.csv')

with open("movie.csv",mode="w") as fi:
    writer = csv.writer(fi, delimiter=',')
    writer.writerow(['index','movie_id', 'movie_name', 'genres','overview','original_language','cast','crew'])
    num = 0
    #Write Data to csv
    for value in df['id']:
        #for i,value in enumerate(df['id'])
        if num >200:
            break
        url = 'https://api.themoviedb.org/3/movie/'+str(value)+'?api_key=6dfbbbfc10aa0e69930a9f512c59b66d&language=zh-TW&append_to_response=credits,keywords'
        r = requests.get(url)
        response = r.text
        data = json.loads(response)
        #Select zh-TW Movie Data
        if not '\u4e00' <= data['title'] <= '\u9fa5':
            continue
        else:
            num = num +1
            writer.writerow([num,data['id'], data['title'], data['genres'],re.sub('\s+','',data['overview']),data['original_language'],data['credits']['cast'],data['credits']['crew']])


In [23]:
import requests
import pandas as pd
import re
import json
import csv
df1 = pd.read_csv(u'movie.csv')
df1.head()

Unnamed: 0,index,movie_id,movie_name,genres,overview,original_language,cast,crew
0,1,3,天堂孤影,"[{'id': 18, 'name': '剧情'}, {'id': 35, 'name': ...",環衛公司垃圾車駕駛員尼卡德（MattiPellonp鳵飾）的老同事計劃籌資組建自己的環衛公司...,fi,"[{'adult': False, 'gender': 2, 'id': 4826, 'kn...","[{'adult': False, 'gender': 2, 'id': 16767, 'k..."
1,2,11,星際大戰四部曲：曙光乍現,"[{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...",遙遠星係發生叛亂，銀河共和國被推翻，奧爾德蘭星的莉亞公主（卡里•費甚爾CarrieFishe...,en,"[{'adult': False, 'gender': 2, 'id': 2, 'known...","[{'adult': False, 'gender': 2, 'id': 1, 'known..."
2,3,12,海底總動員,"[{'id': 16, 'name': '动画'}, {'id': 10751, 'name...",小丑魚馬林（Marlin）與配偶珊瑚（Coral）於大堡礁建立了家園，但一場梭子魚的攻擊使馬...,en,"[{'adult': False, 'gender': 2, 'id': 13, 'know...","[{'adult': False, 'gender': 2, 'id': 7, 'known..."
3,4,13,阿甘正傳,"[{'id': 35, 'name': '喜剧'}, {'id': 18, 'name': ...",二次大戰剛結束，阿甘出生在美國阿拉巴馬州的一個閉塞小鎮，他先天弱智，但上帝又賜予他一雙疾步如...,en,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 37, 'know..."
4,5,14,美國心玫瑰情,"[{'id': 18, 'name': '剧情'}]",故事的主人翁賴斯特罕住在一個典型的美國小鎮裡，他面臨中年危機，突然對自己的生活感到不滿：和太...,en,"[{'adult': False, 'gender': 2, 'id': 1979, 'kn...","[{'adult': False, 'gender': 2, 'id': 153, 'kno..."


In [24]:
# Extract list of genres
from ast import literal_eval

df1['genres'] = df1['genres'].apply(literal_eval)
df1['genres'].head()

0    [{'id': 18, 'name': '剧情'}, {'id': 35, 'name': ...
1    [{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...
2    [{'id': 16, 'name': '动画'}, {'id': 10751, 'name...
3    [{'id': 35, 'name': '喜剧'}, {'id': 18, 'name': ...
4                           [{'id': 18, 'name': '剧情'}]
Name: genres, dtype: object

In [25]:
from opencc import OpenCC

def list_genres(x):
    cc = OpenCC('s2tw')
    l = [cc.convert(d['name']) for d in x]
    return(l)
df1['genres'] = df1['genres'].apply(list_genres)

df1['genres'].head()


0        [劇情, 喜劇]
1    [冒險, 動作, 科幻]
2        [動畫, 家庭]
3    [喜劇, 劇情, 愛情]
4            [劇情]
Name: genres, dtype: object

In [26]:
# Extract top 3 cast members
#from translate import Translator
df1['cast'] = df1['cast'].apply(literal_eval)
#translator= Translator(to_lang="chinese")

def list_cast(x):
    l = [d['name'] for d in x]
    if len(l) > 3:
        l = l[:1]
    return(l)
df1['cast'] = df1['cast'].apply(list_cast)
#df1['cast'].head()

In [27]:
# Extract director
import numpy as np
df1['crew'] = df1['crew'].apply(literal_eval)

def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
df1['director'] = df1['crew'].apply(get_director)


# Drop the now unnecessary crew feature
df1 = df1.drop('crew', axis = 1)


In [28]:
missing = df1.columns[df1.isnull().any()]
df1[missing].isnull().sum().to_frame()

# Replace NaN from overview with an empty string
df1['overview'] = df1['overview'].fillna('')

In [29]:
import jieba
import jieba.analyse
#split overview
#print(df1['overview'])
jieba.set_dictionary('./dict.txt')
overviews = []
for d in df1['overview'].astype(str):
    keywords = jieba.analyse.extract_tags(d, topK=6)
    overviews.append(keywords)

df1['keyword'] = overviews

Building prefix dict from /Users/zhangxinyu/nlp-pro/dict.txt ...
Dumping model to file cache /var/folders/g9/__yhhv7d0xx59_jssnmpjhwh0000gn/T/jieba.ufb30e0fe4a3317c83cabe51d90c31569.cache
Loading model cost 0.751 seconds.
Prefix dict has been built successfully.


In [30]:
import re

df1['feature'] = ''
def bag_words(x):
    return (''.join(re.sub('[^\u4e00-\u9fa5]+', '', x['movie_name'])) + ' ' + ' '.join(x['genres']) + ' ' +  ' '.join(x['keyword'])+ ' ' +  ' '.join(x['cast'])+ ' ' +  ''.join(x['director']))
df1['feature'] = df1.apply(bag_words, axis = 1)

#df1['feature'].head()

feature = df1[['movie_id','movie_name','genres','cast','director','feature']]
feature.to_csv('feature.csv',index=False)


In [31]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# 將文件中的詞語轉換為詞頻矩陣
cv = CountVectorizer()
# 計算個詞語出現的次數
cv_mx = cv.fit_transform(df1['feature'])

# create cosine similarity matrix
cosine_sim = cosine_similarity(cv_mx, cv_mx)
print(cosine_sim)

[[1.         0.         0.         ... 0.08006408 0.07692308 0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.08006408 0.         0.         ... 1.         0.08006408 0.        ]
 [0.07692308 0.         0.         ... 0.08006408 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [32]:
# create list of indices for later matching
indices = pd.Series(df1.index, index = df1['movie_name'])

In [33]:
def recommend_movie(title, n = 10, cosine_sim = cosine_sim):
    movies = []
    
    # 檢索匹配的 movie_name index
    if title not in indices.index:
        print("Movie not in database.")
        return
    else:
        idx = indices[title]
    
    # 電影的餘弦相似度分數降序排列
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    # 前 n 個最相似的 movies indexes
    # 使用 1:n 因為 0 是輸入的同一部電影
    top_n_idx = list(scores.iloc[1:n].index)
        
    #return result
    print(df1['movie_name'].iloc[top_n_idx])
    #ans = df1['movie_name'].iloc[top_n_idx]
    #ans.to_csv('result.csv',index = False)

In [34]:
recommend_movie('星艦奇航記',8)

95      星艦奇航記5：終極先鋒
83     星艦奇航記2：星戰大怒吼
92      星艦奇航記4：搶救未來
96      星艦奇航記6：邁入未來
85      星艦奇航記3：石破天驚
108     星艦奇航記7：日換星移
114     星艦奇航記9：星際叛變
Name: movie_name, dtype: object
