In [4]:
import gzip
import json
import pandas as pd
import requests
from io import BytesIO
#抓取今年的file
def make_category_id_url_suffix(category, extension='json'):
    year = str(pd.datetime.today().year)
    month = str(pd.datetime.today().month).zfill(2)
    day = str(pd.datetime.today().day - 1).zfill(2)
    return '_'.join([category, 'ids', month, day, year]) + '.' + extension

#Download ID list
def download_id_list_as_csv(category):
    
    print(f'Downloading list of ids for {category}')
    id_list_name = make_category_id_url_suffix(category)
    ID_LISTS_RAW_URL = 'http://files.tmdb.org/p/exports/{0}.gz'.format(id_list_name)
    with gzip.open(BytesIO(requests.get(ID_LISTS_RAW_URL).content), 'r') as f_open:
        id_list = f_open.readlines()
    # original 'json' is malformed, is actually one dict per line
    ids = pd.DataFrame([json.loads(x) for x in id_list])
    # some entries in the movie id list appear to be collections rather than movies
    if 'original_title' in ids.columns:
        ids.original_title = ids.original_title.apply(str)
        ids = ids[~ids.original_title.str.endswith(' Collection')].copy()
    # You have to drop adult films if you want to post any new data to Kaggle.
    if 'adult' in ids.columns:
        ids = ids[~ids['adult']].copy()
    ids.to_csv(category + '_ids.csv', index=False)

In [5]:
download_id_list_as_csv('movie')

Downloading list of ids for movie


  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [6]:
import requests
import pandas as pd
import re
import json
import csv

#Read all tmdbID
df = pd.read_csv('movie_ids.csv')

with open("movie.csv",mode="w") as fi:
    writer = csv.writer(fi, delimiter=',')
    writer.writerow(['index','movie_id', 'movie_name', 'genres','overview','original_language'])
    num = 1
    #Write Data to csv
    for value in df['id']:
        #for i,value in enumerate(df['id'])
        if num >500:
            break
        url = 'https://api.themoviedb.org/3/movie/'+str(value)+'?api_key=6dfbbbfc10aa0e69930a9f512c59b66d&language=zh-TW&append_to_response=credits,keywords'
        r = requests.get(url)
        response = r.text
        data = json.loads(response)
        
        #Select zh-TW Movie Data
        if not '\u4e00' <= data['title'] <= '\u9fa5':
            continue
        else:
            num = num +1
            writer.writerow([num,data['id'], data['title'], data['genres'],re.sub('\s+','',data['overview']),data['original_language']])


In [7]:
import requests
import pandas as pd
import re
import json
import csv
df1 = pd.read_csv(u'movie.csv')
df1

Unnamed: 0,index,movie_id,movie_name,genres,overview,original_language
0,2,3,天堂孤影,"[{'id': 18, 'name': '剧情'}, {'id': 35, 'name': ...",環衛公司垃圾車駕駛員尼卡德（MattiPellonp鳵飾）的老同事計劃籌資組建自己的環衛公司...,fi
1,3,11,星際大戰四部曲：曙光乍現,"[{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...",遙遠星係發生叛亂，銀河共和國被推翻，奧爾德蘭星的莉亞公主（卡里•費甚爾CarrieFishe...,en
2,4,12,海底總動員,"[{'id': 16, 'name': '动画'}, {'id': 10751, 'name...",小丑魚馬林（Marlin）與配偶珊瑚（Coral）於大堡礁建立了家園，但一場梭子魚的攻擊使馬...,en
3,5,13,阿甘正傳,"[{'id': 35, 'name': '喜剧'}, {'id': 18, 'name': ...",二次大戰剛結束，阿甘出生在美國阿拉巴馬州的一個閉塞小鎮，他先天弱智，但上帝又賜予他一雙疾步如...,en
4,6,14,美國心玫瑰情,"[{'id': 18, 'name': '剧情'}]",故事的主人翁賴斯特罕住在一個典型的美國小鎮裡，他面臨中年危機，突然對自己的生活感到不滿：和太...,en
...,...,...,...,...,...,...
495,497,956,不可能的任務3,"[{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...",秘密特工伊森（湯姆·克魯斯TomCruise飾）隱藏了真實身份，准備和女友安心生活。這時，他...,en
496,498,957,星際歪傳,"[{'id': 35, 'name': '喜剧'}, {'id': 878, 'name':...",在很久以前的遙遠星系中，「炮彈一族」因其領袖的愚蠢，耗盡了自己星球的大氣，於是計劃奪取鄰近的...,en
497,499,961,將軍號,"[{'id': 28, 'name': '动作'}, {'id': 12, 'name': ...",故事發生在美國南北戰爭時期，火車司機約翰尼·格雷（巴斯特·基頓飾演）有兩大最愛：一個是他的女...,en
498,500,962,淘金熱,"[{'id': 12, 'name': '冒险'}, {'id': 35, 'name': ...",流浪漢查理（查理·卓別林飾）隨隊伍來到阿拉斯加淘金。他在一間小木屋裡碰見了通緝犯拉遜，接著淘...,en


In [8]:
# Extract list of genres
from ast import literal_eval

df1['genres'] = df1['genres'].apply(literal_eval)
df1['genres']

0      [{'id': 18, 'name': '剧情'}, {'id': 35, 'name': ...
1      [{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...
2      [{'id': 16, 'name': '动画'}, {'id': 10751, 'name...
3      [{'id': 35, 'name': '喜剧'}, {'id': 18, 'name': ...
4                             [{'id': 18, 'name': '剧情'}]
                             ...                        
495    [{'id': 12, 'name': '冒险'}, {'id': 28, 'name': ...
496    [{'id': 35, 'name': '喜剧'}, {'id': 878, 'name':...
497    [{'id': 28, 'name': '动作'}, {'id': 12, 'name': ...
498    [{'id': 12, 'name': '冒险'}, {'id': 35, 'name': ...
499    [{'id': 9648, 'name': '悬疑'}, {'id': 80, 'name'...
Name: genres, Length: 500, dtype: object

In [9]:
from opencc import OpenCC

def list_genres(x):
    cc = OpenCC('s2tw')
    l = [cc.convert(d['name']) for d in x]
    return(l)
df1['genres'] = df1['genres'].apply(list_genres)

df1['genres']


0                  [劇情, 喜劇]
1              [冒險, 動作, 科幻]
2                  [動畫, 家庭]
3              [喜劇, 劇情, 愛情]
4                      [劇情]
               ...         
495            [冒險, 動作, 驚悚]
496                [喜劇, 科幻]
497    [動作, 冒險, 喜劇, 劇情, 戰爭]
498            [冒險, 喜劇, 劇情]
499            [懸疑, 犯罪, 驚悚]
Name: genres, Length: 500, dtype: object

In [10]:
missing = df1.columns[df1.isnull().any()]
df1[missing].isnull().sum().to_frame()

# Replace NaN from overview with an empty string
df1['overview'] = df1['overview'].fillna('')

In [11]:
import jieba
import jieba.analyse
#split overview
#print(df1['overview'])
overviews = []
for d in df1['overview'].astype(str):
    keywords = jieba.analyse.extract_tags(d, topK=10)
    overviews.append(keywords)

df1['keyword'] = overviews

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/g9/__yhhv7d0xx59_jssnmpjhwh0000gn/T/jieba.cache
Loading model cost 0.717 seconds.
Prefix dict has been built successfully.


In [12]:
import re

df1['feature'] = ''
def bag_words(x):
    return (''.join(re.sub('[^\u4e00-\u9fa5]+', '', x['movie_name'])) + ' ' + ' '.join(x['genres']) + ' ' +  ' '.join(x['keyword']))
df1['feature'] = df1.apply(bag_words, axis = 1)

df1['feature'].head()

feature = df1[['movie_id','movie_name','feature']]
feature.to_csv('feature.csv',index=False)


In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# 將文件中的詞語轉換為詞頻矩陣
cv = CountVectorizer()
# 計算個詞語出現的次數
cv_mx = cv.fit_transform(df1['feature'])

# create cosine similarity matrix
cosine_sim = cosine_similarity(cv_mx, cv_mx)
print(cosine_sim)

[[1.         0.         0.         ... 0.12403473 0.14824986 0.        ]
 [0.         1.         0.         ... 0.11952286 0.07142857 0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.12403473 0.11952286 0.         ... 1.         0.17928429 0.05976143]
 [0.14824986 0.07142857 0.         ... 0.17928429 1.         0.        ]
 [0.         0.         0.         ... 0.05976143 0.         1.        ]]


In [14]:
# create list of indices for later matching
indices = pd.Series(df1.index, index = df1['movie_name'])

In [15]:
def recommend_movie(title, n = 10, cosine_sim = cosine_sim):
    movies = []
    
    # 檢索匹配的 movie_name index
    if title not in indices.index:
        print("Movie not in database.")
        return
    else:
        idx = indices[title]
    
    # 電影的餘弦相似度分數降序排列
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    
    # 前 n 個最相似的 movies indexes
    # 使用 1:n 因為 0 是輸入的同一部電影
    top_n_idx = list(scores.iloc[1:n].index)
        
    #return result
    print(df1['movie_name'].iloc[top_n_idx])
    #ans = df1['movie_name'].iloc[top_n_idx]
    #ans.to_csv('result.csv',index = False)

In [16]:
recommend_movie('玩具總動員',5)

447    玩具總動員2
490    冰原歷險記2
492     馬達加斯加
220     冰原歷險記
Name: movie_name, dtype: object
