In [34]:
import numpy as np
import pandas as pd
import requests, zipfile, io, os
import json, glob, re, tmdb
from importlib import reload
from utils import create_directory, update_progress
import time

In [2]:
tmdb_api_key = "a67ce78b9ac0e0f7df8c2fc16c2fd30a"

In [3]:
zip_file_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

In [4]:
print("Downloading movielens data...")
r = requests.get(zip_file_url, stream=True)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()
print("Done.")

Downloading movielens data...
Done.


In [5]:
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

In [6]:
users.shape

(943, 5)

In [7]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [8]:
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"
]
items_cols = ['movie_id', 'title', 'release_date', "video_release_date", "imdb_url"] + genre_cols
items_raw = pd.read_csv('ml-100k/u.item', sep='|', names=items_cols, encoding='latin-1')

In [9]:
items_raw.shape

(1682, 24)

In [10]:
items_raw.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,genre_unknown,Action,Adventure,Animation,Children,...,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

In [12]:
ratings.shape

(100000, 4)

In [13]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [54]:
create_directory("/data")

Directory already exists d:\Work\Development\recommendation_system/data


In [15]:
users.to_csv("data/users.csv", index=None)
items_raw.to_csv("data/items_raw.csv", index=None)
ratings.to_csv("data/ratings.csv", index=False)

In [None]:
links = pd.read_csv("data/links.csv", dtype={"tmdb_id": 'Int64' })
links.head()

Unnamed: 0,movie_id,imdb_id,tmdb_id
0,1,tt0114709,862
1,2,tt0113189,710
2,3,tt0113101,5
3,4,tt0113161,8012
4,5,tt0112722,1710


In [17]:
links.shape

(1682, 3)

In [22]:
sample_id = links.tmdb_id[1]

In [None]:
movie_response = requests.get(f"https://api.themoviedb.org/3/movie/{sample_id}?api_key={tmdb_api_key}")
movie_response.text

'{"adult":false,"backdrop_path":"/vm7yMg7YiYisMSeO3mUC5NUKbSB.jpg","belongs_to_collection":{"id":645,"name":"James Bond Collection","poster_path":"/ofwSiqOFShhunAIYYdSMHMJQSx2.jpg","backdrop_path":"/dOSECZImeyZldoq0ObieBE0lwie.jpg"},"budget":60000000,"genres":[{"id":12,"name":"Adventure"},{"id":28,"name":"Action"},{"id":53,"name":"Thriller"}],"homepage":"https://mgm.com/movies/goldeneye","id":710,"imdb_id":"tt0113189","origin_country":["GB"],"original_language":"en","original_title":"GoldenEye","overview":"When a powerful satellite system falls into the hands of Alec Trevelyan, AKA Agent 006, a former ally-turned-enemy, only James Bond can save the world from a dangerous space weapon that -- in one short pulse -- could destroy the earth! As Bond squares off against his former compatriot, he also battles Xenia Onatopp, an assassin who uses pleasure as her ultimate weapon.","popularity":7.3544,"poster_path":"/z0ljRnNxIO7CRBhLEO0DvLgAFPR.jpg","production_companies":[{"id":7576,"logo_path"

In [24]:
tmdb_ids = links.tmdb_id.tolist()

In [None]:
def get_movie_features(tmdb_ids):
    num = len(tmdb_ids)
    tick = 0
    features = []
    for i in tmdb_ids:
        # feature = tmdb.movie(i)
        feature = requests.get(f"https://api.themoviedb.org/3/movie/{i}?api_key={tmdb_api_key}")

        if feature != False: 
            features.append(feature.text)
        
        tick = tick + 1
        update_progress(tick / num)
    
    update_progress(1)
    return features

In [47]:
def save_movie_features(features):
    outfile = "data/features.json"
    with open(outfile, 'w') as fout:
            json.dump(features, fout)

In [48]:
features = get_movie_features(tmdb_ids)
save_movie_features(features)

Progress: [----------------------------------------] 0.1%


SSLError: HTTPSConnectionPool(host='api.themoviedb.org', port=443): Max retries exceeded with url: /3/movie/710?api_key=a67ce78b9ac0e0f7df8c2fc16c2fd30a (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)')))

In [50]:
len(features)

1682

In [51]:
text_features = []

for i in features:
  text_features.append(i.text)

In [52]:
text_features[0]

'{"adult":false,"backdrop_path":"/3Rfvhy1Nl6sSGJwyjb0QiZzZYlB.jpg","belongs_to_collection":{"id":10194,"name":"Toy Story Collection","poster_path":"/rki5qLuwb0xnnE9seehxO9TlLhW.jpg","backdrop_path":"/hApclyB9NEZEQujAVajzi5iWE4a.jpg"},"budget":30000000,"genres":[{"id":16,"name":"Animation"},{"id":12,"name":"Adventure"},{"id":10751,"name":"Family"},{"id":35,"name":"Comedy"}],"homepage":"http://toystory.disney.com/toy-story","id":862,"imdb_id":"tt0114709","origin_country":["US"],"original_language":"en","original_title":"Toy Story","overview":"Led by Woody, Andy\'s toys live happily in his room until Andy\'s birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy\'s heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.","popularity":31.5411,"poster_path":"/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg","production_companies":[{"id":3,"logo_path":"/1TjvGVDMYsj6JBxOAkUHpPEwLf7.p

In [53]:
save_movie_features(text_features)