In [25]:
import sys, os
import urllib.request
import pandas as pd
import numpy as np
import tmdbsimple as tmdb
from IPython.display import display, HTML, Image


# Step 1: Download rating data

## 1.1 Download rating data

In [2]:
if "data" not in os.listdir():
    os.mkdir("data")
    os.mkdir("data/posters")
    print("Create new folder to save data")

In [3]:
"""movie rating data: Only run it one time""" 
ratingDataUrl = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

In [4]:
os.listdir("data")

['.DS_Store', 'ml-latest-small.zip', 'ml-latest-small', 'poster']

In [5]:
if "ml-latest-small.zip" not in os.listdir("data"):
    urllib.request.urlretrieve(ratingDataUrl,"data/ml-latest-small.zip")
    os.system("unzip -a -n data/ml-latest-small.zip -d data/")

## 1.2 Read rating data

In [6]:
rating_df = pd.read_csv('data/ml-latest-small/ratings.csv')
linkes_df = pd.read_csv('data/ml-latest-small/links.csv')

In [7]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
linkes_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
df_merged = pd.merge(rating_df,linkes_df,on=['movieId'])
df_merged.dropna(how="any",inplace=True)
df_merged[['tmdbId']] = df_merged[['tmdbId']].astype(int)

# df_merged['tmdbId'] = [int(x) for x in df_merged.tmdbId]
print(df_merged.head())
print(df_merged.shape)

   userId  movieId  rating   timestamp  imdbId  tmdbId
0       1        1     4.0   964982703  114709     862
1       5        1     4.0   847434962  114709     862
2       7        1     4.5  1106635946  114709     862
3      15        1     2.5  1510577970  114709     862
4      17        1     4.5  1305696483  114709     862
(100823, 6)


In [31]:
df_merged.to_csv('data/rating.csv')

# Step 2: Download movie poster data <br>

#### Data resource link: https://developers.themoviedb.org/3/getting-started/images
##### Egs
- https://image.tmdb.org/t/p/original/wwemzKWzjKYJFfCeiB57q3r4Bcm.svg
- https://image.tmdb.org/t/p/original/wwemzKWzjKYJFfCeiB57q3r4Bcm.png
- https://image.tmdb.org/t/p/w500/wwemzKWzjKYJFfCeiB57q3r4Bcm.png

## 2.1 TMDB Config

In [10]:
tmdbBase = 'http://image.tmdb.org/t/p/w185/' 
key_v3 = '10f40f0d5d794e4bacb266188128a896'

## 2.1.1 TMDB API

In [11]:
id = 505

tmdb_connector = tmdb
tmdb_connector.API_KEY = key_v3

movieInfo = tmdb.Movies(id).info()
posterLink = movieInfo['poster_path']
fullLink = tmdbBase + posterLink

In [13]:
print(movieInfo)

{'adult': False, 'backdrop_path': '/v7baGyne7CsLxnM2maMwIxKZdVF.jpg', 'belongs_to_collection': None, 'budget': 20000000, 'genres': [{'id': 80, 'name': 'Crime'}, {'id': 18, 'name': 'Drama'}], 'homepage': None, 'id': 505, 'imdb_id': 'tt0097626', 'original_language': 'en', 'original_title': 'Johnny Handsome', 'overview': 'A career criminal who has been deformed since birth is given a new face by a kindly doctor and paroled from prison. It appears that he has gone straight, but he is really planning his revenge on the man who killed his mentor and sent him to prison.', 'popularity': 6.024, 'poster_path': '/fJBea43DpS6BhL9ZTBDoTNMiOG6.jpg', 'production_companies': [{'id': 276, 'logo_path': None, 'name': 'Guber/Peters Company', 'origin_country': ''}, {'id': 277, 'logo_path': None, 'name': 'Roven Productions', 'origin_country': ''}, {'id': 275, 'logo_path': '/2MxNWlN5b3UXt1OfmznsWEQzFwq.png', 'name': 'Carolco Pictures', 'origin_country': 'US'}], 'production_countries': [{'iso_3166_1': 'US', '

In [None]:
def get_poster_link(id,tmdb):
    count = 1
    while count < 3:
        try:
            tmdbBase = 'http://image.tmdb.org/t/p/w185/' 
            movieInfo = tmdb.Movies(id).info()
            posterLink = movieInfo['poster_path']
            fullLink = tmdbBase + posterLink
            return fullLink
        except:
            count += 1
    return None

In [14]:
fullLink

'http://image.tmdb.org/t/p/w185//fJBea43DpS6BhL9ZTBDoTNMiOG6.jpg'

In [15]:
get_poster_link(id=505,tmdb=tmdb)

'http://image.tmdb.org/t/p/w185//fJBea43DpS6BhL9ZTBDoTNMiOG6.jpg'

In [16]:
images = f"<img style='width: 100px; margin: 0px; float: left; border: 1px solid black;' src='{fullLink}' />" 

display(HTML(images))

## 2.2 Download poster data

### 2.2.1 Create API

In [17]:
urllib.request.urlretrieve(fullLink, "data/poster/{}.jpg".format(id))

('data/poster/505.jpg', <http.client.HTTPMessage at 0x113790630>)

In [18]:
def scrape_poster(id,tmdb,target_folder="data/poster/"):
    posterLink = get_poster_link(id,tmdb)
    if posterLink:
        urllib.request.urlretrieve(fullLink, f"{target_folder}{id}.jpg")
        print(f'Poster {id} successfully downloaded')
    else:
        print(f"Unable to scrape data for poster :{id}")

In [19]:
scrape_poster(505,tmdb)

Poster 505 successfully downloaded


### 2.2.2 Download all posters in df

In [20]:
TMDBIds = df_merged.tmdbId.unique()
print(len(TMDBIds))

9715


In [21]:
"""Loop version"""
links_loop = []
for Id in TMDBIds:
    tmp = scrape_poster(id=Id,tmdb=tmdb)
    if tmp:    
        links_loop.append(tmp)

In [22]:
"""Map version"""
links_map = list(map(lambda x:scrape_poster(id=x,tmdb=tmdb),TMDBIds))

'Map version'

In [23]:
"""Multiprocess version"""
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
workers = 6
with ProcessPoolExecutor(max_workers=workers) as executor:
    links_multiprocess = executor.map(scrape_poster, TMDBIds,[tmbd]*len(TMDBIds))        		

'Multiprocess version'

In [28]:
"""Check number of posters downloaded"""
poster_folder = os.listdir('data/poster/')
print(f"Total number of posters: {len(np.unique(poster_folder))}")
poster_folder[:5]

Total number of posters: 6209


['63.jpg', '823.jpg', '2666.jpg', '10447.jpg', '2100.jpg']

# Step 3: Modulize


In [None]:
class scraper:
    def __init__(self,ratingLink=None):
        tmdbBase = 'http://image.tmdb.org/t/p/w185/' 
        key_v3 = '10f40f0d5d794e4bacb266188128a896'
        
        tmdb_connector = tmdb
        tmdb_connector.API_KEY = key_v3
        self.tmdb = tmdb
        
        ratingLink = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" if not ratingLink else ratingLink
        if "data" not in os.listdir():
            os.mkdir("data")
            os.mkdir("data/posters")
            print("Create new folder to save data")
        
    def download_rating(self):
        if "ml-latest-small.zip" not in os.listdir("data"):
            urllib.request.urlretrieve(self.ratingLink,"data/ml-latest-small.zip")
            os.system("unzip -a -n data/ml-latest-small.zip -d data/")
    
    def proces_rating(self):
        rating_df = pd.read_csv('data/ml-latest-small/ratings.csv')
        linkes_df = pd.read_csv('data/ml-latest-small/links.csv')
        df_merged = pd.merge(rating_df,linkes_df,on=['movieId'])
        df_merged.dropna(how="any",inplace=True)
        df_merged[['tmdbId']] = df_merged[['tmdbId']].astype(int)
        self.rating = df_merged
        return df_merged
    
    def download_posters(self,method="map",target_folder="data/poster/"):
        TMDBIds = self.rating.tmdbId.unique()
        
        if method == "map":
            list(map(lambda x:scrape_poster(id=x,target_folder=target_folder),TMDBIds))
        elif method == "multiprocess":
            workers = 6
            with ProcessPoolExecutor(max_workers=workers) as executor:
                links_multiprocess = executor.map(scrape_poster, TMDBIds)     
        elif method == 'loop':   
            for Id in TMDBIds:
                scrape_poster(id=Id)
        else:
            print("Must select a method")
            
    def get_poster_link(self,id):
        count = 1
        while count < 3:
            try:
                tmdbBase = 'http://image.tmdb.org/t/p/w185/' 
                movieInfo = self.tmdb.Movies(id).info()
                posterLink = movieInfo['poster_path']
                fullLink = tmdbBase + posterLink
                return fullLink
            except:
                count += 1
        return None
    def scrape_poster(self,id,target_folder="data/poster/"):
        posterLink = self.get_poster_link(id)
        if posterLink:
            urllib.request.urlretrieve(fullLink, f"{target_folder}{id}.jpg")
            print(f'Poster {id} successfully downloaded')
        else:
            print(f"Unable to scrape data for poster :{id}")