In [1]:
import os
import csv
import time
import shutil
import requests
import pandas as pd
from splinter import Browser

In [2]:
#review data: 2+ million rows (reviews) spread over 8000 movies
movie_filtered = pd.read_csv("resources/movie_filtered.csv")

#one entry per distinct movie id found in the reviews
movieIds_small = movie_filtered.loc[:, "movieId"].drop_duplicates()

In [63]:
#lookup table with 50,000+ movie ids; rows with a missing value in any column are discarded
movies = pd.read_csv("resources/links.csv").dropna()

#shows the first rows of the dataframe
movies.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [64]:
#keeps only the rows of "movies" whose movieId is one of the 8000 reviewed movies,
#which yields each of those movieIds with its imdbId and tmdbId
movieIds_small_df = movieIds_small.to_frame().merge(movies, on="movieId")

#shows the first rows of the dataframe
movieIds_small_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,307,108394,108.0
1,481,107302,10909.0
2,1091,98627,8491.0
3,1257,88794,13667.0
4,1449,118111,16448.0


In [None]:
#posters already scraped: sorted list of the movie ids that have a "<movieId>.jpg"
#file in the "images" folder. Anything that is not named "<digits>.jpg"
#(e.g. Thumbs.db, .DS_Store, partial downloads) is skipped instead of
#crashing the int() conversion.
saved_posters = sorted(
    int(poster[:-len(".jpg")])
    for poster in os.listdir("static/images")
    if poster.endswith(".jpg") and poster[:-len(".jpg")].isdecimal()
)

#previews list of saved movie ids
saved_posters

In [60]:
#rows whose movieId has no matching file name in the "images" folder yet
missing_posters = movieIds_small_df.loc[
    lambda frame: ~frame["movieId"].isin(saved_posters)
]

#shows the movies that still need a poster
missing_posters

Unnamed: 0,movieId,imdbId,tmdbId
1889,26614,94791,8677.0
2174,77854,384700,37106.0
7683,53883,430484,49870.0
7755,185135,1665071,500609.0


In [7]:
#activates google chrome for automation
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=True)

In [8]:
def tmdb_scrape(movie_df, start=0):
    """Download one poster per movie from themoviedb.org into ``static/images``.

    For every row of ``movie_df`` from position ``start`` onwards, the TMDb
    poster page of the movie is opened with the module-level ``browser`` and
    the first link pointing at an original-size image is saved as
    ``static/images/<movieId>.jpg``. Movies whose poster could not be
    retrieved are appended to ``resources/missing_posters.csv`` together with
    the reason, and the scrape moves on to the next movie.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Must provide the columns ``movieId`` and ``tmdbId``.
    start : int, optional
        Positional offset of the first row to process. Defaults to 0.

    Returns
    -------
    None
        Progress and timings are printed.
    """
    start_scrape = time.time()

    for _, movie in movie_df[start:].iterrows():

        start_download = time.time()
        movieId = int(movie["movieId"])
        tmdbId = int(movie["tmdbId"])
        url = f'https://www.themoviedb.org/movie/{tmdbId}/images/posters?language=en-US'

        #reason the poster could not be saved; stays None on success
        failure = None

        try:
            #inside the try so that one unreachable page does not abort the whole run
            browser.visit(url)

            if browser.is_element_present_by_tag("li", wait_time=2):

                title = browser.title.replace(" - Posters — The Movie Database (TMDb)", "")
                link = browser.find_link_by_partial_href("https://image.tmdb.org/t/p/original/").first._element.get_attribute('href')

                #fetches the image BEFORE opening the target file: a failed request
                #must not leave an empty or bogus .jpg behind, because any file in
                #"static/images" is treated as an already scraped poster
                response = requests.get(link, timeout=30)
                response.raise_for_status()

                #downloads poster
                with open("static/images/" + str(movieId) + ".jpg", "wb") as f:
                    f.write(response.content)

                download_completed = time.time() - start_download
                print("_____________________________________________________")
                print(movieId, title, tmdbId)
                print(link)
                print(f'Movie poster downloaded in {download_completed} seconds.')

            else:
                print(movieId, '- tag not found')
                failure = 'tag not found'

        except Exception as e:
            print(movieId, e)
            failure = e

        if failure is not None:
            #newline='' is what the csv module expects; without it every record
            #is followed by a blank line on Windows
            with open('resources/missing_posters.csv', mode='a', newline='') as log:
                csv.writer(log).writerow([movieId, failure])

    scrape_time = time.time() - start_scrape
    print(f'Scrape completed in {scrape_time} seconds.')

In [57]:
def imdb_scrape(movie_df, start=0):
    """Download one poster per movie from imdb.com into ``static/images``.

    For every row of ``movie_df`` from position ``start`` onwards, the poster
    gallery of the movie is opened with the module-level ``browser``, the
    first media-viewer link of that title is followed, and an image of the
    viewer page is saved as ``static/images/<movieId>.jpg``. Failures are
    printed and the scrape moves on to the next movie.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Must provide the columns ``movieId`` and ``imdbId``.
    start : int, optional
        Positional offset of the first row to process. Defaults to 0.

    Returns
    -------
    None
        Progress and timings are printed.
    """
    start_scrape = time.time()

    for _, movie in movie_df[start:].iterrows():
        start_time = time.time()

        movieId = int(movie["movieId"])
        imdbId = int(movie["imdbId"])
        #IMDb title ids are "tt" + the number zero-padded to seven digits, so the
        #padding is derived from the id instead of a hard-coded "tt00" prefix
        #that had to be edited by hand for ids of a different length
        url = f"https://www.imdb.com/title/tt{imdbId:07d}/mediaindex?refine=poster&ref_=ttmi_ref_pos"

        try:
            browser.visit(url)
            viewer_links = browser.find_link_by_partial_href(str(imdbId) + "/mediaviewer")
            viewer_url = viewer_links.first._element.get_attribute('href')
            browser.visit(viewer_url)

            title = browser.title
            #NOTE(review): the third <img> of the viewer page is taken to be the
            #poster - depends on the page layout, confirm if results look wrong
            link = browser.find_by_css('img')[2]['src']

            #fetches the image BEFORE opening the target file so that a failed
            #request cannot leave an empty or bogus .jpg in "static/images"
            response = requests.get(link, timeout=30)
            response.raise_for_status()

            #downloads poster
            with open("static/images/" + str(movieId) + ".jpg", "wb") as f:
                f.write(response.content)

            print("_____________________________________________________")
            print(movieId, title, imdbId)
            print(link)

            elapsed_time = time.time() - start_time
            print(f'Movie poster downloaded in {elapsed_time} seconds.')

        except Exception as e:
            print(f'Movie poster for movie: {movieId} failed to download.', e)

    scrape_time = time.time() - start_scrape
    print(f'Scrape completed in {scrape_time} seconds.')

In [None]:
def missing_titles(movie_df, start=0):
    """Collect the IMDb page titles of the given movies.

    Opens the IMDb poster gallery of every row of ``movie_df`` from position
    ``start`` onwards with the module-level ``browser``, prints the movie id,
    page title and imdb id, and gathers the page titles.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Must provide the columns ``movieId`` and ``imdbId``.
    start : int, optional
        Positional offset of the first row to process. Defaults to 0.

    Returns
    -------
    list of str
        The browser page titles, in row order.
    """
    titles = []

    for _, movie in movie_df[start:].iterrows():
        movieId = int(movie["movieId"])
        imdbId = int(movie["imdbId"])
        #IMDb title ids are "tt" + the number zero-padded to seven digits; a fixed
        #"tt00" prefix only produces that form for five-digit ids
        url = f"https://www.imdb.com/title/tt{imdbId:07d}/mediaindex?refine=poster&ref_=ttmi_ref_pos"

        browser.visit(url)
        title = browser.title
        titles.append(title)

        print("_____________________________________________________")
        print(movieId, title, imdbId)

    return titles

In [61]:
#looks up the page title of every movie that still has no poster, starting at the first row
missing_titles(movie_df=missing_posters, start=0)

_____________________________________________________
26614 The Bourne Identity (TV Mini-Series 1988) - Photo Gallery - IMDb 94791
_____________________________________________________
77854 The Work of Director Michel Gondry (Video 2003) - Photo Gallery - IMDb 384700
_____________________________________________________
53883 The Power of Nightmares: The Rise of the Politics of Fear (TV Mini-Series 2004) - Photo Gallery - IMDb 430484
_____________________________________________________
185135 "Sherlock" A Study in Pink (TV Episode 2010) - Photo Gallery - IMDb 1665071


['The Bourne Identity (TV Mini-Series 1988) - Photo Gallery - IMDb',
 'The Work of Director Michel Gondry (Video 2003) - Photo Gallery - IMDb',
 'The Power of Nightmares: The Rise of the Politics of Fear (TV Mini-Series 2004) - Photo Gallery - IMDb',
 '"Sherlock" A Study in Pink (TV Episode 2010) - Photo Gallery - IMDb']

In [77]:
def copy_posters(movie_df=None, src_dir="static/images", dst_dir="static/images2"):
    """Copy the poster of every listed movie from ``src_dir`` to ``dst_dir``.

    Posters are expected to be named ``<movieId>.jpg``. A poster that cannot
    be copied (typically because it was never downloaded) is reported by
    printing the error, and the remaining posters are still processed.

    Parameters
    ----------
    movie_df : pandas.DataFrame, optional
        Must provide a ``movieId`` column. Defaults to the notebook-level
        ``movieIds_small_df``.
    src_dir : str, optional
        Folder holding the scraped posters. Defaults to ``"static/images"``.
    dst_dir : str, optional
        Folder receiving the copies; created when it does not exist yet.
        Defaults to ``"static/images2"``.
    """
    if movie_df is None:
        movie_df = movieIds_small_df

    #without the destination folder every single copy would fail
    os.makedirs(dst_dir, exist_ok=True)

    for movie_id in movie_df["movieId"]:
        poster = str(int(movie_id)) + ".jpg"
        file1 = src_dir + "/" + poster
        file2 = dst_dir + "/" + poster

        try:
            #copy2 also carries over the file metadata (e.g. timestamps)
            shutil.copy2(file1, file2)
        except Exception as e:
            print(e)
        

In [78]:
#copies the posters of all movies in movieIds_small_df from "static/images" to
#"static/images2"; posters that cannot be copied are reported by printing the error
copy_posters()

[Errno 2] No such file or directory: 'static/images/26614.jpg'
[Errno 2] No such file or directory: 'static/images/77854.jpg'
[Errno 2] No such file or directory: 'static/images/53883.jpg'
[Errno 2] No such file or directory: 'static/images/185135.jpg'


In [None]:
#uncomment following code to create csv for adam.
#(the snippet is wrapped in a triple-quoted string, so as written none of it runs
# and no file is written; with the quotes removed it casts tmdbId to int, sorts by
# movieId, converts every column to str and writes "filtered_links.csv")
'''
for_adam = movieIds_small_df.copy().astype({"tmdbId": int})
for_adam.sort_values(by=['movieId'], inplace=True)
for_adam = for_adam.astype(str)
for_adam.to_csv("filtered_links.csv")
for_adam

'''