In [1]:
# Import libraries
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from tqdm import tqdm

In [2]:
# Read in API key for TMDB.
# The key is kept in a separate text file rather than hard-coded in the notebook;
# rstrip() removes the trailing newline/whitespace left at the end of the file.

with open('../ignored/tmdb_api_key.txt', 'r') as file:
    my_api_key = file.read().rstrip()

# Technique reference: https://stackoverflow.com/questions/8369219/how-to-read-a-text-file-into-a-string-variable-and-strip-newlines

### Read in table with titles to be scraped

In [3]:
# Directory prefix (with trailing slash) prepended to data file names below
data_directory = "../data/"

In [4]:
# Load the de-duplicated IMDB titles table and report its (rows, columns)
no_dupes = pd.read_csv(data_directory + 'imdb_titles_no_dupes.csv')
no_dupes.shape

(2484, 9)

In [5]:
# Preview the first rows; the 'href' column holds the IMDB path used for the TMDB lookup
no_dupes.head()

Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail
0,Game of Thrones,/title/tt0944947/,(2011–2019),Nine noble families fight for control over the...,TV-MA,"Action, Adventure, Drama",9.2,2148311,https://m.media-amazon.com/images/M/MV5BYTRiND...
1,Prison Break,/title/tt0455275/,(2005–2017),"Due to a political conspiracy, an innocent man...",TV-14,"Action, Crime, Drama",8.3,548267,https://m.media-amazon.com/images/M/MV5BMTg3NT...
2,Vikings,/title/tt2306299/,(2013–2020),Vikings transports us to the brutal and myster...,TV-MA,"Action, Adventure, Drama",8.5,547494,https://m.media-amazon.com/images/M/MV5BODk4Zj...
3,The Boys,/title/tt1190634/,(2019– ),A group of vigilantes set out to take down cor...,TV-MA,"Action, Comedy, Crime",8.7,542317,https://m.media-amazon.com/images/M/MV5BOTEyND...
4,The Mandalorian,/title/tt8111088/,(2019– ),The travels of a lone bounty hunter in the out...,TV-14,"Action, Adventure, Fantasy",8.7,527088,https://m.media-amazon.com/images/M/MV5BZjRlZD...


### Define functions for our API call

In [6]:
def tmdb_find(external_id, api_key=my_api_key):
    """Look up a TV show on TMDB by external id and return selected fields.

    Calls the TMDB ``/3/find/{external_id}`` endpoint with
    ``external_source=imdb_id`` and keeps the first entry of ``tv_results``.

    Parameters
    ----------
    external_id : str
        Id appended to the find URL (e.g. the ``tt0944947`` part of an IMDB
        title href).
    api_key : str, optional
        TMDB API key sent with the request. Defaults to the module-level
        ``my_api_key``.

    Returns
    -------
    dict
        Keys ``tmdb_id``, ``tmdb_name``, ``original_name``,
        ``original_language``, ``origin_country``, ``tmdb_popularity``,
        ``tmdb_vote_average``, ``tmdb_vote_count``, ``first_air_date``,
        ``tmdb_adult_content``, ``tmdb_poster_path`` and ``tmdb_overview``
        (overview with non-ASCII characters dropped).

    Raises
    ------
    IndexError
        If the response holds no TV result for ``external_id``.
    KeyError
        If the response lacks ``tv_results`` or one of the expected fields.
    """
    base_url = 'https://api.themoviedb.org/3/find/'
    url = f"{base_url}{external_id}"

    # Send the caller-supplied key as a query parameter instead of reading
    # the global one, so the ``api_key`` argument is actually honoured.
    params = {
        'api_key': api_key,
        'language': 'en-US',
        'external_source': 'imdb_id',
    }

    response = requests.get(url, params=params)
    tv_results = response.json()['tv_results']
    if not tv_results:
        # Same exception type that indexing an empty list raises, but with
        # a message that says which id had no match.
        raise IndexError(f"No TMDB TV result for external id {external_id!r}")
    data = tv_results[0]

    data_to_keep = {
        'tmdb_id' : int(data['id']),
        'tmdb_name' : data['name'],
        'original_name' : data['original_name'],
        'original_language' : data['original_language'],
        'origin_country' : data['origin_country'],
        'tmdb_popularity' : data['popularity'],
        'tmdb_vote_average' : data['vote_average'],
        'tmdb_vote_count' : int(data['vote_count']),
        'first_air_date' : data['first_air_date'],
        'tmdb_adult_content' : data['adult'],
        'tmdb_poster_path' : data['poster_path'],
        # Drop characters that cannot be represented in ASCII.
        # Technique reference: https://pythonguides.com/remove-unicode-characters-in-python/
        'tmdb_overview' : data['overview'].encode("ascii", "ignore").decode(),
    }

    return data_to_keep

# Reference: https://developers.themoviedb.org/3/find/find-by-id

In [7]:
# Query the TV details endpoint for fields that the "find" search does not return

def tmdb_tv_query(tv_id, api_key=my_api_key):
    """Fetch tagline, genres and networks for a TMDB TV show.

    Calls the TMDB ``/3/tv/{tv_id}`` endpoint.

    Parameters
    ----------
    tv_id : int or str
        TMDB id of the show, appended to the endpoint URL.
    api_key : str, optional
        TMDB API key sent with the request. Defaults to the module-level
        ``my_api_key``.

    Returns
    -------
    dict
        ``tmdb_tagline`` (as returned by the API), plus ``tmdb_genres`` and
        ``tv_networks`` as comma-separated strings of the entries' names.

    Raises
    ------
    KeyError
        If the response lacks ``tagline``, ``genres`` or ``networks``.
    """
    base_url = 'https://api.themoviedb.org/3/tv/'
    url = f"{base_url}{tv_id}"

    # Send the caller-supplied key as a query parameter instead of reading
    # the global one, so the ``api_key`` argument is actually honoured.
    params = {
        'api_key': api_key,
        'language': 'en-US',
    }

    response = requests.get(url, params=params)
    data = response.json()

    data_to_keep = {
        'tmdb_tagline' : data['tagline'],
        'tmdb_genres' : ", ".join(genre['name'] for genre in data['genres']),
        'tv_networks' : ", ".join(network['name'] for network in data['networks']),
    }

    return data_to_keep

# Reference: https://developers.themoviedb.org/3/tv/get-tv-details

In [8]:
def scrape_keywords(tmdb_id):
    """Scrape the keyword list from a show's TMDB web page.

    Parameters
    ----------
    tmdb_id : int or str
        TMDB id of the show; appended to ``https://www.themoviedb.org/tv/``.

    Returns
    -------
    str
        Text of every ``<li>`` inside the ``keywords right_column`` section,
        joined with ``", "`` (empty string when the section has no items).

    Raises
    ------
    AttributeError
        If the page contains no ``keywords right_column`` section.
    """
    base_url = 'https://www.themoviedb.org/tv/'
    full_url = base_url + str(tmdb_id)

    # A browser-like User-Agent header is sent to avoid a 403 error.
    # Technique reference:
    # https://medium.com/@raiyanquaium/how-to-web-scrape-using-beautiful-soup-in-python-without-running-into-http-error-403-554875e5abed
    req = Request(full_url, headers={'User-Agent': 'Mozilla/5.0'})

    # Use the response as a context manager so the connection is closed as
    # soon as the body has been read, even if reading fails.
    with urlopen(req) as page:
        webpage = page.read()
    soup = BeautifulSoup(webpage, "html.parser")

    keyword_sidebar = soup.find('section', attrs='keywords right_column')

    return ", ".join(li.text for li in keyword_sidebar.find_all('li'))

In [9]:
def make_tmdb_entry(row_index, input_df):
    """Build the TMDB record for one row of the IMDB titles table.

    Parameters
    ----------
    row_index : int
        Index label of the row in ``input_df``.
    input_df : pandas.DataFrame
        Table with an ``href`` column holding IMDB paths such as
        ``/title/tt0944947/``.

    Returns
    -------
    dict
        Always contains ``imdb_href``. When every lookup succeeds it also
        holds the fields from ``tmdb_find`` and ``tmdb_tv_query`` plus
        ``tmdb_keywords``. If a step fails, only the fields gathered before
        the failure are present and an error line is printed.
    """
    imdb_href = input_df['href'][row_index]
    entry = {'imdb_href' : imdb_href}
    # '/title/tt0944947/' -> 'tt0944947'; also works without a trailing slash.
    external_id = imdb_href.strip('/').split('/')[-1]

    try:
        api_pull_dict = tmdb_find(external_id, my_api_key)
        tmdb_id = api_pull_dict['tmdb_id']

        entry.update(api_pull_dict)
        entry.update( tmdb_tv_query(tmdb_id, my_api_key) )
        entry['tmdb_keywords'] = scrape_keywords(tmdb_id)
    except Exception as exc:
        # Best-effort scrape: report the row and carry on. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop a long run.
        # Only the exception type is printed, since the message of a request
        # error may embed the request URL, which can carry the API key.
        print(f"Error on row_index: {row_index} ; imdb_href: {imdb_href} ; {type(exc).__name__}")

    return entry

#### Create Dataframe of TMDB entries, based on list of shows from IMDB

In [10]:
tmdb_articles = {}

# Build one entry per row of no_dupes, keyed by row index. Entries are stored as
# they are produced; rows whose lookup fails keep whatever make_tmdb_entry returned
# (at least 'imdb_href') and are reported by its "Error on row_index" message.
for i in tqdm(range(len(no_dupes))):    
    tmdb_articles[i] = make_tmdb_entry(i, no_dupes)

  2%|▏         | 48/2484 [00:34<27:45,  1.46it/s]

Error on row_index: 48 ; imdb_href: /title/tt1758429/


  2%|▏         | 50/2484 [00:34<22:15,  1.82it/s]

Error on row_index: 50 ; imdb_href: /title/tt0214341/


  5%|▍         | 122/2484 [01:34<27:55,  1.41it/s]

Error on row_index: 121 ; imdb_href: /title/tt0280249/


  7%|▋         | 162/2484 [02:11<26:59,  1.43it/s]

Error on row_index: 161 ; imdb_href: /title/tt1598754/


  9%|▉         | 225/2484 [03:09<25:37,  1.47it/s]

Error on row_index: 224 ; imdb_href: /title/tt13623148/


 12%|█▏        | 289/2484 [04:07<23:55,  1.53it/s]

Error on row_index: 288 ; imdb_href: /title/tt0249327/


 13%|█▎        | 335/2484 [04:49<25:23,  1.41it/s]

Error on row_index: 334 ; imdb_href: /title/tt14986406/


 15%|█▍        | 369/2484 [05:20<26:00,  1.36it/s]

Error on row_index: 368 ; imdb_href: /title/tt0387736/


 15%|█▌        | 375/2484 [05:25<25:17,  1.39it/s]

Error on row_index: 374 ; imdb_href: /title/tt5497534/


 17%|█▋        | 421/2484 [06:09<25:05,  1.37it/s]

Error on row_index: 420 ; imdb_href: /title/tt0114327/


 19%|█▉        | 468/2484 [06:53<25:04,  1.34it/s]

Error on row_index: 467 ; imdb_href: /title/tt8213522/


 24%|██▍       | 590/2484 [08:58<30:16,  1.04it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 29%|██▉       | 730/2484 [11:17<18:28,  1.58it/s]

Error on row_index: 729 ; imdb_href: /title/tt0063939/


 30%|███       | 751/2484 [11:33<17:31,  1.65it/s]

Error on row_index: 750 ; imdb_href: /title/tt0060019/


 31%|███       | 766/2484 [11:44<18:33,  1.54it/s]

Error on row_index: 765 ; imdb_href: /title/tt0062550/


 32%|███▏      | 794/2484 [12:06<17:28,  1.61it/s]

Error on row_index: 793 ; imdb_href: /title/tt0184175/


 33%|███▎      | 820/2484 [12:26<16:52,  1.64it/s]

Error on row_index: 819 ; imdb_href: /title/tt0211145/


 35%|███▌      | 874/2484 [13:08<16:24,  1.64it/s]

Error on row_index: 873 ; imdb_href: /title/tt6601082/


 36%|███▌      | 885/2484 [13:17<16:37,  1.60it/s]

Error on row_index: 884 ; imdb_href: /title/tt0086719/


 42%|████▏     | 1042/2484 [15:27<15:10,  1.58it/s]

Error on row_index: 1041 ; imdb_href: /title/tt14650074/


 44%|████▍     | 1104/2484 [16:17<13:58,  1.65it/s]

Error on row_index: 1103 ; imdb_href: /title/tt0096548/


 45%|████▍     | 1116/2484 [16:26<14:45,  1.55it/s]

Error on row_index: 1115 ; imdb_href: /title/tt0088484/


 48%|████▊     | 1183/2484 [17:22<13:12,  1.64it/s]

Error on row_index: 1182 ; imdb_href: /title/tt4202274/


 48%|████▊     | 1195/2484 [17:32<17:11,  1.25it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 56%|█████▌    | 1385/2484 [20:17<13:03,  1.40it/s]

Error on row_index: 1384 ; imdb_href: /title/tt4093826/


 63%|██████▎   | 1555/2484 [22:49<08:16,  1.87it/s]

Error on row_index: 1553 ; imdb_href: /title/tt6111552/
Error on row_index: 1554 ; imdb_href: /title/tt0090509/


 65%|██████▌   | 1625/2484 [23:50<09:35,  1.49it/s]

Error on row_index: 1624 ; imdb_href: /title/tt6205862/


 66%|██████▌   | 1641/2484 [24:04<10:02,  1.40it/s]

Error on row_index: 1640 ; imdb_href: /title/tt8010592/


 69%|██████▉   | 1719/2484 [25:11<08:27,  1.51it/s]

Error on row_index: 1718 ; imdb_href: /title/tt21867596/


 77%|███████▋  | 1925/2484 [28:14<06:16,  1.49it/s]

Error on row_index: 1924 ; imdb_href: /title/tt0094500/


 78%|███████▊  | 1941/2484 [28:27<06:20,  1.43it/s]

Error on row_index: 1940 ; imdb_href: /title/tt0389680/


 80%|███████▉  | 1984/2484 [29:05<05:35,  1.49it/s]

Error on row_index: 1983 ; imdb_href: /title/tt7686464/


 80%|████████  | 1996/2484 [29:15<05:42,  1.42it/s]

Error on row_index: 1995 ; imdb_href: /title/tt1635327/


 83%|████████▎ | 2067/2484 [30:18<04:54,  1.42it/s]

Error on row_index: 2066 ; imdb_href: /title/tt13720112/


 87%|████████▋ | 2155/2484 [31:38<03:36,  1.52it/s]

Error on row_index: 2154 ; imdb_href: /title/tt8427140/


 89%|████████▉ | 2217/2484 [32:33<03:00,  1.48it/s]

Error on row_index: 2216 ; imdb_href: /title/tt9566030/


100%|██████████| 2484/2484 [36:33<00:00,  1.13it/s]


In [11]:
# One row per key of tmdb_articles; fields missing from an entry become NaN
tmdb_articles_df = pd.DataFrame.from_dict(tmdb_articles, orient='index')

In [12]:
# Inspect the last rows of the scraped table
tmdb_articles_df.tail()

Unnamed: 0,imdb_href,tmdb_id,tmdb_name,original_name,original_language,origin_country,tmdb_popularity,tmdb_vote_average,tmdb_vote_count,first_air_date,tmdb_adult_content,tmdb_poster_path,tmdb_overview,tmdb_tagline,tmdb_genres,tv_networks,tmdb_keywords
2479,/title/tt0047736/,3713.0,Gunsmoke,Gunsmoke,en,[US],24.178,6.671,82.0,1955-09-10,False,/x7WjYizqcrA64xi9XLqUWCz5DNk.jpg,Gunsmoke is an American radio and television W...,,"Western, Action & Adventure",CBS,"marshal, kansas, usa, wild west, 19th century"
2480,/title/tt0068093/,1472.0,Kung Fu,Kung Fu,en,[US],0.657,7.611,166.0,1972-10-14,False,/vUqmS40PRyAe3COHh7wHpDFnspC.jpg,The adventures of a Shaolin Monk as he wanders...,,"Action & Adventure, Drama, Western, Sci-Fi & F...",ABC,"martial arts, kung fu, shaolin monk, wild west"
2481,/title/tt0482857/,20056.0,Broken Trail,Broken Trail,en,[US],9.395,6.899,149.0,2006-06-25,False,/yjYXwnRLZFrWXdS97IoXxsSbADi.jpg,Broken Trail is a 2006 Western miniseries dire...,,"Action & Adventure, Drama, Western",AMC,"widow, brothel, miniseries, nephew, american w..."
2482,/title/tt0058855/,10929.0,The Wild Wild West,The Wild Wild West,en,[US],31.67,7.33,50.0,1965-09-17,False,/4LSZNCSxDodwCP8EsKRTgMDUKY4.jpg,The Wild Wild West is an American television s...,,"Action & Adventure, Comedy, Drama, Western, Sc...",CBS,"secret agent, wild west"
2483,/title/tt8873996/,89498.0,Deputy,Deputy,en,[US],12.355,7.171,76.0,2020-01-02,False,/dlnbNP169odKEYKCfzBipRrUDdk.jpg,"When the Los Angeles Countys Sheriff dies, an ...",The hero no one saw coming.,"Western, Drama, Crime",FOX,


In [13]:
# (rows, columns) of the scraped table
tmdb_articles_df.shape

(2484, 17)

#### Handle outlier cases

In [14]:
# Count missing values per column to see how many rows failed the TMDB lookup
tmdb_articles_df.isna().sum()

imdb_href              0
tmdb_id               35
tmdb_name             35
original_name         35
original_language     35
origin_country        35
tmdb_popularity       35
tmdb_vote_average     35
tmdb_vote_count       35
first_air_date        35
tmdb_adult_content    35
tmdb_poster_path      36
tmdb_overview         35
tmdb_tagline          35
tmdb_genres           35
tv_networks           35
tmdb_keywords         35
dtype: int64

In [15]:
# IMDB hrefs of the rows that have no TMDB genres (i.e. the lookup did not complete)
tmdb_articles_df.loc[tmdb_articles_df['tmdb_genres'].isna(), 'imdb_href']

48       /title/tt1758429/
50       /title/tt0214341/
121      /title/tt0280249/
161      /title/tt1598754/
224     /title/tt13623148/
288      /title/tt0249327/
334     /title/tt14986406/
368      /title/tt0387736/
374      /title/tt5497534/
420      /title/tt0114327/
467      /title/tt8213522/
729      /title/tt0063939/
750      /title/tt0060019/
765      /title/tt0062550/
793      /title/tt0184175/
819      /title/tt0211145/
873      /title/tt6601082/
884      /title/tt0086719/
1041    /title/tt14650074/
1103     /title/tt0096548/
1115     /title/tt0088484/
1182     /title/tt4202274/
1384     /title/tt4093826/
1553     /title/tt6111552/
1554     /title/tt0090509/
1624     /title/tt6205862/
1640     /title/tt8010592/
1718    /title/tt21867596/
1924     /title/tt0094500/
1940     /title/tt0389680/
1983     /title/tt7686464/
1995     /title/tt1635327/
2066    /title/tt13720112/
2154     /title/tt8427140/
2216     /title/tt9566030/
Name: imdb_href, dtype: object

In [16]:
# Manual lookup for fixes: row index -> TMDB id found by hand.
# The trailing comment on each line is the row's IMDB href.
#
# NOTE(review): row 2178 is not among the rows listed as missing by the
#   isna() check; confirm this override of an existing entry is intended.
# NOTE(review): rows 1103/1115 share TMDB id 7246 and rows 1624/1640 share
#   64513; confirm the duplicate ids are intended.
outlier_cases = {
    50: 12971,      # /title/tt0214341/
    121: 12609,     # /title/tt0280249/
    288: 36406,     # /title/tt0249327/
    420: 3570,      # /title/tt0114327/
    729: 114772,    # /title/tt0063939/
    819: 31654,     # /title/tt0211145/
    873: 81026,     # /title/tt6601082/
    884: 29200,     # /title/tt0086719/
    1041: 135670,   # /title/tt14650074/
    1103: 7246,     # /title/tt0096548/
    1115: 7246,     # /title/tt0088484/
    1384: 1920,     # /title/tt4093826/
    1624: 64513,    # /title/tt6205862/
    1640: 64513,    # /title/tt8010592/
    1718: 208720,   # /title/tt21867596/
    1924: 13793,    # /title/tt0094500/
    1940: 42135,    # /title/tt0389680/
    2066: 131488,   # /title/tt13720112/
    2178: 134949,   # /title/tt14153686/
}

# Rows deliberately left out (kept here for reference):
#   48   /title/tt1758429/
#   161  /title/tt1598754/
#   224  /title/tt13623148/   (id 1010901 was noted but disabled)
#   334  /title/tt14986406/
#   368  /title/tt0387736/    (id 1085199 was noted but disabled)
#   374  /title/tt5497534/
#   467  /title/tt8213522/
#   750  /title/tt0060019/
#   765  /title/tt0062550/
#   793  /title/tt0184175/
#   1182 /title/tt4202274/
#   1553 /title/tt6111552/
#   1554 /title/tt0090509/
#   1983 /title/tt7686464/
#   1995 /title/tt1635327/
#   2216 /title/tt9566030/
# NOTE(review): row 2154 (/title/tt8427140/) is listed as missing by the
#   isna() check but has no entry here, active or disabled.

In [17]:
# Number of rows that will be re-scraped with a manually found TMDB id
len(outlier_cases)

19

In [18]:
def scrape_outlier(tmdb_id, my_api_key=my_api_key):
    """Build a full TMDB record for a show whose TMDB id is already known.

    Queries the TMDB ``/3/tv/{tmdb_id}`` endpoint directly (no IMDB-based
    "find" step), then adds the fields from ``tmdb_tv_query`` and the
    keywords from ``scrape_keywords``.

    Parameters
    ----------
    tmdb_id : int or str
        TMDB id of the show.
    my_api_key : str, optional
        TMDB API key; defaults to the module-level ``my_api_key``.

    Returns
    -------
    dict
        The same sixteen fields that ``make_tmdb_entry`` collects (without
        ``imdb_href``). If any field is missing or a follow-up lookup fails,
        every field is set to ``""`` and an error line is printed.
    """
    # Every field of a complete record, in the order it is filled in below.
    field_names = (
        'tmdb_id',
        'tmdb_name',
        'original_name',
        'original_language',
        'origin_country',
        'tmdb_popularity',
        'tmdb_vote_average',
        'tmdb_vote_count',
        'first_air_date',
        'tmdb_adult_content',
        'tmdb_poster_path',
        'tmdb_overview',
        'tmdb_tagline',
        'tmdb_genres',
        'tv_networks',
        'tmdb_keywords',
    )

    base_url = 'https://api.themoviedb.org/3/tv/'
    url = f"{base_url}{tmdb_id}?api_key={my_api_key}"

    params = {
            'language': 'en-US',
        }

    response = requests.get(url, params=params)
    data = response.json()

    try:
        entry = {
            'tmdb_id' : int(data['id']),
            'tmdb_name' : data['name'],
            'original_name' : data['original_name'],
            'original_language' : data['original_language'],
            'origin_country' : data['origin_country'],
            'tmdb_popularity' : data['popularity'],
            'tmdb_vote_average' : data['vote_average'],
            'tmdb_vote_count' : int(data['vote_count']),
            'first_air_date' : data['first_air_date'],
            'tmdb_adult_content' : data['adult'],
            'tmdb_poster_path' : data['poster_path'],
            # Drop characters that cannot be represented in ASCII.
            'tmdb_overview' : data['overview'].encode("ascii", "ignore").decode(),
        }
        entry.update( tmdb_tv_query(tmdb_id, my_api_key) )
        entry['tmdb_keywords'] = scrape_keywords(tmdb_id)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt can still stop the run, and report the failure
        # instead of hiding it. Only the exception type is printed, since a
        # request error message may embed the URL, which carries the API key.
        print(f"Error scraping tmdb_id: {tmdb_id} ; {type(exc).__name__}")
        # All-or-nothing: a partially filled record is replaced by blanks.
        entry = dict.fromkeys(field_names, "")

    return entry

In [19]:
# Overwrite the entries of the manually resolved rows with freshly scraped data
for row_index, manual_tmdb_id in tqdm(outlier_cases.items()):
    tmdb_articles[row_index].update( scrape_outlier(manual_tmdb_id, my_api_key) )

100%|██████████| 19/19 [00:12<00:00,  1.47it/s]


In [20]:
# Rebuild the table so it includes the entries updated from outlier_cases
tmdb_articles_df = pd.DataFrame.from_dict(tmdb_articles, orient='index')

In [21]:
# Replace the remaining missing values with empty strings before saving
tmdb_articles_df = tmdb_articles_df.fillna("")

#### Write finished dataframe to disk

In [22]:
# Write next to the input table, using the same data_directory prefix the read
# above uses, so the location is configured in one place. The index is not saved.
tmdb_articles_df.to_csv(f'{data_directory}tmdb_articles_df.csv', index=False)

In [23]:
# image_base_url = 'https://image.tmdb.org/t/p/original'
# test_image_url = image_base_url + asdf['poster_path']
# test_image_url