In [1]:
# imports and client instance creation with apikey
import omdb
import pandas as pd
from omdb import OMDBClient
client = OMDBClient(apikey='e55a0e19')

In [2]:
# read csv and check structure
csv_data = pd.read_csv('C:\My\Workspace\Python_Projects\Movies_Data\input_files\\film_list.csv')
csv_data

Unnamed: 0,Title,Year_Released
0,Ambulance,2022
1,Blade Runner 2049,2017
2,Dune,2021
3,Edge Of Tomorrow,2014
4,Everest,2015
5,"Everything, Everywhere, All At Once",2022
6,Ford v Ferrari,2019
7,Interstellar,2014
8,Mad Max: Fury Road,2015
9,No Time To Die,2021


In [3]:
# helper function to return search results
def get_film_object(row):
    film_title = row['Title']
    film_year_released = str(row['Year_Released'])
    film_object = client.search(film_title, year=film_year_released)
    return film_object[0]

# creates a python object to end up making a csv file of the returned films
csv_data['film_object'] = csv_data.apply(get_film_object, axis=1)
film_data = csv_data['film_object'].tolist()

In [4]:
# create a dataframe object from the list
full_df = pd.DataFrame(film_data)
full_df.reset_index()
full_df.head(30)

Unnamed: 0,title,year,imdb_id,type,poster
0,Ambulance,2022,tt4998632,movie,https://m.media-amazon.com/images/M/MV5BYjUyN2...
1,Blade Runner 2049,2017,tt1856101,movie,https://m.media-amazon.com/images/M/MV5BNzA1Nj...
2,Dune,2021,tt1160419,movie,https://m.media-amazon.com/images/M/MV5BN2FjNm...
3,Edge of Tomorrow,2014,tt1631867,movie,https://m.media-amazon.com/images/M/MV5BMTc5OT...
4,Everest,2015,tt2719848,movie,https://m.media-amazon.com/images/M/MV5BMTNmMz...
5,Everything Everywhere All at Once,2022,tt6710474,movie,https://m.media-amazon.com/images/M/MV5BYTdiOT...
6,Ford v Ferrari,2019,tt1950186,movie,https://m.media-amazon.com/images/M/MV5BM2UwMD...
7,Interstellar,2014,tt0816692,movie,https://m.media-amazon.com/images/M/MV5BZjdkOT...
8,Mad Max: Fury Road,2015,tt1392190,movie,https://m.media-amazon.com/images/M/MV5BN2EwM2...
9,No Time to Die,2021,tt2382320,movie,https://m.media-amazon.com/images/M/MV5BYWQ2Nz...


In [5]:
full_df.to_csv(r'C:\My\Workspace\Python_Projects\Movies_Data\output_files\\movie_list.csv', sep='|', index=False)

In [6]:
# helper function to extract the unique imdb ids from the list
def extract_column_to_list(df, column_name):
    return df[column_name].tolist()

# usage
imdb_id_list = extract_column_to_list(full_df, 'imdb_id')

In [7]:
# example URL: https://www.omdbapi.com/?i=tt2382320&apikey=e55a0e19
# create list of API endpoints for further lookups
url_list = []
import urllib.parse
for imdbId in imdb_id_list:
    url = 'https://www.omdbapi.com/?'
    params = {'i': imdbId, 'apikey': 'e55a0e19'}
    temp_url = str(url + urllib.parse.urlencode(params))
    url_list.append(temp_url)

In [52]:
# create a list where each row is all the movie data found from each key
import requests
# import json
data_list = []
for api_url in url_list:
    resp = requests.get(api_url)
    resp.raise_for_status()
    data = resp.json()
    for d in data["Ratings"]:
        temp_dict = {d["Source"].replace(" ", ""):d["Value"]}
        data.update(temp_dict.items())
    data_list.append(data)

#create a dataframe from the pre-loaded list
data_list_df = pd.DataFrame(data_list)

In [53]:
data_list

[{'Title': 'Ambulance',
  'Year': '2022',
  'Rated': 'R',
  'Released': '08 Apr 2022',
  'Runtime': '136 min',
  'Genre': 'Action, Crime, Drama',
  'Director': 'Michael Bay',
  'Writer': 'Chris Fedak, Laurits Munch-Petersen, Lars Andreas Pedersen',
  'Actors': 'Jake Gyllenhaal, Yahya Abdul-Mateen II, Eiza González',
  'Plot': 'Two robbers steal an ambulance after their heist goes awry.',
  'Language': 'English, Spanish',
  'Country': 'United States, Japan',
  'Awards': '10 nominations',
  'Poster': 'https://m.media-amazon.com/images/M/MV5BYjUyN2VlZGEtNGEyZC00YjViLTgwYmQtZDJiM2FlOTU3Mjg2XkEyXkFqcGdeQXVyMjMxOTE0ODA@._V1_SX300.jpg',
  'Ratings': [{'Source': 'Internet Movie Database', 'Value': '6.1/10'},
   {'Source': 'Rotten Tomatoes', 'Value': '68%'},
   {'Source': 'Metacritic', 'Value': '55/100'}],
  'Metascore': '55',
  'imdbRating': '6.1',
  'imdbVotes': '81,221',
  'imdbID': 'tt4998632',
  'Type': 'movie',
  'DVD': '24 May 2022',
  'BoxOffice': '$22,781,115',
  'Production': 'N/A',
 

In [31]:
"""
ratings_list = []

for rec in data_list:
    mrats = pd.DataFrame(rec["Ratings"])
    mrats["imdbID"] = rec["imdbID"]
    ratings_list.append(mrats)

ratings_list
# c_list = ["imdbID", ]
# r_pd = pd.DataFrame(ratings_list, index=ratings_list["imdbID"])
ratings_list_cat = pd.concat(ratings_list)
# ratings_list_merge = pd.merge(ratings_list, ignore_index=True)
ndata = pd.merge(data_list_df, ratings_list, on="imdbID" )
ndata.info()
"""

In [9]:
# data_list
# data_list[0]["Ratings"]
# create list of actors
movie_actors = [dl["Actors"] for dl in data_list if "Actors" in dl]

In [10]:
# movie_actors
from collections import Counter
counts = Counter(movie_actors)
counts.most_common()

[('Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss', 3),
 ('Shia LaBeouf, Megan Fox, Josh Duhamel', 2),
 ('Jake Gyllenhaal, Yahya Abdul-Mateen II, Eiza González', 1),
 ('Harrison Ford, Ryan Gosling, Ana de Armas', 1),
 ('Timothée Chalamet, Rebecca Ferguson, Zendaya', 1),
 ('Tom Cruise, Emily Blunt, Bill Paxton', 1),
 ('Jason Clarke, Ang Phula Sherpa, Thomas M. Wright', 1),
 ('Michelle Yeoh, Stephanie Hsu, Jamie Lee Curtis', 1),
 ('Matt Damon, Christian Bale, Jon Bernthal', 1),
 ('Matthew McConaughey, Anne Hathaway, Jessica Chastain', 1),
 ('Tom Hardy, Charlize Theron, Nicholas Hoult', 1),
 ('Daniel Craig, Ana de Armas, Rami Malek', 1),
 ('Tye Sheridan, Olivia Cooke, Ben Mendelsohn', 1),
 ('Ben Schwartz, James Marsden, Jim Carrey', 1),
 ('James Marsden, Jim Carrey, Ben Schwartz', 1),
 ('Jamie Foxx, Tina Fey, Graham Norton', 1),
 ('Arnold Schwarzenegger, Linda Hamilton, Edward Furlong', 1),
 ('Robert Pattinson, Zoë Kravitz, Jeffrey Wright', 1),
 ('Keanu Reeves, Carrie-Anne Moss, Yahya

In [56]:
# the commented out column_list list object represents the complete and full list of columns and is kept as a point of reference to return to
# column_list=["Title","Year","Rated","Released","Runtime","Genre","Director","Writer","Actors","Plot","Language","Country","Awards","Poster","Ratings","Metascore","imdbRating","imdbVotes","imdbID","Type","DVD","BoxOffice","Production","Website","Response","InternetMovieDatabase","RottenTomatoes","Metacritic"]
column_list=["Title","Year","Rated","Released","Runtime","Genre","Director","Writer","Actors","Plot","Language","Country","Awards","Poster","Ratings","Metascore","imdbRating","imdbVotes","InternetMovieDatabase","RottenTomatoes","Metacritic","imdbID","Type","DVD","BoxOffice","Production","Website"]
# create a dataframe object based on the previous one where only the relevant columns are used
data_list_df_partial = pd.DataFrame(data_list_df, columns=column_list)

In [57]:
data_list_df_partial.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbVotes,InternetMovieDatabase,RottenTomatoes,Metacritic,imdbID,Type,DVD,BoxOffice,Production,Website
0,Ambulance,2022,R,08 Apr 2022,136 min,"Action, Crime, Drama",Michael Bay,"Chris Fedak, Laurits Munch-Petersen, Lars Andr...","Jake Gyllenhaal, Yahya Abdul-Mateen II, Eiza G...",Two robbers steal an ambulance after their hei...,...,81221,6.1/10,68%,55/100,tt4998632,movie,24 May 2022,"$22,781,115",,
1,Blade Runner 2049,2017,R,06 Oct 2017,164 min,"Action, Drama, Mystery",Denis Villeneuve,"Hampton Fancher, Michael Green, Philip K. Dick","Harrison Ford, Ryan Gosling, Ana de Armas",Young Blade Runner K's discovery of a long-bur...,...,599573,8.0/10,88%,81/100,tt1856101,movie,16 Jan 2018,"$92,071,675",,
2,Dune,2021,PG-13,22 Oct 2021,155 min,"Action, Adventure, Drama",Denis Villeneuve,"Jon Spaihts, Denis Villeneuve, Eric Roth","Timothée Chalamet, Rebecca Ferguson, Zendaya",A noble family becomes embroiled in a war for ...,...,665392,8.0/10,83%,74/100,tt1160419,movie,22 Oct 2021,"$108,327,830",,
3,Edge of Tomorrow,2014,PG-13,06 Jun 2014,113 min,"Action, Adventure, Sci-Fi",Doug Liman,"Christopher McQuarrie, Jez Butterworth, John-H...","Tom Cruise, Emily Blunt, Bill Paxton",A soldier fighting aliens gets to relive the s...,...,696962,7.9/10,91%,71/100,tt1631867,movie,07 Oct 2014,"$100,206,256",,
4,Everest,2015,PG-13,25 Sep 2015,121 min,"Action, Adventure, Biography",Baltasar Kormákur,"William Nicholson, Simon Beaufoy","Jason Clarke, Ang Phula Sherpa, Thomas M. Wright","The story of New Zealand mountaineer Rob Hall,...",...,223452,7.1/10,73%,64/100,tt2719848,movie,19 Jan 2016,"$43,482,270",,


In [58]:
data_list_df_partial.describe()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,imdbVotes,InternetMovieDatabase,RottenTomatoes,Metacritic,imdbID,Type,DVD,BoxOffice,Production,Website
count,46,46,46,46,46,46,46,46,46,46,...,46,46,46,46,46,46,46,46,46.0,46.0
unique,46,25,4,44,34,20,33,41,43,46,...,46,27,33,29,46,1,45,46,1.0,1.0
top,Ambulance,2022,PG-13,08 Apr 2022,113 min,"Action, Adventure, Sci-Fi",Michael Bay,"J.R.R. Tolkien, Fran Walsh, Philippa Boyens","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",Two robbers steal an ambulance after their hei...,...,81221,6.7/10,73%,81/100,tt4998632,movie,24 May 2022,"$22,781,115",,
freq,1,5,21,3,4,10,6,3,3,1,...,1,3,4,5,1,46,2,1,46.0,46.0


In [59]:
data_list_df_partial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Title                  46 non-null     object
 1   Year                   46 non-null     object
 2   Rated                  46 non-null     object
 3   Released               46 non-null     object
 4   Runtime                46 non-null     object
 5   Genre                  46 non-null     object
 6   Director               46 non-null     object
 7   Writer                 46 non-null     object
 8   Actors                 46 non-null     object
 9   Plot                   46 non-null     object
 10  Language               46 non-null     object
 11  Country                46 non-null     object
 12  Awards                 46 non-null     object
 13  Poster                 46 non-null     object
 14  Ratings                46 non-null     object
 15  Metascore              46

In [60]:
data_list_df_partial.to_csv(r'C:\My\Workspace\Python_Projects\Movies_Data\output_files\\movie_list_detail.csv',sep='|', index=False)

In [39]:
# transformation of each of the columns
data_list_df_partial['Title'] = data_list_df_partial['Title'].astype("string")
data_list_df_partial['Year'] = pd.to_numeric(data_list_df_partial['Year'])
data_list_df_partial['Rated'] = data_list_df_partial['Rated'].astype("string")
data_list_df_partial['Released'] = pd.to_datetime(data_list_df_partial['Released'], dayfirst=True, errors="raise", format='%d %b %Y')
data_list_df_partial['Runtime'] = data_list_df_partial['Runtime'].astype("string")
data_list_df_partial['Genre'] = data_list_df_partial['Genre'].astype("string") #multiple
data_list_df_partial['Director'] = data_list_df_partial['Director'].astype("string") #multiple
data_list_df_partial['Writer'] = data_list_df_partial['Writer'].astype("string") #multiple
data_list_df_partial['Actors'] = data_list_df_partial['Actors'].astype("string") #multiple
data_list_df_partial['Plot'] = data_list_df_partial['Plot'].astype("string")
data_list_df_partial['Language'] = data_list_df_partial['Language'].astype("string") #multiple
data_list_df_partial['Country'] = data_list_df_partial['Country'].astype("string") #multiple
data_list_df_partial['Awards'] = data_list_df_partial['Awards'].astype("string")
data_list_df_partial['Poster'] = data_list_df_partial['Poster'].astype("string")
data_list_df_partial['Ratings'] = data_list_df_partial['Ratings'].astype("string") #JSON object
data_list_df_partial['Metascore'] = data_list_df_partial['Metascore'].astype("string") #int
data_list_df_partial['imdbRating'] = data_list_df_partial['imdbRating'].astype("string") #float or decimal
data_list_df_partial['imdbVotes'] = data_list_df_partial['imdbVotes'].astype("string") #int
data_list_df_partial['imdbID'] = data_list_df_partial['imdbID'].astype("string")
data_list_df_partial['Type'] = data_list_df_partial['Type'].astype("string")
data_list_df_partial['DVD'] = pd.to_datetime(data_list_df_partial['DVD'], dayfirst=True, errors="raise", format='%d %b %Y')
data_list_df_partial['BoxOffice'] = data_list_df_partial['BoxOffice'].astype("string") #money
data_list_df_partial['Production'] = data_list_df_partial['Production'].astype("string") #all come nback 'N/A'
data_list_df_partial['Website'] = data_list_df_partial['Website'].astype("string") #all come nback 'N/A'
data_list_df_partial.to_csv(r'C:\My\Workspace\Python_Projects\Movies_Data\output_files\\movie_list_detail_postTransform.csv',sep='|', index=False)

In [67]:
data_list_df_partial.head(15)

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website
0,Ambulance,2022,R,08 Apr 2022,136 min,"Action, Crime, Drama",Michael Bay,"Chris Fedak, Laurits Munch-Petersen, Lars Andr...","Jake Gyllenhaal, Yahya Abdul-Mateen II, Eiza G...",Two robbers steal an ambulance after their hei...,...,"[{'Source': 'Internet Movie Database', 'Value'...",55,6.1,81221,tt4998632,movie,24 May 2022,"$22,781,115",,
1,Blade Runner 2049,2017,R,06 Oct 2017,164 min,"Action, Drama, Mystery",Denis Villeneuve,"Hampton Fancher, Michael Green, Philip K. Dick","Harrison Ford, Ryan Gosling, Ana de Armas",Young Blade Runner K's discovery of a long-bur...,...,"[{'Source': 'Internet Movie Database', 'Value'...",81,8.0,599573,tt1856101,movie,16 Jan 2018,"$92,071,675",,
2,Dune,2021,PG-13,22 Oct 2021,155 min,"Action, Adventure, Drama",Denis Villeneuve,"Jon Spaihts, Denis Villeneuve, Eric Roth","Timothée Chalamet, Rebecca Ferguson, Zendaya",A noble family becomes embroiled in a war for ...,...,"[{'Source': 'Internet Movie Database', 'Value'...",74,8.0,665392,tt1160419,movie,22 Oct 2021,"$108,327,830",,
3,Edge of Tomorrow,2014,PG-13,06 Jun 2014,113 min,"Action, Adventure, Sci-Fi",Doug Liman,"Christopher McQuarrie, Jez Butterworth, John-H...","Tom Cruise, Emily Blunt, Bill Paxton",A soldier fighting aliens gets to relive the s...,...,"[{'Source': 'Internet Movie Database', 'Value'...",71,7.9,696962,tt1631867,movie,07 Oct 2014,"$100,206,256",,
4,Everest,2015,PG-13,25 Sep 2015,121 min,"Action, Adventure, Biography",Baltasar Kormákur,"William Nicholson, Simon Beaufoy","Jason Clarke, Ang Phula Sherpa, Thomas M. Wright","The story of New Zealand mountaineer Rob Hall,...",...,"[{'Source': 'Internet Movie Database', 'Value'...",64,7.1,223452,tt2719848,movie,19 Jan 2016,"$43,482,270",,
5,Everything Everywhere All at Once,2022,R,08 Apr 2022,139 min,"Action, Adventure, Comedy","Daniel Kwan, Daniel Scheinert","Daniel Kwan, Daniel Scheinert","Michelle Yeoh, Stephanie Hsu, Jamie Lee Curtis",A middle-aged Chinese immigrant is swept up in...,...,"[{'Source': 'Internet Movie Database', 'Value'...",81,7.9,428822,tt6710474,movie,07 Jun 2022,"$77,169,469",,
6,Ford v Ferrari,2019,PG-13,15 Nov 2019,152 min,"Action, Biography, Drama",James Mangold,"Jez Butterworth, John-Henry Butterworth, Jason...","Matt Damon, Christian Bale, Jon Bernthal",American car designer Carroll Shelby and drive...,...,"[{'Source': 'Internet Movie Database', 'Value'...",81,8.1,414884,tt1950186,movie,15 Nov 2019,"$117,624,357",,
7,Interstellar,2014,PG-13,07 Nov 2014,169 min,"Adventure, Drama, Sci-Fi",Christopher Nolan,"Jonathan Nolan, Christopher Nolan","Matthew McConaughey, Anne Hathaway, Jessica Ch...",A team of explorers travel through a wormhole ...,...,"[{'Source': 'Internet Movie Database', 'Value'...",74,8.6,1898118,tt0816692,movie,31 Mar 2015,"$188,020,017",,
8,Mad Max: Fury Road,2015,R,15 May 2015,120 min,"Action, Adventure, Sci-Fi",George Miller,"George Miller, Brendan McCarthy, Nick Lathouris","Tom Hardy, Charlize Theron, Nicholas Hoult","In a post-apocalyptic wasteland, a woman rebel...",...,"[{'Source': 'Internet Movie Database', 'Value'...",90,8.1,1026101,tt1392190,movie,01 Sep 2015,"$154,109,060",,
9,No Time to Die,2021,PG-13,08 Oct 2021,163 min,"Action, Adventure, Thriller",Cary Joji Fukunaga,"Neal Purvis, Robert Wade, Cary Joji Fukunaga","Daniel Craig, Ana de Armas, Rami Malek",James Bond has left active service. His peace ...,...,"[{'Source': 'Internet Movie Database', 'Value'...",68,7.3,415836,tt2382320,movie,09 Nov 2021,"$160,891,007",,


In [68]:
data_list_df_partial.describe()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website
count,22,22,22,22,22,22,22,22,22,22,...,22,22,22.0,22,22,22,22,22,22.0,22.0
unique,22,12,3,20,21,12,18,20,20,22,...,22,16,15.0,22,22,1,21,22,1.0,1.0
top,Ambulance,2022,R,08 Apr 2022,136 min,"Action, Sci-Fi","Lana Wachowski, Lilly Wachowski","Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",Two robbers steal an ambulance after their hei...,...,"[{'Source': 'Internet Movie Database', 'Value'...",81,8.0,81221,tt4998632,movie,24 May 2022,"$22,781,115",,
freq,1,5,9,3,2,5,3,3,3,1,...,1,3,3.0,1,1,22,2,1,22.0,22.0


In [69]:
data_list_df_partial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       22 non-null     object
 1   Year        22 non-null     object
 2   Rated       22 non-null     object
 3   Released    22 non-null     object
 4   Runtime     22 non-null     object
 5   Genre       22 non-null     object
 6   Director    22 non-null     object
 7   Writer      22 non-null     object
 8   Actors      22 non-null     object
 9   Plot        22 non-null     object
 10  Language    22 non-null     object
 11  Country     22 non-null     object
 12  Awards      22 non-null     object
 13  Poster      22 non-null     object
 14  Ratings     22 non-null     object
 15  Metascore   22 non-null     object
 16  imdbRating  22 non-null     object
 17  imdbVotes   22 non-null     object
 18  imdbID      22 non-null     object
 19  Type        22 non-null     object
 20  DVD         

In [70]:
data_list_df_partial.to_csv(r'C:\My\Workspace\Python_Projects\Movies_Data\output_files\\movie_list_detail.csv',sep='|', index=False)