In [10]:
import os

import pandas as pd
import requests

In [11]:
# Location of the raw IMDb export
csv_file_path = 'Resources/movies_raw_imdb.csv'

# Load the export, then keep only the IMDb identifier column ('tconst')
df = pd.read_csv(csv_file_path)
df = df.loc[:, ['tconst']]

In [12]:
# OMDb API key. The OMDB_API_KEY environment variable takes precedence so the
# credential does not have to live in the notebook; the literal is kept only as
# a backward-compatible fallback.
# NOTE(review): a key committed to a notebook should be treated as exposed --
# consider rotating it and dropping the fallback.
api_key = os.environ.get('OMDB_API_KEY', 'ac6cb27f')

# Base URL for the OMDb API
omdb_url = 'http://www.omdbapi.com/'

def get_movie_details(imdb_id):
    """Fetch the OMDb record for a single IMDb ID.

    Parameters
    ----------
    imdb_id : str
        Identifier sent to the API as the ``i`` query parameter.

    Returns
    -------
    dict
        The JSON body of the response, parsed by ``response.json()``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    params = {'apikey': api_key, 'i': imdb_id}
    # requests applies no timeout unless one is given, so a stalled connection
    # would otherwise block the whole enrichment loop. Values are
    # (connect, read) seconds.
    response = requests.get(omdb_url, params=params, timeout=(5, 10))
    return response.json()

# Keys of the OMDb response that get copied into the DataFrame
fields_to_extract = [
    'Title', 'Rated', 'Released', 'Runtime',
    'Genre', 'Director', 'Writer', 'Actors',
    'Language', 'Country', 'Type', 'Metascore',
    'imdbRating', 'imdbVotes', 'BoxOffice', 'Production',
]

In [13]:
# Query OMDb once per IMDb ID and write the selected fields into OMDB_-prefixed columns
for index, imdb_id in df['tconst'].items():
    movie_details = get_movie_details(imdb_id)

    # A field that is absent from the response is stored as an empty string
    for field in fields_to_extract:
        df.at[index, f'OMDB_{field}'] = movie_details.get(field, '')

# Show the frame with the newly added columns
print(df)

ReadTimeout: HTTPConnectionPool(host='www.omdbapi.com', port=80): Read timed out. (read timeout=None)

In [16]:
from requests.exceptions import ReadTimeout
# OMDb API key. The OMDB_API_KEY environment variable takes precedence so the
# credential does not have to live in the notebook; the literal is kept only as
# a backward-compatible fallback.
# NOTE(review): a key committed to a notebook should be treated as exposed --
# consider rotating it and dropping the fallback.
api_key = os.environ.get('OMDB_API_KEY', 'ac6cb27f')
# Base URL for the OMDb API
omdb_url = 'http://www.omdbapi.com/'

def get_movie_details(imdb_id, max_retries=3, timeout=(5, 10)):
    """Fetch the OMDb record for a single IMDb ID, retrying on read timeouts.

    Parameters
    ----------
    imdb_id : str
        Identifier sent to the API as the ``i`` query parameter.
    max_retries : int, optional
        Maximum number of attempts before giving up (default 3).
    timeout : float or tuple, optional
        Passed straight to ``requests.get``; the default ``(5, 10)`` is
        (connect, read) seconds.

    Returns
    -------
    dict or None
        The parsed JSON body on success. ``None`` when every attempt hit a
        read timeout, or when any other request/HTTP/decoding error occurred
        (those are not retried).
    """
    params = {'apikey': api_key, 'i': imdb_id}

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(omdb_url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors (4xx and 5xx)
            return response.json()
        except ReadTimeout:
            # Only announce a retry when another attempt will actually follow.
            if attempt < max_retries:
                print(f"Read timeout occurred for movie with IMDb ID {imdb_id}. Retrying...")
            else:
                print(f"Read timeout occurred for movie with IMDb ID {imdb_id}. No retries left.")
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decoding error is not a RequestException subclass.
            print(f"Error: {e}")
            break  # Other failures are not retried

    # Reached only when no attempt returned data
    print(f"Unable to fetch data for movie with IMDb ID {imdb_id}")
    return None

# Keys of the OMDb response that get copied into the DataFrame
fields_to_extract = [
    'Title',
    'Rated',
    'Released',
    'Runtime',
    'Genre',
    'Director',
    'Writer',
    'Actors',
    'Language',
    'Country',
    'Type',
    'Metascore',
    'imdbRating',
    'imdbVotes',
    'BoxOffice',
    'Production',
]

# Fill in OMDb fields for every row that is not fully populated yet, so the
# cell can be re-run to resume after an interrupted or partially failed pass.
omdb_columns = [f'OMDB_{field}' for field in fields_to_extract]

# Create any missing OMDB_ columns up front. Without this, the "already
# populated" check below raises KeyError on a frame that no earlier run has
# started to fill in.
for column_name in omdb_columns:
    if column_name not in df.columns:
        df[column_name] = None

for index, row in df.iterrows():
    # Skip the API call when every OMDb field already holds a value
    if all(pd.notna(row[column_name]) for column_name in omdb_columns):
        continue

    movie_details = get_movie_details(row['tconst'])

    if not movie_details:
        # Nothing was retrieved: leave the row's fields missing so that a
        # later re-run of this cell tries the lookup again.
        continue

    # A field that is absent from the response is stored as an empty string
    for field, column_name in zip(fields_to_extract, omdb_columns):
        df.at[index, column_name] = movie_details.get(field, '')



Read timeout occurred for movie with IMDb ID tt28230767. Retrying...
Read timeout occurred for movie with IMDb ID tt29001578. Retrying...
Read timeout occurred for movie with IMDb ID tt29033317. Retrying...
Read timeout occurred for movie with IMDb ID tt29044029. Retrying...
Read timeout occurred for movie with IMDb ID tt29044029. Retrying...
Read timeout occurred for movie with IMDb ID tt29054192. Retrying...
Read timeout occurred for movie with IMDb ID tt29173194. Retrying...
Read timeout occurred for movie with IMDb ID tt29254618. Retrying...
Read timeout occurred for movie with IMDb ID tt29256454. Retrying...
Read timeout occurred for movie with IMDb ID tt29256454. Retrying...
Read timeout occurred for movie with IMDb ID tt29258480. Retrying...
Read timeout occurred for movie with IMDb ID tt29258480. Retrying...
Read timeout occurred for movie with IMDb ID tt29258671. Retrying...
Read timeout occurred for movie with IMDb ID tt29258696. Retrying...
Read timeout occurred for movie wi

In [17]:
# Persist the enriched (still unformatted) frame
df.to_csv(path_or_buf="./Resources/larger_movies_raw.csv")

# Format column data types from objects to numbers

In [29]:
# Examine data type of each column
# Note: the assignment binds a second name to the same DataFrame object as df;
# no copy is made at this point.
movies_df=df
movies_df.dtypes

tconst             object
OMDB_Title         object
OMDB_Rated         object
OMDB_Released      object
OMDB_Runtime       object
OMDB_Genre         object
OMDB_Director      object
OMDB_Writer        object
OMDB_Actors        object
OMDB_Language      object
OMDB_Country       object
OMDB_Type          object
OMDB_Metascore     object
OMDB_imdbRating    object
OMDB_imdbVotes     object
OMDB_BoxOffice     object
OMDB_Production    object
dtype: object

In [30]:
# Delete all "N/A" placeholders by replacing them with empty strings
movies_df = movies_df.replace('N/A', '')

# Convert number strings with commas to float.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
movies_df[['OMDB_imdbVotes']] = movies_df[['OMDB_imdbVotes']].replace(r'[,\s]', '', regex=True)
movies_df['OMDB_imdbVotes'] = pd.to_numeric(movies_df['OMDB_imdbVotes'], errors='coerce')

# Convert runtime to float after removing " min".
# The character class strips every 'm', 'i', 'n' and whitespace character;
# anything that is still not numeric afterwards becomes NaN via errors='coerce'.
movies_df[['OMDB_Runtime']] = movies_df[['OMDB_Runtime']].replace(r'[min\s]', '', regex=True)
movies_df['OMDB_Runtime'] = pd.to_numeric(movies_df['OMDB_Runtime'], errors='coerce')

# Change the release date to datetime type
# (original note: the release dates may have inconsistent formatting / typos)
movies_df["OMDB_Released"] = pd.to_datetime(movies_df["OMDB_Released"])

movies_df.dtypes

tconst                     object
OMDB_Title                 object
OMDB_Rated                 object
OMDB_Released      datetime64[ns]
OMDB_Runtime              float64
OMDB_Genre                 object
OMDB_Director              object
OMDB_Writer                object
OMDB_Actors                object
OMDB_Language              object
OMDB_Country               object
OMDB_Type                  object
OMDB_Metascore             object
OMDB_imdbRating            object
OMDB_imdbVotes            float64
OMDB_BoxOffice             object
OMDB_Production            object
dtype: object

In [31]:
# Persist the frame after dtype formatting
movies_df.to_csv(path_or_buf="./Resources/larger_movies_formatted.csv")

# Minor Processing

## Convert dates into year and month columns, then remove the date column

In [32]:
# Parse the release column as datetime, store it back, and display the frame
release_dates = pd.to_datetime(movies_df["OMDB_Released"])
movies_df["OMDB_Released"] = release_dates
movies_df

Unnamed: 0,tconst,OMDB_Title,OMDB_Rated,OMDB_Released,OMDB_Runtime,OMDB_Genre,OMDB_Director,OMDB_Writer,OMDB_Actors,OMDB_Language,OMDB_Country,OMDB_Type,OMDB_Metascore,OMDB_imdbRating,OMDB_imdbVotes,OMDB_BoxOffice,OMDB_Production
0,tt0013274,Istoriya grazhdanskoy voyny,,2022-05-10,94.0,Documentary,"Nikolai Izvolov, Dziga Vertov",,,,Soviet Union,movie,,,58.0,,
1,tt0015414,La tierra de los toros,,NaT,60.0,,Musidora,,"Antonio Cañero, Musidora",,"Spain, France",movie,,6.6,16.0,,
2,tt0015724,Dama de noche,,1993-03-18,102.0,"Drama, Mystery, Romance",Eva López Sánchez,"Eva López Sánchez, David Martin del Campo","Rafael Sánchez Navarro, Cecilia Toussaint, Mig...",Spanish,Mexico,movie,,5.8,27.0,,
3,tt0035423,Kate & Leopold,PG-13,2001-12-25,118.0,"Comedy, Fantasy, Romance",James Mangold,"Steven Rogers, James Mangold","Meg Ryan, Hugh Jackman, Liev Schreiber","English, French",United States,movie,44,6.4,87977.0,"$47,121,859",
4,tt0036606,"Another Time, Another Place",R,1984-05-11,118.0,"Drama, War",Michael Radford,"Jessie Kesson, John Francis Lane, Michael Radford","Phyllis Logan, Giovanni Mauriello, Gianluca Fa...","English, Italian",United Kingdom,movie,,6.5,338.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221928,tt9916270,Il talento del calabrone,,2020-11-18,84.0,Thriller,Giacomo Cimini,"Giacomo Cimini, Lorenzo Collalti","Sergio Castellitto, Lorenzo Richelmy, Anna Fog...",Italian,"Italy, Spain",movie,,5.8,1448.0,,
221929,tt9916362,Coven,TV-MA,2020-10-02,92.0,"Drama, History",Pablo Agüero,"Pablo Agüero, Katell Guillou","Amaia Aberasturi, Alex Brendemühl, Daniel Fanego","Spanish, Basque","Spain, France, Argentina",movie,,6.4,5603.0,,
221930,tt9916428,The Secret of China,,2019-08-08,,"Adventure, History, War",Jixing Wang,,"Kenan Heppe, Wang Peng Kai, Valery Gadreau","Chinese, English",China,movie,,3.5,17.0,,
221931,tt9916538,Kuambil Lagi Hatiku,,2019-03-21,123.0,Drama,Azhar Kinoi Lubis,"Arief Ash Siddiq, Rino Sarjono, Salman Aristo","Lala Karmela, Cut Mini Theo, Sahil Shah",Indonesian,Indonesia,movie,,,6.0,,


In [33]:
# Show the dtype of each column in movies_df
movies_df.dtypes

tconst                     object
OMDB_Title                 object
OMDB_Rated                 object
OMDB_Released      datetime64[ns]
OMDB_Runtime              float64
OMDB_Genre                 object
OMDB_Director              object
OMDB_Writer                object
OMDB_Actors                object
OMDB_Language              object
OMDB_Country               object
OMDB_Type                  object
OMDB_Metascore             object
OMDB_imdbRating            object
OMDB_imdbVotes            float64
OMDB_BoxOffice             object
OMDB_Production            object
dtype: object

In [34]:
# Derive year and month columns from the release date, then drop the date itself
release_index = pd.DatetimeIndex(movies_df['OMDB_Released'])
movies_df = (
    movies_df
    .assign(ReleaseYear=release_index.year, ReleaseMonth=release_index.month)
    .drop(columns=['OMDB_Released'])
)
movies_df.head()

Unnamed: 0,tconst,OMDB_Title,OMDB_Rated,OMDB_Runtime,OMDB_Genre,OMDB_Director,OMDB_Writer,OMDB_Actors,OMDB_Language,OMDB_Country,OMDB_Type,OMDB_Metascore,OMDB_imdbRating,OMDB_imdbVotes,OMDB_BoxOffice,OMDB_Production,ReleaseYear,ReleaseMonth
0,tt0013274,Istoriya grazhdanskoy voyny,,94.0,Documentary,"Nikolai Izvolov, Dziga Vertov",,,,Soviet Union,movie,,,58.0,,,2022.0,5.0
1,tt0015414,La tierra de los toros,,60.0,,Musidora,,"Antonio Cañero, Musidora",,"Spain, France",movie,,6.6,16.0,,,,
2,tt0015724,Dama de noche,,102.0,"Drama, Mystery, Romance",Eva López Sánchez,"Eva López Sánchez, David Martin del Campo","Rafael Sánchez Navarro, Cecilia Toussaint, Mig...",Spanish,Mexico,movie,,5.8,27.0,,,1993.0,3.0
3,tt0035423,Kate & Leopold,PG-13,118.0,"Comedy, Fantasy, Romance",James Mangold,"Steven Rogers, James Mangold","Meg Ryan, Hugh Jackman, Liev Schreiber","English, French",United States,movie,44.0,6.4,87977.0,"$47,121,859",,2001.0,12.0
4,tt0036606,"Another Time, Another Place",R,118.0,"Drama, War",Michael Radford,"Jessie Kesson, John Francis Lane, Michael Radford","Phyllis Logan, Giovanni Mauriello, Gianluca Fa...","English, Italian",United Kingdom,movie,,6.5,338.0,,,1984.0,5.0


In [35]:
# List the column labels of movies_df
movies_df.columns

Index(['tconst', 'OMDB_Title', 'OMDB_Rated', 'OMDB_Runtime', 'OMDB_Genre',
       'OMDB_Director', 'OMDB_Writer', 'OMDB_Actors', 'OMDB_Language',
       'OMDB_Country', 'OMDB_Type', 'OMDB_Metascore', 'OMDB_imdbRating',
       'OMDB_imdbVotes', 'OMDB_BoxOffice', 'OMDB_Production', 'ReleaseYear',
       'ReleaseMonth'],
      dtype='object')

In [36]:
# Consolidated rating buckets
# reference https://en.wikipedia.org/wiki/Motion_Picture_Association_film_rating_system
kids = ['PG', 'G', 'TV-PG', 'TV-G', 'GP', 'M/PG', 'M']
teens = ['PG-13', 'TV-14', '16+', '13+']
adults = ['R', 'TV-MA', 'NC-17', 'X', '18+']
# Listed for reference only: these labels, like every other value that is not
# in one of the three lists above (including empty or missing ratings), end up
# as 'unknown'.
Unknown = ['Not Rated', 'Approved', 'Unrated', 'Passed']

# Build one rating -> bucket lookup. Buckets are applied in reverse priority
# so that, should a label ever appear in two lists, 'kids' wins over 'teens'
# and 'teens' over 'adults'.
rating_groups = {}
for group, ratings in (('adults', adults), ('teens', teens), ('kids', kids)):
    rating_groups.update(dict.fromkeys(ratings, group))

# Vectorised replacement for a per-row loop: unmapped values come back as NaN
# from map() and are then labelled 'unknown'.
# Note: not idempotent -- re-running this cell on already consolidated labels
# turns all of them into 'unknown', because 'kids'/'teens'/'adults' are not
# keys of the lookup.
movies_df['OMDB_Rated'] = movies_df['OMDB_Rated'].map(rating_groups).fillna('unknown')

movies_df['OMDB_Rated'].value_counts()

unknown    188478
adults      19047
teens        7918
kids         6490
Name: OMDB_Rated, dtype: int64

In [28]:
# Persist the fully processed frame
movies_df.to_csv(path_or_buf="./Resources/larger_movies_processed.csv")