In [None]:
import config
import requests
import pandas as pd
import numpy as np

# Extract movie data

In [None]:
# TMDB credentials are read from the local `config` module.
api_key = config.api_key
token = config.access_token

# Header set passed to every requests.get call in this notebook:
# ask for JSON and authenticate with the bearer token.
headers = dict(accept="application/json", Authorization=f"Bearer {token}")

In [None]:
# my movie data -- the cells below read its 'Name', 'Year' and 'Rating' columns
ds = pd.read_csv("ratings.csv")

In [None]:
# Placeholder for movies without any available data: a single all-NaN row.
# The search loop copies it and fills in only the "movie" column.
no_data = pd.DataFrame(
    index=range(1),
    columns=[
        "adult", "backdrop_path", "genre_ids", "id", "original_language",
        "original_title", "overview", "popularity", "poster_path",
        "release_date", "title", "video", "vote_average", "vote_count",
        "movie",
    ],
)

In [None]:
import warnings
# Filter kept so later cells behave as before; the loop below copies each slice
# before writing to it, so it no longer triggers this warning itself.
warnings.simplefilter(action="ignore", category=pd.errors.SettingWithCopyWarning)

# Search TMDB once per rated movie and sort the responses into two buckets.
# Both are keyed by (name, year) so that two films sharing a title no longer
# overwrite each other:
#   df  - rows whose title equals the searched name exactly, or a copy of the
#         `no_data` placeholder row when the search returned nothing
#   df2 - every candidate returned when none of them matched the name exactly
df = {}

# for movies whose search returned candidates but no exact title match
df2 = {}

total = ds.shape[0]
for i, (idx, value) in enumerate(ds.iterrows()):
    movie = value['Name']
    year  = value['Year']
    key = (movie, str(year))

    # Let requests build the query string so names containing "&", "#", "?" or
    # non-ASCII characters are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={
            "query": movie,
            "include_adult": "false",
            "language": "en-US",
            "primary_release_year": str(year),
            "page": 1,
        },
        headers=headers,
        timeout=30,
    )
    # Fail with a clear HTTP error rather than a KeyError on 'results' below.
    response.raise_for_status()
    results = response.json()['results']

    print(f"{i+1}/{total}")

    if len(results) == 0:
        empty_df = no_data.copy()
        empty_df.loc[:, 'movie'] = movie
        df[key] = empty_df
        continue

    result = pd.DataFrame(results)
    # Copy first so the column assignment below writes to an independent frame.
    movie_data = result.loc[result['title'] == movie].copy()
    if movie_data.empty:
        result.loc[:, 'movie'] = movie
        df2[key] = result
    else:
        movie_data.loc[:, 'movie'] = movie
        df[key] = movie_data

In [None]:
# Stack the per-movie frames into one table each. pd.concat raises ValueError on
# an empty list (db2 is empty whenever every found movie matched its title
# exactly), so fall back to an empty frame with the placeholder's columns.
db = pd.concat(list(df.values()), ignore_index=True) if df else pd.DataFrame(columns=no_data.columns)
db2 = pd.concat(list(df2.values()), ignore_index=True) if df2 else pd.DataFrame(columns=no_data.columns)

In [None]:
# Names of the movies whose row has no value in the "adult" column.
missing_mask = db['adult'].isna()
no_data_movie = db.loc[missing_mask, 'movie'].to_list()

In [None]:
# Save both result sets. index=False (matching exports further down such as
# TMDB_genre.xlsx) keeps the row index out of the file, so it does not come back
# as a spurious "Unnamed: 0" column when TMDB.xlsx is re-read and re-saved.
db.to_excel('TMDB.xlsx', index=False)
db2.to_excel('TMDB2.xlsx', index=False)

In [None]:
# Reload the results saved above; rows with a missing original_title are
# searched again in the next cell.
database = pd.read_excel('TMDB.xlsx')

In [None]:
# Search again for the movies that still have no data, this time without the
# release-year filter. Raw result lists are stored per searched name.
new_list = {}

for m in database.loc[database['original_title'].isna(), 'movie']:
    # This one entry is searched under a different title.
    if m == "A Brother and 7 Siblings":
        m = "1 Kakak 7 Ponakan"
    # Pass the query through `params` so names containing "&", "#" or "?" are
    # percent-encoded instead of breaking the URL.
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"query": m, "include_adult": "false", "language": "en-US", "page": 1},
        headers=headers,
        timeout=30,
    )
    # Fail with a clear HTTP error rather than a KeyError on 'results' below.
    response.raise_for_status()
    res = response.json()['results']
    new_list[m] = res
    # Report the names that still return nothing.
    if len(res) == 0:
        print(m)

In [None]:
# One frame per searched name. Names without hits are skipped, and the result is
# an empty frame when nothing was found at all, because pd.concat raises
# ValueError on an empty list.
new_frames = [
    pd.DataFrame(dat) if isinstance(dat, list) else dat
    for dat in new_list.values()
    if len(dat) > 0
]
new_db = pd.concat(new_frames, ignore_index=True) if new_frames else pd.DataFrame()

In [None]:
# Save the retry search results. No index=False here, so the row index is
# written as the first column of the sheet.
new_db.to_excel("TMDB3.xlsx")

# Extract movie genre data

In [None]:
# Download TMDB's movie genre list and save it for the genre mapping later on.
url = "https://api.themoviedb.org/3/genre/movie/list"
response = requests.get(url, headers=headers, timeout=30)
# Fail with a clear HTTP error rather than a KeyError on 'genres' below.
response.raise_for_status()

movie_genre = pd.DataFrame(response.json()['genres'])
movie_genre.to_excel('TMDB_genre.xlsx', index=False)

# After manual changes in TMDB data
Because a movie can have several TMDB entries (different films that share a title, or duplicate database records), the data has to be cleaned manually in Excel.

In [None]:
# Load the movie table from TMDB.xlsx; the cells below add columns to it and
# finally write it back to the same file.
database = pd.read_excel("TMDB.xlsx")

## Edit the Genre data

In [None]:
# Genre lookup table saved earlier; its "id" and "name" columns are used below.
genre = pd.read_excel("TMDB_genre.xlsx")

In [None]:
# genre_ids is text after the Excel round trip. A list cell is stored as its
# str() form (e.g. "[28, 12]"), so strip the surrounding brackets as well as the
# spaces before splitting; otherwise the first and last tokens keep a bracket
# and cannot be converted to an integer id.
database['genre_ids'] = (
    database['genre_ids']
    .str.replace(' ', '')
    .str.strip('[]')
    .str.split(',')
)

In [None]:
def movie_genre_apply(data):
    """Translate one ``genre_ids`` cell into the matching genre names.

    Parameters
    ----------
    data : iterable of str or int, float, or None
        Genre ids of one movie. A float (NaN) or None marks a movie without
        genre data.

    Returns
    -------
    numpy.ndarray or str
        Object array of names looked up in the module-level ``genre`` table, in
        input order, or the string " " when there is no genre data.
    """
    # isinstance also covers numpy floats, which `type(data) == float` missed.
    if data is None or isinstance(data, float):
        return " "

    genre_names = []
    for raw_id in data:
        # Tolerate leftover list punctuation and whitespace such as "[28" or " 12]".
        token = str(raw_id).strip().strip("[]").strip()
        if not token:
            continue
        matches = genre.loc[genre["id"] == int(token), "name"]
        # extend() handles ids missing from the table (no match) without
        # producing a ragged array.
        genre_names.extend(matches.tolist())
    return np.array(genre_names, dtype=object)

In [None]:
# database['genre'] = database['genre_ids'].apply(lambda x: movie_genre_apply(x)).apply(
#     lambda x: ", ".join(map(str, x))
# )

In [None]:
# check whether any title still appears on more than one row
# value_counts() counts every title in one pass and compares values in their
# own type; the earlier np.sort() turned all titles into strings, so a title
# stored as a number could never match itself.
title_counts = database['title'].value_counts()
# key=str keeps the sort working when titles of mixed types are present
for t in sorted(title_counts[title_counts > 1].index, key=str):
    print(t)

In [None]:
# Cast ids to pandas' nullable integer dtype ('Int64').
database['id'] = database['id'].astype('Int64')

In [None]:
# Id of the first row. NOTE(review): looks like leftover exploration -- the name
# is rebound by the loop over database['id'] further down; confirm and remove.
movie_id = database.loc[0, 'id']

## Extract detailed information for each movie

In [None]:
# movie_id -> single-row DataFrame with that movie's detail record
# (filled in by the next cell)
details = {}

In [None]:
# Fetch the detail record of every movie id and store it as a single-row frame.
total = database['id'].shape[0]
for i, movie_id in enumerate(database['id']):
    print(f"{i+1}/{total}")

    # Rows without an id have nothing to request.
    if pd.isna(movie_id):
        continue

    url = f"https://api.themoviedb.org/3/movie/{movie_id}?language=en-US"

    response = requests.get(url, headers=headers, timeout=30)
    results = response.json()

    # A "success" field is treated as the failure marker, the same way the
    # credits loop further down does; skip such replies so they do not end up
    # as bogus rows in the details table.
    if 'success' in results:
        print(f"skipped {movie_id}: {results.get('status_message')}")
        continue

    details[movie_id] = pd.DataFrame([results])

In [None]:
# Stack the single-row detail frames into one table.
movie_details = pd.concat(list(details.values()), ignore_index=True)

In [None]:
# Save the full, unfiltered detail table.
movie_details.to_excel('TMDB_details.xlsx', index=False)

In [None]:
# Detail fields to keep.
cols = ['budget', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language',
        'release_date', 'revenue', 'runtime', 'spoken_languages', 'tagline', 'title']

# Keep whichever of those columns are present, in movie_details' own column order.
keep_mask = movie_details.columns.isin(cols)
detail_db = movie_details.loc[:, keep_mask].copy()

In [None]:
# Flatten list cells to "A, B" text; anything that is not a list is left as is.
detail_db['origin_country'] = detail_db['origin_country'].apply(
    lambda codes: ", ".join(codes) if isinstance(codes, list) else codes
)

In [None]:
# Reduce each list of language dicts to a comma-separated string of their
# "english_name" values; anything that is not a list is left as is.
detail_db["spoken_languages"] = detail_db["spoken_languages"].apply(
    lambda langs: (
        ", ".join(lang.get("english_name", "") for lang in langs)
        if isinstance(langs, list)
        else langs
    )
)

In [None]:
# Save the trimmed detail table. Note the file name is close to, but distinct
# from, the raw export's name (TMDB_Detail.xlsx vs TMDB_details.xlsx).
detail_db.to_excel('TMDB_Detail.xlsx', index=False)

## Extract cast and crew data

In [None]:
# Download the credits of every movie and collect cast and crew as separate
# tables, one frame per movie id.
crew_dict = {}
cast_dict = {}

# movie name -> movie id for every movie skipped below
# (row without an id, or a reply carrying the "success" failure marker)
no_entries = {}

total = database['id'].shape[0]
for i, (movie_id, movie_name) in database[['id', 'movie']].iterrows():

    print(f"{i+1}/{total}")

    # Rows without an id cannot be looked up; record them instead of sending a
    # request that can only fail.
    if pd.isna(movie_id):
        no_entries[movie_name] = movie_id
        continue

    url = f"https://api.themoviedb.org/3/movie/{movie_id}/credits?language=en-US"

    response = requests.get(url, headers=headers, timeout=30)
    payload = response.json()

    # A "success" field marks a reply without cast/crew lists; keep a record of
    # the skip rather than dropping the movie silently.
    if 'success' in payload:
        no_entries[movie_name] = movie_id
        continue

    cast_rows = payload['cast']
    if len(cast_rows) != 0:
        open_cast = pd.DataFrame(cast_rows)
        open_cast.loc[:, 'movie_id'] = movie_id
        open_cast.loc[:, 'movie_name'] = movie_name
        cast_dict[movie_id] = open_cast

    crew_rows = payload['crew']
    if len(crew_rows) != 0:
        open_crew = pd.DataFrame(crew_rows)
        open_crew.loc[:, 'movie'] = movie_name
        open_crew.loc[:, 'movie_id'] = movie_id
        crew_dict[movie_id] = open_crew

# From here on the two names hold the stacked DataFrames, not the dicts.
crew_dict = pd.concat(list(crew_dict.values()), ignore_index=True)
cast_dict = pd.concat(list(cast_dict.values()), ignore_index=True)

In [None]:
# Despite their names, crew_dict and cast_dict are DataFrames at this point
# (rebound by pd.concat at the end of the previous cell).
crew_dict.to_excel('TMDB_Crew_Raw.xlsx', index=False)
cast_dict.to_excel('TMDB_Cast_Raw.xlsx', index=False)

In [None]:
# Obtain only the main casts
def group_cast(movie_id, max_order=2):
    """Return the leading cast of one movie as a comma-separated string.

    Parameters
    ----------
    movie_id : int
        Id to look up in the ``movie_id`` column of the module-level
        ``cast_dict`` table.
    max_order : int, default 2
        Highest value of the ``order`` column (inclusive) to keep.

    Returns
    -------
    str or None
        The matching names joined with ", " in table order, or None when no
        row matches.
    """
    mask = (cast_dict['order'] <= max_order) & (cast_dict['movie_id'] == movie_id)
    names = cast_dict.loc[mask, 'name']
    if names.empty:
        return None
    return ", ".join(names.tolist())

# One "main cast" string (or None) per movie id.
database['main cast'] = database['id'].apply(group_cast)

In [None]:
# Extract information on main crew (director, composer, DoP)
def search_crew(x, job):
    """Look up who held `job` on the movie with id `x`.

    Reads the module-level ``crew_dict`` table. Returns None when nobody
    matches, the single name when exactly one row matches, and otherwise all
    matching names joined with ", ".
    """
    is_match = (crew_dict['movie_id'] == x) & (crew_dict['job'] == job)
    names = crew_dict.loc[is_match, 'name'].values
    count = names.shape[0]

    if count == 0:
        return None
    if count == 1:
        return names[0]
    return ", ".join(names)

In [None]:
# New column name -> crew job title to look up for it.
crew_columns = {
    'Director': 'Director',
    'Composer': 'Original Music Composer',
    'DoP': 'Director of Photography',
}
for column, job_title in crew_columns.items():
    # Bind job_title as a default so each lambda uses its own job.
    database.loc[:, column] = database['id'].apply(
        lambda x, job_title=job_title: search_crew(x, job_title)
    )

In [None]:
# Overwrites the TMDB.xlsx that was loaded at the start of this section, now
# including the 'main cast', 'Director', 'Composer' and 'DoP' columns.
database.to_excel('TMDB.xlsx', index=False)

## Add movie order based on the rating

In [None]:
# Position of each movie inside its rating group, shifted down by half the
# group size so the values are spread around zero.
rating_groups = ds.groupby('Rating')
position_in_group = rating_groups.cumcount()
half_group_size = rating_groups['Rating'].transform('size') // 2
ds.loc[:, 'Order'] = position_in_group - half_group_size

ds[['Name', 'Rating', 'Order']].to_csv('ratings_order.csv', index=False)