In [130]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [131]:
# Load the raw OMDB dump; its "response" column holds the payload text that is
# validated and parsed as JSON in the cells below.
raw_omdb_df = pd.read_csv("./bases/omdb.csv")

In [132]:
# Load the raw TMDB dump; its "response" column holds the payload text that is
# validated and parsed as JSON in the cells below.
raw_tmdb_df = pd.read_csv("./bases/tmdb.csv")

In [133]:
def is_valid_json(s):
    """Report whether `s` can be decoded by `json.loads`.

    Returns False both for malformed JSON text and for inputs of a type
    `json.loads` rejects (for example None or a float), True otherwise.
    """
    try:
        json.loads(s)
    except (json.JSONDecodeError, TypeError):
        return False
    else:
        return True

# Flag every raw row with whether its "response" text parses as JSON.
for raw_frame in (raw_omdb_df, raw_tmdb_df):
    raw_frame["is_valid_response"] = raw_frame["response"].apply(is_valid_json)

In [134]:
# Show the OMDB rows whose "response" failed JSON validation.
raw_omdb_df[~raw_omdb_df["is_valid_response"]]

Unnamed: 0,id,response,is_valid_response


In [135]:
# Show the TMDB rows whose "response" failed JSON validation.
raw_tmdb_df[~raw_tmdb_df["is_valid_response"]]

Unnamed: 0,id,imdb_id,response,is_valid_response


# OMDB Setup Dataframe

In [136]:
# Keep only the rows whose payload parsed. The explicit .copy() makes omdb_df
# an independent frame, so adding a column below does not write into a slice
# of raw_omdb_df (which is what triggers SettingWithCopyWarning).
omdb_df = raw_omdb_df[raw_omdb_df["is_valid_response"].astype(bool)].copy()

# Decode each JSON payload into a Python object.
omdb_df["value"] = omdb_df["response"].apply(json.loads)

In [137]:
# The validity flag and the raw text are no longer needed once "value" exists.
omdb_df = omdb_df.drop(columns=["is_valid_response", "response"])

In [138]:
# Inspect the result: one row per id with its decoded "value" payload.
omdb_df

Unnamed: 0,id,value
0,tt20114686,"{'Title': 'Femme', 'Year': '2023', 'Rated': 'N..."
1,tt30467885,"{'Title': 'Goebbels and the Führer', 'Year': '..."
2,tt28223926,"{'Title': 'Skincare', 'Year': '2024', 'Rated':..."
3,tt6318608,"{'Title': 'Nightwatch: Demons Are Forever', 'Y..."
4,tt6432466,"{'Title': 'Moxie', 'Year': '2021', 'Rated': 'P..."
...,...,...
3995,tt32064571,"{'Title': 'Nova & Alice', 'Year': '2024', 'Rat..."
3996,tt15600222,"{'Title': 'An Action Hero', 'Year': '2022', 'R..."
3997,tt8290476,"{'Title': 'Blank', 'Year': '2022', 'Rated': 'N..."
3998,tt26731970,"{'Title': 'All the Long Nights', 'Year': '2024..."


In [139]:
# Flatten each decoded payload into one column per field, then mirror
# "imdbID" under the name "id" so the frame shares a key with omdb_df.
norm_omdb_df = pd.json_normalize(omdb_df["value"])
norm_omdb_df = norm_omdb_df.assign(id=norm_omdb_df["imdbID"])

In [140]:
# Attach the flattened fields to the (id, value) rows, matching on "id".
final_omdb_df = omdb_df.merge(norm_omdb_df, how="inner", on="id")

# TMDB Setup Dataframe

In [141]:
# Preview the flattened TMDB payloads. The frame is built inline from
# raw_tmdb_df because `norm_tmdb_df` is only assigned in a later cell, so
# referencing it here fails with NameError on a fresh kernel (Restart & Run All).
pd.json_normalize(
    raw_tmdb_df.loc[raw_tmdb_df["is_valid_response"].astype(bool), "response"].map(json.loads)
)

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/1YMrOtrW7b4pL2lfD8UciZPOJGs.jpg,,20000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",https://www.focusfeatures.com/conclave,974576,tt20215234,[US],en,...,Released,What happens behind these walls will change ev...,Conclave,False,7.211,2227,,,,
1,False,/hbFVmSYKvrqr9JuKVZ8l71kRQew.jpg,,20000000,"[{'id': 18, 'name': 'Drama'}]",https://a24films.com/films/babygirl,1097549,tt30057084,[US],en,...,Released,Get exactly what you want.,Babygirl,False,5.787,683,,,,
2,False,/87GU2ifjNYtgYtcRH1NNH1P4ODo.jpg,,6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",https://anora.film,1064213,tt28607951,[US],en,...,Released,Love is a hustle.,Anora,False,7.100,2103,,,,
3,False,/euYIwmwkmz95mnXvufEmbL6ovhZ.jpg,,310000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.gladiator.movie,558449,tt9218128,[US],en,...,Released,Prepare to be entertained.,Gladiator II,False,6.751,3275,1069584.0,Gladiator Collection,/bk6nx2rGNdlKtBsB9XcrclVKItv.jpg,/1VdLvSIeHuwqCT13H9EafxCacGB.jpg
4,False,/aMbKYfaexixvsBZKc5whYO7yibR.jpg,,20000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",http://theordermov.com,1082195,tt26625693,[US],en,...,Released,Based on the chilling true story.,The Order,False,6.573,587,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,False,/yUvVYkpOojoIEWn1HJlrU1nR5bU.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,1278076,tt32064571,[SE],sv,...,Released,One summer can change everything.,Nova & Alice,False,6.600,5,,,,
3946,False,/gmnX4h9BQrBZOVjzfr2c3eBRmx3.jpg,,0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,882826,tt15600222,[IN],hi,...,Released,,An Action Hero,False,6.500,61,,,,
3947,False,/y4IgPGAx5msaeeCXEtvfWdmmGxP.jpg,,275650,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,871891,tt8290476,[GB],en,...,Released,Watching. Helping. Guiding. Controlling.,Blank,False,4.300,25,,,,
3948,False,/9TZY0Jk2G2papFIWgQzY1QwZklo.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,1086011,tt26731970,[JP],ja,...,Released,,All the Long Nights,False,7.200,18,,,,


In [142]:
# Keep only the rows whose payload parsed. The explicit .copy() makes tmdb_df
# an independent frame, so adding a column below does not write into a slice
# of raw_tmdb_df (which is what triggers SettingWithCopyWarning).
tmdb_df = raw_tmdb_df[raw_tmdb_df["is_valid_response"].astype(bool)].copy()

# Decode each JSON payload into a Python object.
tmdb_df["value"] = tmdb_df["response"].apply(json.loads)

# The validity flag and the raw text are no longer needed once "value" exists.
tmdb_df = tmdb_df.drop(["is_valid_response", "response"], axis=1)

# One column per JSON field; nested objects become dotted column names
# (e.g. "belongs_to_collection.id").
norm_tmdb_df = pd.json_normalize(tmdb_df["value"])

# Attach the flattened fields to the original rows, matching on "imdb_id".
# Both inputs carry an "id" column, so the result holds "id_x" and "id_y".
final_tmdb_df = pd.merge(tmdb_df, norm_tmdb_df, how="inner", on="imdb_id")

In [148]:
# Expose the IMDb identifier under "id", the key used to join with the OMDB frame.
final_tmdb_df = final_tmdb_df.assign(id=final_tmdb_df["imdb_id"])

# Complete Dataframe Setup

In [156]:
# Combine both sources, keeping only titles present in OMDB and TMDB alike.
df = final_omdb_df.merge(final_tmdb_df, how="inner", on="id")

In [161]:
# List the merged columns; the _x/_y suffixes are pandas' defaults for column
# names that existed on both sides of a merge.
df.columns

Index(['id', 'value_x', 'Title', 'Year', 'Rated', 'Released', 'Runtime',
       'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country',
       'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes',
       'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production', 'Website',
       'Response', 'Season', 'Episode', 'seriesID', 'Error', 'id_x', 'imdb_id',
       'value_y', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id_y', 'origin_country', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'video', 'vote_average', 'vote_count', 'belongs_to_collection.id',
       'belongs_to_collection.name', 'belongs_to_collection.poster_path',
       'belongs_to_collection.backdrop_path'],
      dtype='object')

In [183]:
# CSV cells cannot hold nested values, so columns containing lists or dicts
# are JSON-encoded before the frame is saved.
def is_complex(val):
    """Return True when `val` is a list or a dict, False otherwise."""
    return isinstance(val, (list, dict))

def _dump_unless_missing(val):
    """JSON-encode `val`, passing None and float NaN through unchanged.

    `json.dumps(float("nan"))` yields the bare token NaN, which is not valid
    JSON; keeping missing values as-is leaves them missing in the result.
    """
    if val is None or (isinstance(val, float) and val != val):  # NaN != NaN
        return val
    return json.dumps(val)

def serialize_complex_columns(df):
    """Return a copy of `df` whose list/dict-bearing columns hold JSON text.

    A column is converted when at least one of its values is a list or dict.
    In such a column every non-missing value is passed through `json.dumps`
    (so scalars are encoded too), while None and NaN are left untouched.
    Columns without any list/dict value are copied unchanged, and `df` itself
    is not modified.

    Raises:
        TypeError: if a converted column holds a value `json.dumps` cannot encode.
    """
    df_copy = df.copy()
    for col in df.columns:
        if df[col].map(is_complex).any():
            df_copy[col] = df[col].map(_dump_unless_missing)
    return df_copy

# JSON-encode the nested columns; the result is written to disk in the next cell.
df_serialized = serialize_complex_columns(df)

In [184]:
# Persist the merged dataset. With to_csv's default index=True the row index is
# written as an unnamed first column.
df_serialized.to_csv("./bases/complete_db.csv")