In [1]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the OMDb table from ./bases/omdb.csv (path is relative to the working directory).
raw_omdb_df = pd.read_csv("./bases/omdb.csv")

In [3]:
# Load the TMDB table from ./bases/tmdb.csv (path is relative to the working directory).
raw_tmdb_df = pd.read_csv("./bases/tmdb.csv")

In [4]:
def is_valid_json(s):
    """Report whether ``s`` can be decoded by ``json.loads``.

    Returns True when decoding succeeds. Returns False when ``json.loads``
    raises ``json.JSONDecodeError`` (malformed text) or ``TypeError`` (the
    input is not something ``json.loads`` accepts, e.g. ``None`` or a float).
    """
    try:
        json.loads(s)
    except (json.JSONDecodeError, TypeError):
        return False
    else:
        return True

# Flag, per row, whether the stored response text decodes as JSON; the same
# check is applied to both raw tables.
for raw_frame in (raw_omdb_df, raw_tmdb_df):
    raw_frame["is_valid_response"] = raw_frame["response"].apply(is_valid_json)

In [5]:
# Inspect the OMDb rows whose response did not parse as JSON.
raw_omdb_df[~raw_omdb_df["is_valid_response"]]

Unnamed: 0,id,response,is_valid_response


In [6]:
# Inspect the TMDB rows whose response did not parse as JSON.
raw_tmdb_df[~raw_tmdb_df["is_valid_response"]]

Unnamed: 0,id,imdb_id,response,is_valid_response


# OMDb DataFrame setup

In [7]:
# Keep only the rows whose response parsed as JSON. The explicit .copy() makes
# omdb_df an independent frame, so adding the "value" column below is not a
# write into a slice of raw_omdb_df (which triggers SettingWithCopyWarning).
omdb_df = raw_omdb_df.loc[raw_omdb_df["is_valid_response"].astype(bool)].copy()

# Decode each raw JSON string into a Python object.
omdb_df["value"] = omdb_df["response"].apply(json.loads)

In [8]:
# Discard the validity flag and the raw JSON text, leaving the other columns as they are.
omdb_df = omdb_df.drop(columns=["is_valid_response", "response"])

In [9]:
# Display the OMDb frame after the flag and raw response columns were dropped.
omdb_df

Unnamed: 0,id,value
0,tt20114686,"{'Title': 'Femme', 'Year': '2023', 'Rated': 'N..."
1,tt30467885,"{'Title': 'Goebbels and the Führer', 'Year': '..."
2,tt28223926,"{'Title': 'Skincare', 'Year': '2024', 'Rated':..."
3,tt6318608,"{'Title': 'Nightwatch: Demons Are Forever', 'Y..."
4,tt6432466,"{'Title': 'Moxie', 'Year': '2021', 'Rated': 'P..."
...,...,...
3995,tt32064571,"{'Title': 'Nova & Alice', 'Year': '2024', 'Rat..."
3996,tt15600222,"{'Title': 'An Action Hero', 'Year': '2022', 'R..."
3997,tt8290476,"{'Title': 'Blank', 'Year': '2022', 'Rated': 'N..."
3998,tt26731970,"{'Title': 'All the Long Nights', 'Year': '2024..."


In [10]:
# Expand every decoded payload into flat columns, then expose the IMDb
# identifier under "id" as well.
norm_omdb_df = pd.json_normalize(omdb_df["value"]).assign(
    id=lambda flat: flat["imdbID"]
)

In [11]:
# Inner join on "id": a row is kept only when its id occurs on both sides.
final_omdb_df = omdb_df.merge(norm_omdb_df, on="id", how="inner")

# TMDB DataFrame setup

In [14]:
# Keep only the rows whose response parsed as JSON. The explicit .copy() makes
# tmdb_df an independent frame, so adding the "value" column below is not a
# write into a slice of raw_tmdb_df (which triggers SettingWithCopyWarning).
tmdb_df = raw_tmdb_df.loc[raw_tmdb_df["is_valid_response"].astype(bool)].copy()

# Decode each raw JSON string into a Python object.
tmdb_df["value"] = tmdb_df["response"].apply(json.loads)

# The validity flag and the raw JSON text are not used after decoding.
tmdb_df = tmdb_df.drop(columns=["is_valid_response", "response"])

# Flatten the decoded payloads; nested objects become dotted column names.
norm_tmdb_df = pd.json_normalize(tmdb_df["value"])

# Join the flattened fields back onto the source rows by IMDb identifier.
# Non-key columns that exist on both sides receive pandas' default _x/_y suffixes.
final_tmdb_df = pd.merge(tmdb_df, norm_tmdb_df, how="inner", on="imdb_id")

In [15]:
# Mirror the IMDb identifier into an "id" column.
final_tmdb_df = final_tmdb_df.assign(id=final_tmdb_df["imdb_id"])

# Combined DataFrame setup (OMDb + TMDB)

In [16]:
# Inner join of the OMDb and TMDB tables on "id".
df = final_omdb_df.merge(final_tmdb_df, on="id", how="inner")

In [17]:
# List the column labels of the merged frame.
df.columns

Index(['id', 'value_x', 'Title', 'Year', 'Rated', 'Released', 'Runtime',
       'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country',
       'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes',
       'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production', 'Website',
       'Response', 'Season', 'Episode', 'seriesID', 'Error', 'id_x', 'imdb_id',
       'value_y', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id_y', 'origin_country', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'video', 'vote_average', 'vote_count', 'belongs_to_collection.id',
       'belongs_to_collection.name', 'belongs_to_collection.poster_path',
       'belongs_to_collection.backdrop_path'],
      dtype='object')

In [18]:
# Lists and dicts do not store cleanly in a flat table, so they are encoded
# as JSON text before the frame is exported.
def is_complex(val):
    """Return True when ``val`` is a ``list`` or a ``dict``."""
    return isinstance(val, (list, dict))

def serialize_complex_columns(df):
    """Return a copy of ``df`` in which list/dict cells are JSON strings.

    Only columns that contain at least one list or dict are touched, and
    within those columns only the list/dict cells are encoded. Every other
    cell (missing values, plain strings, numbers) is carried over unchanged,
    so missing values stay missing instead of turning into the text
    ``'NaN'``/``'null'``, and scalars that ``json.dumps`` cannot encode do not
    raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialize; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    df_copy = df.copy()
    for col in df.columns:
        if df[col].map(is_complex).any():
            # Encode the containers only; leave every other cell as it is.
            df_copy[col] = df[col].map(
                lambda cell: json.dumps(cell) if is_complex(cell) else cell
            )
    return df_copy

# Build the serialized copy of the merged frame (no file is written in this cell).
df_serialized = serialize_complex_columns(df)

In [19]:
# CSV export, currently disabled (commented out); uncomment the next line to write the file.
#df_serialized.to_csv("./bases/complete_db.csv")

# Display the serialized frame.
df_serialized

Unnamed: 0,id,value_x,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,tt20114686,"{""Title"": ""Femme"", ""Year"": ""2023"", ""Rated"": ""N...",Femme,2023,,05 Apr 2024,99 min,"Drama, Thriller","Sam H. Freeman, Ng Choon Ping","Sam H. Freeman, Ng Choon Ping",...,Released,Seduction is revenge.,Femme,False,6.700,100,,,,
1,tt30467885,"{""Title"": ""Goebbels and the F\u00fchrer"", ""Yea...",Goebbels and the Führer,2024,,12 Sep 2024,135 min,"Biography, Drama, History",Joachim Lang,Joachim Lang,...,Released,,Führer and Seducer,False,6.700,53,,,,
2,tt28223926,"{""Title"": ""Skincare"", ""Year"": ""2024"", ""Rated"":...",Skincare,2024,R,16 Aug 2024,96 min,"Comedy, Horror, Mystery",Austin Peters,"Sam Freilich, Deering Regan, Austin Peters",...,Released,It's just a little cover-up.,Skincare,False,5.939,74,,,,
3,tt6318608,"{""Title"": ""Nightwatch: Demons Are Forever"", ""Y...",Nightwatch: Demons Are Forever,2023,Not Rated,17 May 2024,118 min,"Horror, Mystery, Thriller",Ole Bornedal,Ole Bornedal,...,Released,,Nightwatch: Demons Are Forever,False,6.056,62,1172371.0,Nightwatch Collection,/uOXeb4vKZLTROhKAewzUrP3dfzh.jpg,
4,tt6432466,"{""Title"": ""Moxie"", ""Year"": ""2021"", ""Rated"": ""P...",Moxie,2021,PG-13,03 Mar 2021,111 min,"Comedy, Drama",Amy Poehler,"Jennifer Mathieu, Tamara Chestna, Dylan Meyer",...,Released,Find your voice.,Moxie,False,7.300,807,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3945,tt32064571,"{""Title"": ""Nova & Alice"", ""Year"": ""2024"", ""Rat...",Nova & Alice,2024,,13 Sep 2024,110 min,"Drama, Music",Emma Bucht,"Jonna Bolin-Cullberg, Bengt Braskered",...,Released,One summer can change everything.,Nova & Alice,False,6.600,5,,,,
3946,tt15600222,"{""Title"": ""An Action Hero"", ""Year"": ""2022"", ""R...",An Action Hero,2022,,02 Dec 2022,130 min,"Action, Comedy, Crime",Anirudh Iyer,"Anirudh Iyer, Neeraj Yadav",...,Released,,An Action Hero,False,6.500,61,,,,
3947,tt8290476,"{""Title"": ""Blank"", ""Year"": ""2022"", ""Rated"": ""N...",Blank,2022,,31 Jan 2023,94 min,"Drama, Sci-Fi, Thriller",Natalie Kennedy,Stephen Herman,...,Released,Watching. Helping. Guiding. Controlling.,Blank,False,4.300,25,,,,
3948,tt26731970,"{""Title"": ""All the Long Nights"", ""Year"": ""2024...",All the Long Nights,2024,,09 Feb 2024,119 min,Drama,Shô Miyake,"Shô Miyake, Maiko Seo, Kiyohito Wada",...,Released,,All the Long Nights,False,7.200,18,,,,
