In [230]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [231]:
# TMDB extract read from ./bases/tmdb.csv; its `response` column is validated and
# parsed as JSON in later cells.
raw_tmdb_df = pd.read_csv("./bases/tmdb.csv")

In [232]:
# Second TMDB extract; rows of raw_tmdb_df whose imdb_id also appears here are
# discarded further down (see old_raw_tmdb_df).
new_raw_tmdb_df = pd.read_csv("./bases/new/final_extracted_tmdb.csv")

In [233]:
def is_valid_json(s):
    """Tell whether ``s`` can be decoded by ``json.loads``.

    Returns ``True`` when decoding succeeds and ``False`` when ``json.loads``
    raises ``json.JSONDecodeError`` (malformed text) or ``TypeError`` (input
    that is not str/bytes/bytearray, e.g. ``None`` or a float NaN).
    """
    try:
        json.loads(s)
    except (json.JSONDecodeError, TypeError):
        return False
    else:
        return True

# Flag, on both TMDB extracts, whether each raw `response` string parses as JSON.
for _frame in (raw_tmdb_df, new_raw_tmdb_df):
    _frame["is_valid_response"] = _frame["response"].apply(is_valid_json)

In [234]:
# Rows of raw_tmdb_df whose response did not parse as JSON.
raw_tmdb_df[~raw_tmdb_df["is_valid_response"]]

Unnamed: 0,id,imdb_id,response,is_valid_response


In [235]:
# Rows of new_raw_tmdb_df whose response did not parse as JSON.
new_raw_tmdb_df[~new_raw_tmdb_df["is_valid_response"]]

Unnamed: 0,id,imdb_id,response,is_valid_response


In [236]:
# (rows, columns) of the second TMDB extract, including the is_valid_response flag.
new_raw_tmdb_df.shape

(6510, 4)

In [237]:
# (rows, columns) of the first TMDB extract, including the is_valid_response flag.
raw_tmdb_df.shape

(3950, 4)

In [238]:
# Keep only the rows of the first extract whose imdb_id is absent from the
# second extract, so the second extract wins wherever the two overlap.
old_raw_tmdb_df = raw_tmdb_df.loc[
    ~raw_tmdb_df["imdb_id"].isin(new_raw_tmdb_df["imdb_id"])
]

In [239]:
# (rows, columns) left in the first extract after removing imdb_ids present in the second.
old_raw_tmdb_df.shape

(3177, 4)

In [240]:
# Stack the retained rows of the first extract on top of the second extract.
# No ignore_index is passed, so each input keeps its own index labels.
tmdb_df = pd.concat(objs=[old_raw_tmdb_df, new_raw_tmdb_df], axis=0)

In [241]:
# True only when every row of tmdb_df is flagged as valid JSON.
bool(tmdb_df["is_valid_response"].eq(True).all())

True

In [242]:
# Parse each raw response into a dict (new `value` column), then drop the two
# helper columns that are no longer needed.
tmdb_df = (
    tmdb_df
    .assign(value=lambda frame: frame["response"].apply(json.loads))
    .drop(columns=["is_valid_response", "response"])
)

# Flatten the parsed dicts into one column per JSON field.
norm_tmdb_df = pd.json_normalize(tmdb_df["value"])

# Re-attach the flattened fields to the raw rows by imdb_id.
# NOTE(review): this join assumes imdb_id is unique and non-null on both sides;
# repeated keys would multiply rows — confirm against the data.
final_tmdb_df = tmdb_df.merge(norm_tmdb_df, how="inner", on="imdb_id")

In [243]:
# Inspect the TMDB frame after the flattened JSON fields were merged back in.
final_tmdb_df

Unnamed: 0,id_x,imdb_id,value,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id_y,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,974576,tt20215234,"{'adult': False, 'backdrop_path': '/1YMrOtrW7b...",False,/1YMrOtrW7b4pL2lfD8UciZPOJGs.jpg,,20000000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",https://www.focusfeatures.com/conclave,974576,...,Released,What happens behind these walls will change ev...,Conclave,False,7.211,2227,,,,
1,1097549,tt30057084,"{'adult': False, 'backdrop_path': '/hbFVmSYKvr...",False,/hbFVmSYKvrqr9JuKVZ8l71kRQew.jpg,,20000000,"[{'id': 18, 'name': 'Drama'}]",https://a24films.com/films/babygirl,1097549,...,Released,Get exactly what you want.,Babygirl,False,5.787,683,,,,
2,1064213,tt28607951,"{'adult': False, 'backdrop_path': '/87GU2ifjNY...",False,/87GU2ifjNYtgYtcRH1NNH1P4ODo.jpg,,6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",https://anora.film,1064213,...,Released,Love is a hustle.,Anora,False,7.100,2103,,,,
3,558449,tt9218128,"{'adult': False, 'backdrop_path': '/euYIwmwkmz...",False,/euYIwmwkmz95mnXvufEmbL6ovhZ.jpg,,310000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.gladiator.movie,558449,...,Released,Prepare to be entertained.,Gladiator II,False,6.751,3275,1069584.0,Gladiator Collection,/bk6nx2rGNdlKtBsB9XcrclVKItv.jpg,/1VdLvSIeHuwqCT13H9EafxCacGB.jpg
4,1082195,tt26625693,"{'adult': False, 'backdrop_path': '/aMbKYfaexi...",False,/aMbKYfaexixvsBZKc5whYO7yibR.jpg,,20000000,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",http://theordermov.com,1082195,...,Released,Based on the chilling true story.,The Order,False,6.573,587,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9682,998999,tt13260630,"{'adult': False, 'backdrop_path': '/aKUuZDGejA...",False,/aKUuZDGejALdxwLwDndwSyQQRIY.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,998999,...,Released,Miracles happen when you let go of the reins.,Adeline,False,6.800,8,,,,
9683,999142,tt33298252,"{'adult': False, 'backdrop_path': '/tZ7ki6JMgE...",False,/tZ7ki6JMgEliY4u4XJmABZfAT2g.jpg,,2900000,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,999142,...,Released,,Rabia,False,6.471,17,,,,
9684,999205,tt8889818,"{'adult': False, 'backdrop_path': '/tviTAFuCFR...",False,/tviTAFuCFRLElylvBAMPjdRnCDZ.jpg,,0,"[{'id': 28, 'name': 'Action'}, {'id': 9648, 'n...",,999205,...,Released,,Stowaway,False,6.298,84,,,,
9685,999278,tt15307448,"{'adult': False, 'backdrop_path': '/sBk6nzxhtT...",False,/sBk6nzxhtTkrn9JZj32vaRki0QG.jpg,,250000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",https://www.almostfamousproduction.it/,999278,...,Released,,The Goldsmith,False,5.966,87,,,,


# OMDB

In [244]:
# NOTE(review): raw_omdb_df is used by the following cells but was never loaded
# anywhere in this notebook, so a fresh "Restart & Run All" stopped with a
# NameError. The path below is inferred from the TMDB layout
# (./bases/tmdb.csv next to ./bases/new/final_extracted_tmdb.csv) — confirm it.
raw_omdb_df = pd.read_csv("./bases/omdb.csv")

# Second OMDB extract; rows of raw_omdb_df whose id also appears here are
# discarded further down (see old_raw_omdb_df).
new_raw_omdb_df = pd.read_csv("./bases/new/final_extracted_omdb.csv")

In [245]:
# is_valid_json is defined once, in the TMDB section above; the identical second
# definition was removed from this cell so the check has a single source of truth.
# Flag, on both OMDB extracts, whether each raw `response` string parses as JSON.
raw_omdb_df["is_valid_response"] = raw_omdb_df["response"].apply(is_valid_json)
new_raw_omdb_df["is_valid_response"] = new_raw_omdb_df["response"].apply(is_valid_json)

In [246]:
# Keep only the rows of the first OMDB extract whose id is absent from the
# second extract, so the second extract wins wherever the two overlap.
old_raw_omdb_df = raw_omdb_df.loc[
    ~raw_omdb_df["id"].isin(new_raw_omdb_df["id"])
]

In [247]:
# Stack the retained rows of the first extract on top of the second extract and
# parse every raw response into a dict stored in a new `value` column.
omdb_df = (
    pd.concat(objs=[old_raw_omdb_df, new_raw_omdb_df], axis=0)
    .assign(value=lambda frame: frame["response"].apply(json.loads))
)


In [248]:
# Inspect the combined OMDB frame, which now carries the parsed `value` column.
omdb_df

Unnamed: 0,id,response,is_valid_response,value
0,tt20114686,"{""Title"": ""Femme"", ""Year"": ""2023"", ""Rated"": ""N...",True,"{'Title': 'Femme', 'Year': '2023', 'Rated': 'N..."
1,tt30467885,"{""Title"": ""Goebbels and the F\u00fchrer"", ""Yea...",True,"{'Title': 'Goebbels and the Führer', 'Year': '..."
2,tt28223926,"{""Title"": ""Skincare"", ""Year"": ""2024"", ""Rated"":...",True,"{'Title': 'Skincare', 'Year': '2024', 'Rated':..."
3,tt6318608,"{""Title"": ""Nightwatch: Demons Are Forever"", ""Y...",True,"{'Title': 'Nightwatch: Demons Are Forever', 'Y..."
4,tt6432466,"{""Title"": ""Moxie"", ""Year"": ""2021"", ""Rated"": ""P...",True,"{'Title': 'Moxie', 'Year': '2021', 'Rated': 'P..."
...,...,...,...,...
6589,tt9887520,"{""Title"": ""Season of Love"", ""Year"": ""2019"", ""R...",True,"{'Title': 'Season of Love', 'Year': '2019', 'R..."
6590,tt9894470,"{""Title"": ""VFW"", ""Year"": ""2019"", ""Rated"": ""Not...",True,"{'Title': 'VFW', 'Year': '2019', 'Rated': 'Not..."
6591,tt9896916,"{""Title"": ""Pilgrim's Progress"", ""Year"": ""2019""...",True,"{'Title': 'Pilgrim's Progress', 'Year': '2019'..."
6592,tt9900782,"{""Title"": ""Kaithi"", ""Year"": ""2019"", ""Rated"": ""...",True,"{'Title': 'Kaithi', 'Year': '2019', 'Rated': '..."


In [249]:
# Flatten the parsed OMDB dicts into one column per JSON field and mirror the
# `imdbID` field under the name `id`, the key used by the merge that follows.
norm_omdb_df = pd.json_normalize(omdb_df["value"]).assign(
    id=lambda frame: frame["imdbID"]
)

In [250]:
# The validity flag and the raw JSON text are no longer needed once `value` exists.
omdb_df = omdb_df.drop(columns=["is_valid_response", "response"])

In [251]:
# Re-attach the flattened OMDB fields to the raw rows via the shared `id` key.
# NOTE(review): assumes `id` is unique on both sides; repeated keys would
# multiply rows — confirm against the data.
final_omdb_df = omdb_df.merge(norm_omdb_df, on="id", how="inner")

In [252]:
# Inspect the OMDB frame after the flattened JSON fields were merged back in.
final_omdb_df

Unnamed: 0,id,value,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,...,Type,DVD,BoxOffice,Production,Website,Response,Error,Season,Episode,seriesID
0,tt20114686,"{'Title': 'Femme', 'Year': '2023', 'Rated': 'N...",Femme,2023,,05 Apr 2024,99 min,"Drama, Thriller","Sam H. Freeman, Ng Choon Ping","Sam H. Freeman, Ng Choon Ping",...,movie,,"$187,053",,,True,,,,
1,tt30467885,"{'Title': 'Goebbels and the Führer', 'Year': '...",Goebbels and the Führer,2024,,12 Sep 2024,135 min,"Biography, Drama, History",Joachim Lang,Joachim Lang,...,movie,,,,,True,,,,
2,tt28223926,"{'Title': 'Skincare', 'Year': '2024', 'Rated':...",Skincare,2024,R,16 Aug 2024,96 min,"Comedy, Horror, Mystery",Austin Peters,"Sam Freilich, Deering Regan, Austin Peters",...,movie,,"$456,590",,,True,,,,
3,tt6318608,"{'Title': 'Nightwatch: Demons Are Forever', 'Y...",Nightwatch: Demons Are Forever,2023,Not Rated,17 May 2024,118 min,"Horror, Mystery, Thriller",Ole Bornedal,Ole Bornedal,...,movie,,,,,True,,,,
4,tt6432466,"{'Title': 'Moxie', 'Year': '2021', 'Rated': 'P...",Moxie,2021,PG-13,03 Mar 2021,111 min,"Comedy, Drama",Amy Poehler,"Jennifer Mathieu, Tamara Chestna, Dylan Meyer",...,movie,,,,,True,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9791,tt9887520,"{'Title': 'Season of Love', 'Year': '2019', 'R...",Season of Love,2019,,30 Nov 2019,105 min,"Comedy, Romance",Christin Baker,Kathryn Trammell,...,movie,,,,,True,,,,
9792,tt9894470,"{'Title': 'VFW', 'Year': '2019', 'Rated': 'Not...",VFW,2019,Not Rated,14 Feb 2020,92 min,"Action, Crime, Horror",Joe Begos,"Max Brallier, Matthew McArdle",...,movie,,,,,True,,,,
9793,tt9896916,"{'Title': 'Pilgrim's Progress', 'Year': '2019'...",Pilgrim's Progress,2019,PG,18 Apr 2019,108 min,"Animation, Adventure, Family",Robert Fernandez,"John Bunyan, Robert Fernandez",...,movie,,"$1,294,596",,,True,,,,
9794,tt9900782,"{'Title': 'Kaithi', 'Year': '2019', 'Rated': '...",Kaithi,2019,Not Rated,24 Oct 2019,145 min,"Action, Crime, Drama",Lokesh Kanagaraj,"Lokesh Kanagaraj, Pon Parthiban, Sanjeev Tiwari",...,movie,,,,,True,,,,


# SETUP COMPLETE

In [253]:
# Expose the IMDb identifier under the name `id` so that final_tmdb_df shares a
# join key with final_omdb_df for the OMDB/TMDB merge performed below.
final_tmdb_df['id'] = final_tmdb_df['imdb_id']

In [254]:
# Peek at the OMDB join key ahead of the merge with final_tmdb_df.
final_omdb_df['id']

0       tt20114686
1       tt30467885
2       tt28223926
3        tt6318608
4        tt6432466
           ...    
9791     tt9887520
9792     tt9894470
9793     tt9896916
9794     tt9900782
9795     tt9914942
Name: id, Length: 9796, dtype: object

In [255]:
# Combine OMDB (left) and TMDB (right) on the shared `id` key, keeping only ids
# present in both. Other column names common to both frames receive pandas'
# default _x / _y suffixes.
df = final_omdb_df.merge(final_tmdb_df, on="id", how="inner")

In [257]:
# Helpers that JSON-encode columns holding lists or dicts, so those cells are
# stored as JSON text when the frame is saved.
def is_complex(val):
    """Return True when ``val`` is a ``list`` or a ``dict``."""
    return isinstance(val, list) or isinstance(val, dict)

def serialize_complex_columns(df):
    """Return a copy of ``df`` with list/dict-bearing columns JSON-encoded.

    A column is selected when at least one of its values is a list or a dict;
    every value of a selected column (not only the lists/dicts) is then passed
    through ``json.dumps``. All other columns are copied unchanged and the
    input frame itself is not modified.
    """
    encoded = df.copy()
    columns_to_encode = [
        name for name in df.columns if df[name].apply(is_complex).any()
    ]
    for name in columns_to_encode:
        encoded[name] = df[name].apply(json.dumps)
    return encoded

# Build the JSON-encoded copy of the merged frame; the CSV export itself is the
# commented-out to_csv call in the next cell.
df_serialized = serialize_complex_columns(df)

In [258]:
#df_serialized.to_csv('./bases/new_complete/complete_raw.csv')