In [21]:
import pandas as pd
import numpy as np
import json

In [22]:
df = pd.read_csv('../data/movies_2024-09-10.csv', keep_default_na=False)

In [23]:
drop_col = ['recommendations', 'backdrop_path', 'poster_path']
df.drop(drop_col, axis=1, inplace=True)

In [24]:
# df.release_date.replace('', '1970-01-01', inplace=True)

In [25]:
df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
budget                  float64
revenue                 float64
runtime                  object
status                   object
tagline                  object
vote_average            float64
vote_count              float64
credits                  object
keywords                 object
dtype: object

In [26]:
df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce')
df['runtime'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['runtime'].fillna(0, inplace=True)


In [27]:
int_cols = ['budget', 'revenue', 'runtime', 'vote_count']
df[int_cols] = df[int_cols].astype(int)
df[int_cols].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[int_cols].fillna(0, inplace=True)


In [28]:
with open('../data/language_dict.json', 'r') as f:
    language_dict = json.load(f)

df['original_language'].replace(language_dict, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['original_language'].replace(language_dict, inplace=True)


In [29]:
df.sample(5)

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords
379435,738912,Documentary film of dash akol,,Persian,It is the first work of Kamran Heydari that ex...,0.6,,,0,0,19,Released,,0.0,0,,
409712,928222,The Book Peddler,,French,Hervé runs the Librairie Aléatoire in Amiens. ...,0.6,,2022-01-13,0,0,52,Released,,0.0,0,Hervé Jézéquel,
118370,120881,Documenting the Grey Man,Horror,English,A ghost hunting expedition goes horribly wrong...,1.648,,2011-01-31,0,0,73,Released,,2.0,4,Patrick Hussion-Kelly Coulter-Jillian Walzer-W...,
209725,427370,A Story Worth Living,Documentary,English,Six novice riders-father sons and friends-take...,1.098,,2016-10-25,0,0,84,Released,,0.0,0,Jon Dale-Dan B. Allender-Blaine Eldredge-John ...,motorcycle-journey
112164,431111,King's Gambit,Fantasy-Drama-Action-Adventure,English,King's Gambit is an independent live-action fe...,1.748,,2018-01-01,0,3,0,Released,A new herald arises,10.0,1,Blake Webb-Rebecca Galarza-Reggie Peters-Wade ...,


In [30]:
df.to_csv('../data/movies_clean.csv', index=False)