In [1]:
import pandas as pd



In [3]:
# Locations of the raw dataset and of the cleaned file produced by this cell.
input_file = "Imdb Movie Dataset.csv"
output_file = "movies_cleaned.csv"

# Load the raw CSV and ask pandas to parse release_date while reading.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is the default behaviour, and passing it
# only triggers a deprecation warning.
df = pd.read_csv(input_file, parse_dates=['release_date'], dayfirst=False)

# Earliest release year kept in the cleaned dataset.
MIN_RELEASE_YEAR = 2000

# convert release_date to datetime; values that cannot be parsed become NaT
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# drop rows w/ invalid or missing release_date (the NaT values produced above)
df = df.dropna(subset=['release_date'])

# keep only movies released in MIN_RELEASE_YEAR or later
df = df[df['release_date'].dt.year >= MIN_RELEASE_YEAR]

# remove movies that aren't released
df = df[df['status'] == "Released"]

# drop rows w/ missing data in any of the columns the cleaned dataset relies on
required_columns = ['id', 'title', 'vote_average', 'vote_count', 'release_date', 'revenue', 'runtime', 'budget', 'imdb_id']
df = df.dropna(subset=required_columns)

# remove movies with non-positive revenue, runtime, or budget
df = df[(df['revenue'] > 0) & (df['runtime'] > 0) & (df['budget'] > 0)]

# remove duplicates: keep the first row seen for each id
df = df.drop_duplicates(subset=['id'])

# save cleaned csv file (index=False keeps the DataFrame index out of the file)
df.to_csv(output_file, index=False)

print(f"Cleaning complete, saved as {output_file}")

# Reload from the file that was just written, using `output_file` rather than a
# repeated literal so the reload always reads the same file that was saved.
# NOTE(review): this read_csv call does not pass parse_dates, so release_date is
# not parsed as datetime after the reload -- confirm that later cells expect that.
df = pd.read_csv(output_file)
print(f"Number of rows in cleaned dataset: {len(df)}")

Cleaning complete, saved as movies_cleaned.csv
Number of rows in cleaned dataset: 6937
