In [1]:
import pandas as pd


# Load the raw data from CSV file
df = pd.read_csv('../../Extraction/csv/movies_tmdb_api_raw.csv')


# Drop duplicate rows
df = df.drop_duplicates()

# Drop rows with missing values in important columns
df = df.dropna(subset=['Movie/Show Name', 'Release Year'])

# Convert 'Release Year' to datetime
df['Release Year'] = pd.to_datetime(df['Release Year'], errors='coerce')

# Fill missing values in less important columns with a placeholder or appropriate value
df = df.fillna({'Genre(s)': 'Unknown', 'IMDb Rating': 0})

# Remove rows with invalid dates if any
df = df[df['Release Year'].notnull()]

# Standardize text columns
df['Movie/Show Name'] = df['Movie/Show Name'].str.strip()
df['Genre(s)'] = df['Genre(s)'].str.strip()

# Validate IMDb Rating is within 0-10
df = df[(df['IMDb Rating'] >= 0) & (df['IMDb Rating'] <= 10)]

# Validate Release Year is realistic
df = df[(df['Release Year'].dt.year >= 1800) & (df['Release Year'].dt.year <= 2025)]

# Fill missing Popularity Index
df = df.fillna({'Popularity Index (from API if available)': 0})

# Reset index after cleaning
df = df.reset_index(drop=True)

# Display cleaned dataframe
print(df.head())

# Export cleaned data to CSV
output_path = '../../Cleaned/Cleaned_CSV/movies_tmdb_cleaned.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned data exported to {output_path}")

            Movie/Show Name             Genre(s)  IMDb Rating  \
0  The Shawshank Redemption         Drama, Crime        8.712   
1             The Godfather         Drama, Crime        8.700   
2     The Godfather Part II         Drama, Crime        8.600   
3          Schindler's List  Drama, History, War        8.567   
4              12 Angry Men                Drama        8.500   

   Number of Votes                  Release Year  Runtime (optional)  \
0            28987 1970-01-01 00:00:00.000001994                 142   
1            21913 1970-01-01 00:00:00.000001972                 175   
2            13241 1970-01-01 00:00:00.000001974                 202   
3            16753 1970-01-01 00:00:00.000001993                 195   
4             9446 1970-01-01 00:00:00.000001957                  97   

   Popularity Index (from API if available)  
0                                   28.1802  
1                                   26.8639  
2                                   14