## Data Cleaning 

1. Removing unwanted column from the csv file 

In [3]:
import pandas as pd

# Load the dataset
input_file = "tmdb_all_movies.csv"  
output_file = "tmdb_cleaned_movies.csv"

# Columns to keep
required_columns = [
    "id", "title", "vote_average", "vote_count", "release_date", "revenue", 
    "runtime", "budget", "original_language", "overview", "popularity", 
    "genres", "cast", "director"
]

# Read the dataset
df = pd.read_csv(input_file)

# Select only the required columns
df_cleaned = df[required_columns]

# Save the cleaned dataset
df_cleaned.to_csv(output_file, index=False)

print(f"Cleaned dataset saved to {output_file}")

Cleaned dataset saved to tmdb_cleaned_movies.csv


2. Removing the null value column in the dataframe  

In [12]:
df_cleaned = df_cleaned.dropna(axis=1, how="all")
print(df_cleaned.head())

   id                             title  vote_average  vote_count  \
0   2                             Ariel         7.082         304   
1   3               Shadows in Paradise         7.302         344   
2   5                        Four Rooms         5.800        2534   
3   6                    Judgment Night         6.527         316   
4   8  Life in Loops (A Megacities RMX)         7.500          27   

  release_date     revenue  runtime      budget original_language  \
0   1988-10-21         0.0       73         0.0                fi   
1   1986-10-17         0.0       74         0.0                fi   
2   1995-12-09   4257354.0       98   4000000.0                en   
3   1993-10-15  12136938.0      109  21000000.0                en   
4   2006-01-01         0.0       80     42000.0                en   

                                            overview  popularity  \
0  After the coal mine he works at closes and his...       9.751   
1  Nikander, a rubbish collector a

3. Enriching data by creating a year column, month column and day column

In [14]:
df_cleaned['release_date'] = pd.to_datetime(df_cleaned['release_date'], errors='coerce')

# Extract year, month, and day into new columns
df_cleaned['year'] = df_cleaned['release_date'].dt.year
df_cleaned['month'] = df_cleaned['release_date'].dt.month
df_cleaned['day'] = df_cleaned['release_date'].dt.day

# Display the updated DataFrame
print(df_cleaned[['release_date', 'year', 'month', 'day']].head())

  release_date    year  month   day
0   1988-10-21  1988.0   10.0  21.0
1   1986-10-17  1986.0   10.0  17.0
2   1995-12-09  1995.0   12.0   9.0
3   1993-10-15  1993.0   10.0  15.0
4   2006-01-01  2006.0    1.0   1.0


In [None]:
output_file = "tmdb_cleaned_with_date_columns.csv"
df_cleaned.to_csv(output_file, index=False)

print(f"DataFrame saved to {output_file}")