In [1]:
import pandas as pd
import random
import os

In [None]:
# Load the raw TMDB movie dump into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the CSV.
df = pd.read_csv('/home/vrinda/Documents/Courses/CS520/Dataset/Movies/TMDB_all_movies.csv')

# Display basic information about the dataset (index range, columns, non-null
# counts, dtypes). DataFrame.info() writes its report to stdout itself and
# returns None, so it must not be wrapped in print() -- doing so emits a stray
# "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1023264 entries, 0 to 1023263
Data columns (total 28 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1023264 non-null  int64  
 1   title                    1023252 non-null  object 
 2   vote_average             1023262 non-null  float64
 3   vote_count               1023262 non-null  float64
 4   status                   1023262 non-null  object 
 5   release_date             909452 non-null   object 
 6   revenue                  1023262 non-null  float64
 7   runtime                  1023262 non-null  float64
 8   budget                   1023262 non-null  float64
 9   imdb_id                  592840 non-null   object 
 10  original_language        1023262 non-null  object 
 11  original_title           1023252 non-null  object 
 12  overview                 842547 non-null   object 
 13  popularity               1023262 non-null 

In [3]:
# Tally the missing entries of every column (isna is the alias of isnull),
# then emit the heading and the per-column counts in a single print call.
null_values = df.isna().sum()
print("\nNull values in each column:", null_values, sep="\n")


Null values in each column:
id                              0
title                          12
vote_average                    2
vote_count                      2
status                          2
release_date               113812
revenue                         2
runtime                         2
budget                          2
imdb_id                    430424
original_language               2
original_title                 12
overview                   180717
popularity                      2
tagline                    871138
genres                     296663
production_companies       550743
production_countries       404315
spoken_languages           392590
cast                       340080
director                   184495
director_of_photography    777381
writers                    527274
producers                  697366
music_composer             923444
imdb_rating                592495
imdb_votes                 592495
poster_path                299033
dtype: int64


In [56]:
# Express each column's null count as a percentage of the total row count.
missing_percentage = null_values.div(len(df)).mul(100)
print("\nPercentage of missing values:", missing_percentage, sep="\n")


Percentage of missing values:
id                          0.000000
title                       0.001173
vote_average                0.000195
vote_count                  0.000195
status                      0.000195
release_date               11.122447
revenue                     0.000195
runtime                     0.000195
budget                      0.000195
imdb_id                    42.063827
original_language           0.000195
original_title              0.001173
overview                   17.660838
popularity                  0.000195
tagline                    85.133260
genres                     28.991834
production_companies       53.822181
production_countries       39.512286
spoken_languages           38.366443
cast                       33.234825
director                   18.030049
director_of_photography    75.970717
writers                    51.528638
producers                  68.151132
music_composer             90.244942
imdb_rating                57.902457
imdb_vo

In [57]:
# Keep only the columns whose missing percentage is strictly positive and
# collect their names as a plain Python list.
columns_with_missing = (
    missing_percentage
    .loc[missing_percentage.gt(0)]
    .index
    .tolist()
)
print("\nColumns with missing values:", columns_with_missing)


Columns with missing values: ['title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'budget', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'cast', 'director', 'director_of_photography', 'writers', 'producers', 'music_composer', 'imdb_rating', 'imdb_votes', 'poster_path']


In [43]:
# Count rows that are exact repeats of an earlier row. duplicated() flags
# every repeat after the first occurrence; summing the boolean mask counts them.
duplicates = (
    df
    .duplicated()
    .sum()
)
print(
    f"\nNumber of duplicate rows: {duplicates}"
)


Number of duplicate rows: 0


In [None]:
# Report the dtype pandas assigned to each column of the raw frame.
print("\nData types of columns:", df.dtypes, sep="\n")


Data types of columns:
id                           int64
title                       object
vote_average               float64
vote_count                 float64
status                      object
release_date                object
revenue                    float64
runtime                    float64
budget                     float64
imdb_id                     object
original_language           object
original_title              object
overview                    object
popularity                 float64
tagline                     object
genres                      object
production_companies        object
production_countries        object
spoken_languages            object
cast                        object
director                    object
director_of_photography     object
writers                     object
producers                   object
music_composer              object
imdb_rating                float64
imdb_votes                 float64
poster_path                 obj

In [58]:
# List the columns in which every single value is null.
# `.all()` yields one boolean per column; filtering that boolean Series by
# itself leaves only the True entries, whose index labels are the column names.
fully_null = df.isna().all()
empty_columns = fully_null[fully_null].index.tolist()
print("\nCompletely empty columns:", empty_columns)


Completely empty columns: []


In [4]:
# Columns retained for the cleaned dataset; every other column of `df` is
# left out of `df_cleaned`.
columns_to_keep = [
    'id',
    'title',
    'release_date',
    'original_language',
    'genres',
    'cast',
    'director',
    'poster_path',
]

# Project the raw frame onto the retained columns, in the order listed above.
df_cleaned = df[columns_to_keep]

In [7]:
# Show the dtype of each retained column.
print(str(df_cleaned.dtypes))

id                    int64
title                object
release_date         object
original_language    object
genres               object
cast                 object
director             object
poster_path          object
dtype: object


In [60]:
# Sanity check on the projected frame: first the surviving column labels,
# then the number of nulls remaining in each of them.
print(df_cleaned.columns, df_cleaned.isna().sum(), sep="\n")

Index(['id', 'title', 'release_date', 'original_language', 'genres', 'cast',
       'director', 'poster_path'],
      dtype='object')
id                        0
title                    12
release_date         113812
original_language         2
genres               296663
cast                 340080
director             184495
poster_path          299033
dtype: int64


In [9]:
# Discard every row that has a null in any of the retained columns
# (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
df_null_cleaned = df_cleaned.dropna(axis=0, how="any")
print(f"Number of rows after dropping null values: {len(df_null_cleaned)}")

# Confirm the drop worked: each per-column null count should now be zero.
print(
    "\nNull values in each column after dropping:",
    df_null_cleaned.isna().sum(),
    sep="\n",
)

Number of rows after dropping null values: 423838

Null values in each column after dropping:
id                   0
title                0
release_date         0
original_language    0
genres               0
cast                 0
director             0
poster_path          0
dtype: int64


In [None]:
# Path of an existing CSV file; its on-disk size is read in the next cell to
# derive the fraction of rows to keep.
# NOTE(review): no visible cell writes movies_large.csv -- presumably it is a
# previously saved export of the null-cleaned frame; confirm before relying on it.
# NOTE(review): absolute, machine-specific path.
og_path = "/home/vrinda/Documents/Courses/CS520/Project/Cleaned Dataset/movies_large.csv"

In [None]:
# Desired on-disk size of the reduced dataset, in MiB (previously an inline
# magic number).
TARGET_SIZE_MB = 60

# Size of the reference CSV in MiB (bytes / 1024**2).
current_size_mb = os.path.getsize(og_path) / (1024 * 1024)

# Fraction of rows to keep so the file shrinks to roughly TARGET_SIZE_MB.
# NOTE(review): assumes file size scales linearly with row count -- confirm.
# The ratio is capped at 1.0: if the file is already at or below the target
# (or is empty) every row is kept, instead of producing a ratio > 1 that would
# later request more rows than exist, or dividing by zero.
if current_size_mb > 0:
    ratio_to_keep = min(1.0, TARGET_SIZE_MB / current_size_mb)
else:
    ratio_to_keep = 1.0

# Show both values; the recorded output of this cell lists them.
print(current_size_mb)
print(ratio_to_keep)

113.1352367401123
0.5303387497020802


In [None]:
# Number of rows available after null removal.
initial_rows = len(df_null_cleaned)

# Number of rows to retain. int() truncates toward zero; the result is bounded
# to [0, initial_rows] because random.sample raises ValueError when asked for
# more items than the population holds (or for a negative count).
target_rows = max(0, min(initial_rows, int(initial_rows * ratio_to_keep)))

# Show both counts; the recorded output of this cell lists them.
print(initial_rows, target_rows)

423838 224777


In [72]:
# Fixed seed so that Restart-and-Run-All draws the same subset every time;
# without it the reduced dataset differs between runs.
SAMPLE_SEED = 42

# Draw `target_rows` distinct row positions without replacement. A dedicated
# Random instance is used so the module-level `random` state is left untouched.
rows_to_keep = random.Random(SAMPLE_SEED).sample(range(initial_rows), target_rows)

# Create a new dataframe with the selected rows (positional selection, in the
# sampled order).
df_reduced = df_null_cleaned.iloc[rows_to_keep]

In [None]:
# Destination path for the reduced dataset.
# NOTE(review): absolute, machine-specific path.
output_file = "/home/vrinda/Documents/Courses/CS520/Project/Cleaned Dataset/movies.csv"
# Save the reduced dataframe to a new CSV file
# NOTE(review): the write below is commented out, so running this cell only
# defines `output_file`; nothing is persisted to disk.
# df_reduced.to_csv(output_file, index=False)