# Exploring the movies CSV (ml-32m)

In [47]:
import pandas as pd
import numpy as np

In [48]:
# Read the movies table from the raw data directory, report how many rows it
# has, and leave a preview of the first rows as the cell's displayed output.
movies_csv = '../data/raw/ml-32m/movies.csv'
movies = pd.read_csv(movies_csv)
row_total = movies.shape[0]
print(f"Total movies: {row_total}")
movies.head()

Total movies: 87585


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [49]:
# Structural summary of the frame: dimensions, column names, per-column dtypes.
frame_shape = movies.shape
column_names = movies.columns.tolist()
print(f"shape: {frame_shape}")
print(f"\ncols: {column_names}")
# sep="\n" puts the dtype listing on its own line under the heading.
print("\ndata types:", movies.dtypes, sep="\n")

shape: (87585, 3)

cols: ['movieId', 'title', 'genres']

data types:
movieId     int64
title      object
genres     object
dtype: object


In [50]:
# Print "title: genres" for the first five rows.
# The two columns are sliced once and iterated in parallel, which avoids
# materialising a full row Series twice per iteration via iloc[i]. head(5)
# also yields fewer rows when the frame is shorter than five, where the
# positional iloc[i] lookup would raise IndexError.
preview = movies[['title', 'genres']].head(5)
for title, genre_text in zip(preview['title'], preview['genres']):
    print(f"{title}: {genre_text}")

Toy Story (1995): Adventure|Animation|Children|Comedy|Fantasy
Jumanji (1995): Adventure|Children|Fantasy
Grumpier Old Men (1995): Comedy|Romance
Waiting to Exhale (1995): Comedy|Drama|Romance
Father of the Bride Part II (1995): Comedy


In [51]:
# Tally how often each individual genre tag occurs across all movies.
# 'genres' holds pipe-separated tags; rows carrying the '(no genres listed)'
# placeholder are excluded so the placeholder is not counted as a genre.
has_genres = movies['genres'] != '(no genres listed)'
all_genres = (
    movies.loc[has_genres, 'genres']
    .str.split('|')   # one list of tags per movie
    .explode()        # flatten to one entry per tag, preserving row order
    .dropna()         # a missing genres value yields NaN here; skip it instead of crashing
    .tolist()
)

# Built from a plain list so the resulting counts carry no index name.
genre_counts = pd.Series(all_genres).value_counts()
print("top 20 genres:")
print(genre_counts.head(20))

top 20 genres:
Drama          34175
Comedy         23124
Thriller       11823
Romance        10369
Action          9668
Documentary     9363
Horror          8654
Crime           6976
Adventure       5402
Sci-Fi          4907
Animation       4617
Children        4520
Mystery         4013
Fantasy         3851
War             2325
Western         1696
Musical         1059
Film-Noir        353
IMAX             195
Name: count, dtype: int64


In [52]:
# Derive a numeric 'year' column from the "(YYYY)" suffix of each title.
# \s* before the end anchor tolerates trailing whitespace after the closing
# parenthesis; with a bare `$` such titles would silently get no year.
# expand=False returns the single capture group as a Series directly.
year_text = movies['title'].str.extract(r'\((\d{4})\)\s*$', expand=False)

# Titles without a matching suffix become NaN, which is why the column is
# float and the range below prints with a trailing ".0".
movies['year'] = pd.to_numeric(year_text, errors='coerce')

print(f"year range: {movies['year'].min()} - {movies['year'].max()}")

year range: 1874.0 - 2023.0


In [53]:
# For each selected decade, print the first three movies whose parsed year
# falls in [decade, decade + 10). Decades with no matching rows are skipped.
sample_decades = [1990, 2000, 2010, 2020]
for decade in sample_decades:
    in_decade = movies['year'].ge(decade) & movies['year'].lt(decade + 10)
    decade_movies = movies[in_decade]
    if decade_movies.empty:
        continue
    print(f"\n{decade}s movies (sample):")
    sample_rows = decade_movies[['title', 'genres', 'year']].head(3)
    print(sample_rows.to_string(index=False))


1990s movies (sample):
                  title                                      genres   year
       Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1995.0
         Jumanji (1995)                  Adventure|Children|Fantasy 1995.0
Grumpier Old Men (1995)                              Comedy|Romance 1995.0

2000s movies (sample):
             title                    genres   year
 Yards, The (2000)               Crime|Drama 2000.0
Next Friday (2000)                    Comedy 2000.0
  Supernova (2000) Adventure|Sci-Fi|Thriller 2000.0

2010s movies (sample):
                  title                       genres   year
     Daybreakers (2010) Action|Drama|Horror|Thriller 2010.0
       Leap Year (2010)               Comedy|Romance 2010.0
Book of Eli, The (2010)       Action|Adventure|Drama 2010.0

2020s movies (sample):
                 title                  genres   year
          Waves (2020)           Drama|Romance 2020.0
Gretel & Hansel (2020) Fantasy|Horror|Thriller 2