In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import json_normalize
import ast

In [3]:
# Load the main movies table from CSV (presumably the output of an earlier ETL step, judging by the file name).
movies_df = pd.read_csv("ETL_movies_main.csv")

In [4]:
# Inspect the (rows, columns) shape of the freshly loaded table.
movies_df.shape

(45346, 20)

In [5]:
# Remove fully duplicated rows as a precaution, keeping the first occurrence of each.
movies_df = movies_df.drop_duplicates(keep="first")

In [7]:
# Re-check the shape to see whether dropping duplicates removed any rows.
movies_df.shape

(45346, 20)

In [8]:
# Keep only the movie identifier and its raw 'genres' value, then display the selection.
df_genres = movies_df.loc[:,["Movie_id","genres"]]
df_genres

Unnamed: 0,Movie_id,genres
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...
45341,30840,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name..."
45342,111109,"[{'id': 18, 'name': 'Drama'}]"
45343,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam..."
45344,227506,[]


In [9]:
# Move 'Movie_id' into the index so it is carried along while the 'genres' column is transformed
df_genres = df_genres.set_index('Movie_id')

In [10]:
# Parse each 'genres' string into the Python literal it spells out
# (a list of {'id', 'name'} dicts in the rows displayed above).
# Selecting the single column leaves a Series that keeps the frame's index.
df_genres = df_genres['genres'].map(ast.literal_eval)

In [11]:
# Turn the Movie_id index back into a regular column and preview the first rows.
df_genres = df_genres.reset_index()
df_genres.head()

Unnamed: 0,Movie_id,genres
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]"


In [12]:
# Keep only the rows whose parsed 'genres' value is a list with at least one entry;
# empty lists and any non-list values are filtered out.
df = df_genres[
    df_genres['genres'].apply(lambda value: isinstance(value, list) and bool(value))
]

print(df)

       Movie_id                                             genres
0           862  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1          8844  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2         15602  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3         31357  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4         11862                     [{'id': 35, 'name': 'Comedy'}]
...         ...                                                ...
45339    289923                     [{'id': 27, 'name': 'Horror'}]
45340    222848           [{'id': 878, 'name': 'Science Fiction'}]
45341     30840  [{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...
45342    111109                      [{'id': 18, 'name': 'Drama'}]
45343     67758  [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...

[42962 rows x 2 columns]


In [13]:
# Continue with the filtered frame under the df_genres name, then display it.
df_genres = df
df_genres

Unnamed: 0,Movie_id,genres
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...
45339,289923,"[{'id': 27, 'name': 'Horror'}]"
45340,222848,"[{'id': 878, 'name': 'Science Fiction'}]"
45341,30840,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name..."
45342,111109,"[{'id': 18, 'name': 'Drama'}]"


In [14]:
# Explode the 'genres' lists so that every (movie, genre) pair gets its own row.
# ignore_index=True gives the result a fresh 0..n-1 index, which the column
# assignment below relies on to line rows up with the expanded dicts.
df_exploded = df_genres.explode('genres', ignore_index=True)

# Expand each genre dict into 'id' and 'name' columns.
# columns=['id', 'name'] selects the dict keys by name; without it the new frame's
# columns follow dict-key order and are assigned to ['id', 'name'] purely by position,
# so a record with its keys in a different order could silently swap the two columns.
df_exploded[['id', 'name']] = pd.DataFrame(
    df_exploded['genres'].tolist(),
    columns=['id', 'name'],
)

# Drop the original 'genres' column (reassigned rather than mutated in place,
# so the cell does not depend on hidden in-place state)
df_exploded = df_exploded.drop(columns=['genres'])

print(df_exploded)

       Movie_id     id       name
0           862     16  Animation
1           862     35     Comedy
2           862  10751     Family
3          8844     12  Adventure
4          8844     14    Fantasy
...         ...    ...        ...
90952     30840  10749    Romance
90953    111109     18      Drama
90954     67758     28     Action
90955     67758     18      Drama
90956     67758     53   Thriller

[90957 rows x 3 columns]


In [15]:
# Rename the genre-name column from 'name' to 'genero' and continue under the
# df_genres name; rename returns a new frame, so df_exploded keeps its columns.
df_genres = df_exploded.rename(columns={"name": "genero"})
df_genres.head()

Unnamed: 0,Movie_id,id,genero
0,862,16,Animation
1,862,35,Comedy
2,862,10751,Family
3,8844,12,Adventure
4,8844,14,Fantasy


In [16]:
# Export the long-format genres table (one row per movie/genre pair) to CSV

In [17]:
# Write the table to CSV; index=False leaves the DataFrame's row index out of the file.
df_genres.to_csv("ETL_movies_genres.csv",index =False)

For the ML step, all of a movie's genres need to be on a single row, so `df_genres` is regrouped to one row per `Movie_id`.

In [23]:
# Collapse to one row per movie: that movie's 'genero' values joined with single spaces.
# NOTE(review): some genre names contain a space themselves (e.g. 'Science Fiction'),
# so the joined string cannot be split back into genres unambiguously.
# Confirm this is acceptable for whatever consumes the grouped file.
df_grouped = (
    df_genres
    .groupby('Movie_id')['genero']
    .apply(' '.join)
    .reset_index()
)
df_genres_grouped = pd.DataFrame(df_grouped)
df_genres_grouped

Unnamed: 0,Movie_id,genero
0,2,Drama Crime
1,3,Drama Comedy
2,5,Crime Comedy
3,6,Action Thriller Crime
4,11,Adventure Action Science Fiction
...,...,...
42957,465044,Fantasy Drama
42958,467731,Drama
42959,468343,Drama Romance
42960,468707,Romance Comedy


In [25]:
# Export the grouped genres table (one row per Movie_id) to CSV

In [26]:
# Write the grouped table to CSV; index=False leaves the DataFrame's row index out of the file.
df_genres_grouped.to_csv("ETL_movies_genres_grouped.csv",index =False)