# Filtros y selecciones en Pandas

## Importamos Dataset y hacemos un EDA

In [1]:
# Importar la librería Pandas
import pandas as pd

In [2]:
# Mount Google Drive at /content/drive (Google Colab only) so files stored in Drive can be read by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the Netflix titles dataset from the mounted Drive into the DataFrame `df`
df = pd.read_csv("/content/drive/MyDrive/datasets/netflix_titles.csv")

In [4]:
# Show general information about the DataFrame: columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
# Preview the first rows of the DataFrame (head() returns 5 rows by default)
df.head()

In [None]:
# Quick check for duplicates: count the rows that are exact copies of an earlier row
df.duplicated().sum()

In [None]:
# Quick look at missing values.
# Only the last expression of a cell is rendered automatically, so the per-column
# null counts are shown explicitly with display(); otherwise they would be discarded.
display(df.isnull().sum())
# Rows that have at least one null value in any column
df[df.isnull().any(axis=1)]

## Selección de columnas y filas

In [6]:
# Show the column labels of the DataFrame
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [7]:
# Select a single column by name (the result is a Series)
df["title"]

Unnamed: 0,title
0,Dick Johnson Is Dead
1,Blood & Water
2,Ganglands
3,Jailbirds New Orleans
4,Kota Factory
...,...
8802,Zodiac
8803,Zombie Dumb
8804,Zombieland
8805,Zoom


In [8]:
# Select more than one column by passing a list of names (the result is a DataFrame)
df[["title", "country"]]

Unnamed: 0,title,country
0,Dick Johnson Is Dead,United States
1,Blood & Water,South Africa
2,Ganglands,
3,Jailbirds New Orleans,
4,Kota Factory,India
...,...,...
8802,Zodiac,United States
8803,Zombie Dumb,
8804,Zombieland,United States
8805,Zoom,United States


In [9]:
# Same two-column selection, now sorted by title in ascending order
df[["title", "country"]].sort_values(by="title")

Unnamed: 0,title,country
2036,#Alive,South Korea
2304,#AnneFrank - Parallel Stories,Italy
2481,#FriendButMarried,Indonesia
2324,#FriendButMarried 2,Indonesia
5973,#Roxy,Canada
...,...,...
6177,忍者ハットリくん,Japan
4914,海的儿子,
7101,마녀사냥,South Korea
5022,반드시 잡는다,South Korea


In [10]:
# Select specific rows by index label. .loc slices include both endpoints,
# so this returns rows 1 through 5 (row 0 is skipped, unlike df.head()).
df.loc[1:5]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...


In [11]:
# Select a subset (specific rows and columns): rows labelled 1 through 5,
# keeping only the "title" and "country" columns
df.loc[1:5, ["title", "country"]]

Unnamed: 0,title,country
1,Blood & Water,South Africa
2,Ganglands,
3,Jailbirds New Orleans,
4,Kota Factory,India
5,Midnight Mass,


## Filtros

### Filtros con condiciones lógicas

In [None]:
# Keep only the records whose type is "Movie" (boolean mask on the "type" column)
df[df["type"] == "Movie"]

In [None]:
# List the distinct values present in the "type" column
df["type"].unique()

In [None]:
# List the records whose country field is exactly "Argentina"
df[df["country"]=="Argentina"]

In [None]:
# List the records (movies/TV shows) whose "release_year" is earlier than 2020,
# ordered from the most recent release year to the oldest
(
    df[df["release_year"] < 2020]
    .sort_values(by="release_year", ascending=False)
)


In [None]:
# Try to list the records (movies/TV shows) whose date_added is earlier than 2020.
# "date_added" has dtype object (strings), so comparing it with an integer raises
# TypeError. The error is the point of this cell: it is caught and printed so the
# message can be analysed while the notebook still runs from top to bottom.
try:
    display(df[df["date_added"] < 2020])
except TypeError as error:
    print(f"TypeError: {error}")

In [None]:
# Compare the dtypes of "release_year" and "date_added":
# the former is numeric (int64) while the latter is object (strings)
df[["release_year", "date_added"]].dtypes

In [None]:
# Inspect 10 randomly chosen rows to see how both columns store their values
df[["release_year", "date_added"]].sample(10)

In [5]:
# Convert "date_added" to datetime and store it in a new column, "date_added_dt",
# leaving the original string column untouched.
# .str.strip() removes leading/trailing whitespace before parsing
# (presumably some raw values carry stray spaces — TODO confirm against the data).
df["date_added_dt"] = pd.to_datetime(df["date_added"].str.strip())

In [6]:
# Check the dtype of the new column (a datetime64[ns] dtype is expected)
df["date_added_dt"].dtypes

dtype('<M8[ns]')

In [None]:
# Run the "date_added earlier than 2020" query again, this time on the datetime column.
# .dt.year extracts the year as a number, so the comparison with 2020 is valid;
# "<" keeps the titles added before 2020, the same condition as the string-based attempt.
df[df["date_added_dt"].dt.year < 2020].sort_values(by="release_year", ascending=False)

### Combinación de condiciones (&, |)

In [None]:
# Limit how many rows pandas renders: frames longer than 10 rows are shown truncated
# (global display option, it affects every output rendered after this cell runs)
pd.set_option("display.max_rows", 10)

In [None]:
# Titles from Argentina whose date_added year is later than 2020 (both conditions
# combined with &), showing only the columns relevant to the check.
# NOTE(review): no filter on "type" is applied, so TV shows are included as well;
# add (df["type"] == "Movie") to the mask if only movies are wanted.
df[
    (df["date_added_dt"].dt.year > 2020)
    & (df["country"] == "Argentina")
][["type", "country", "date_added_dt", "title"]]

In [None]:
# List the records whose title contains the word "cielo" (case-insensitive match)
df[df["title"].str.contains("cielo", case=False)]

In [None]:
# Movies whose director name contains "johnson" (case-insensitive match).
# "director" has missing values, and str.contains returns NaN for them; na=False
# turns those into False so the mask is strictly boolean and rows without a
# director are simply excluded.
df[
    (df["type"] == "Movie")
    & (df["director"].str.contains("johnson", case=False, na=False))
][["type", "title", "director"]]

In [None]:
# List the records (movies/TV shows) whose country is exactly one of
# "Argentina", "Mexico" or "Chile"
df[df["country"].isin(["Argentina", "Mexico", "Chile"])]

In [None]:
# Case-insensitive version of the country filter (improved with AI assistance):
# both the target names and the column values are lowercased before matching
countries = [name.lower() for name in ["Argentina", "Mexico", "Chile"]]
df[df["country"].str.lower().isin(countries)]