# Actividades con Dataset Titanic


# Actividad 1: Análisis de supervivencia

##Objetivo
Realizar un análisis básico sobre los pasajeros del Titanic, enfocándose en la supervivencia.
1. Hacemos un EDA para familiarizarnos con los datos y tratar duplicados y nulos
2. Filtrar los pasajeros que sobrevivieron.
3. Seleccionar las columnas 'sex', 'age' y 'fare' para los pasajeros que sobrevivieron.
4. Crear una nueva columna que indique la clase de pasajero (Pclass) como una categoría.

In [63]:
# Importamos Librerias
import pandas as pd
import seaborn as sns

In [120]:
# Importamos el Dataset Titanic desde Seaborn
df = sns.load_dataset('titanic')
# titanic es dataset
# df es dataframe
#Pandas soporta series y dataframes

In [65]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


# 1. EDA, tratamiento de duplicados y nulos

In [67]:
# vemos si hay duplicados
df.duplicated().sum()

np.int64(107)

In [72]:
# Visualizamos los duplicados
df[df.duplicated(keep=False)].sort_values(by=['sex', 'age'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone


In [71]:
# Drop de duplicados / inplace=True sobrescribe df
df.drop_duplicates(inplace=True)

In [None]:
# Elimino duplicados y almaceno en un df nuevo
df_pp1 = df.drop_duplicates()

In [77]:
# Vemos si hay nulos
df.isnull().sum()

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
class,0
who,0


In [74]:
# Visualizamos los nulos
df[df.isnull().any(axis=1)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
883,0,2,male,28.0,0,0,10.5000,S,Second,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False


In [None]:
# Visualizamos los nulos


In [None]:
# Completamos los nulos con la media
df["age"]=df["age"].fillna(df["age"].mean())

In [76]:
# O bien dejar que Pandas complete todas las columns numéricas de forma automática
df.fillna(df.mean(numeric_only=True), inplace=True)

## 2. Filtrar los pasajeros que sobrevivieron

In [84]:
# Info del df
# df["survived"].unique()
# df["survived"] 0 no sobrevivio / 1 si sobrevivio
df_sobrevivientes = df[df["survived"] == 1]
df_sobrevivientes


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [None]:
# Filtro con Query
df.query('survived == 1')

## 3. Seleccionar las columnas sex, age, fare de los que sobrevivieron

In [None]:
# Seleccionamos las columnas solicitadas
df_sobrevivientes = df_sobrevivientes[["sex", "age", "fare"]]
df_sobrevivientes

## 4. Crear una nueva columna que indique la clase de pasajero (Pclass) como una categoría.

In [92]:
# Veamos algo de dtype category
# df.info()
df["pclass"].unique()



array([3, 1, 2])

In [93]:
df["pclass"].dtypes

dtype('int64')

In [94]:
# Hacemos una copia del df original
df_pp1 = df.copy()

In [95]:
# Sobrescribo sobre el mismo dataframe
df_pp1["clasecategoria"] = df_pp1["pclass"].astype("category")

In [96]:
df_pp1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 784 entries, 0 to 890
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   survived        784 non-null    int64   
 1   pclass          784 non-null    int64   
 2   sex             784 non-null    object  
 3   age             784 non-null    float64 
 4   sibsp           784 non-null    int64   
 5   parch           784 non-null    int64   
 6   fare            784 non-null    float64 
 7   embarked        782 non-null    object  
 8   class           784 non-null    category
 9   who             784 non-null    object  
 10  adult_male      784 non-null    bool    
 11  deck            202 non-null    category
 12  embark_town     782 non-null    object  
 13  alive           784 non-null    object  
 14  alone           784 non-null    bool    
 15  clasecategoria  784 non-null    category
dtypes: bool(2), category(3), float64(2), int64(4), object(5)
memory usa

In [97]:
df_pp1[["pclass", "clasecategoria"]]

Unnamed: 0,pclass,clasecategoria
0,3,3
1,1,1
2,3,3
3,1,1
4,3,3
...,...,...
885,3,3
887,1,1
888,3,3
889,1,1


In [103]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [101]:
# df.info()
df["alive"].dtypes

dtype('O')

In [102]:
df["alive"].unique()

array(['no', 'yes'], dtype=object)

In [104]:
df_pp1["alivecategoria"] = df_pp1["alive"].astype("category")

In [105]:
df_pp1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 784 entries, 0 to 890
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   survived        784 non-null    int64   
 1   pclass          784 non-null    int64   
 2   sex             784 non-null    object  
 3   age             784 non-null    float64 
 4   sibsp           784 non-null    int64   
 5   parch           784 non-null    int64   
 6   fare            784 non-null    float64 
 7   embarked        782 non-null    object  
 8   class           784 non-null    category
 9   who             784 non-null    object  
 10  adult_male      784 non-null    bool    
 11  deck            202 non-null    category
 12  embark_town     782 non-null    object  
 13  alive           784 non-null    object  
 14  alone           784 non-null    bool    
 15  clasecategoria  784 non-null    category
 16  alivecategoria  784 non-null    category
dtypes: bool(2), category(

In [106]:
df_pp1[["alive", "alivecategoria"]]

Unnamed: 0,alive,alivecategoria
0,no,no
1,yes,yes
2,yes,yes
3,yes,yes
4,no,no
...,...,...
885,no,no
887,yes,yes
888,no,no
889,yes,yes


In [107]:
df_pp1["alivecategoria"].dtypes

CategoricalDtype(categories=['no', 'yes'], ordered=False, categories_dtype=object)

Tipo de dato category


In [None]:
# Convertimos el tipo de dato con astype


In [None]:
# Validamos


# Actividad 2: Manipulación de datos

## Objetivo
Practicar la manipulación de datos haciendo selecciones y transformaciones sobre el conjunto de datos de Titanic.
1. Obtener los nombres de las columnas del DataFrame.
2. Eliminar la columna 'deck'. (tener en cuenta que puede contener valores nulos y la función puede dar advertencias).
3. Reindexar el DataFrame después de eliminar la columna.

In [121]:
# Listar las columnas
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [112]:
# Eliminar la columna deck
# df.drop(columns=["deck", "alone"], inplace=True)
df.drop("who", axis=1, inplace=True) # sobrescribe df
df_pp1 = df.drop("who", axis=1) # retorna un df que se almacena en df_pp1

In [115]:
# Agregar una validación
if "embark_town" in df.columns:
  df.drop("embark_town", axis=1, inplace=True)
else:
  print("La columna 'who' no existe en el DataFrame.")

In [118]:
# Reindexar (solo tiene sentido a nivel de filas no de columnas)
df.reset_index(drop=True, inplace=True)

Unnamed: 0,0
survived,0
pclass,0
sex,0
age,0
sibsp,0
parch,0
fare,0
embarked,2
class,0
adult_male,0
