# Preprocesamiento de Datos - Ejemplo Práctico

En esta demostración se preprocesará un conjunto de datos de población estadounidense. Los datos utilizados son un subconjunto modificado de [este set de datos](https://archive.ics.uci.edu/ml/datasets/Adult) y se encuentran en el archivo `census.csv`.

In [1]:
import pandas as pd

In [2]:
# Importemos los datos:
df = pd.read_csv('census.csv')


In [3]:
df.head()# Veamos el dataset:


Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,Male,40.0,1.0,<=50K
1,50.0,Self-emp-not-inc,Bachelors,White,Male,13.0,1.0,<=50K
2,38.0,Private,High-school,White,Male,40.0,1.0,<=50K
3,53.0,Private,Some-high-school,Black,Male,40.0,1.0,<=50K
4,28.0,Private,Bachelors,Black,Female,40.0,0.0,<=50K


In [4]:
df.columns

Index(['age', 'workclass', 'education', 'race', 'sex', 'hours_per_week',
       'USA_born', 'label'],
      dtype='object')

In [5]:
# Verifiquemos si hay datos faltantes:
df.isnull().sum() #hacemos el isnull para saber cuantos valores nulos hay en el df. si se muestra un true: el valor es nulo y false: si existe un valor
#Luego el .sum hace que cuente cuantos true hay el el df (cuantos valores nulos)



age               99
workclass         11
education         14
race              16
sex               15
hours_per_week    85
USA_born          15
label              0
dtype: int64

In [6]:
# Descartemos las filas que tengan 3 o más datos faltantes:
(df.isnull().sum(axis=1) >=3).sum()# cuenta cuántos valores nulos tiene cada fila.


np.int64(22)

In [7]:
df = df[df.isnull().sum(axis=1) < 3] #se queda solo con las filas que tienen menos de 3 nulos.

In [8]:
df.isnull().sum()# Contemos de nuevo los datos faltantes:


age               85
workclass          0
education          0
race               0
sex                0
hours_per_week    68
USA_born           0
label              0
dtype: int64

In [9]:
# Imputemos los datos faltantes de edad y horas trabajadas por semana con la mediana de cada una de esas columnas:
mediana=df[["age","hours_per_week"]].median()
df[["age", "hours_per_week"]] = df[["age", "hours_per_week"]].fillna(mediana) #Esto sí modifica el DataFrame original correctamente y evita posibles advertencias de “SettingWithCopyWarning”.

In [10]:
# Contemos de nuevo los datos faltantes:
df.isnull().sum()

age               0
workclass         0
education         0
race              0
sex               0
hours_per_week    0
USA_born          0
label             0
dtype: int64

In [11]:
df["workclass"].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', 'Self-emp-inc', 'Without-pay'], dtype=object)

In [12]:
# Apliquemos one-hot encoding a la columna "workclass":
df["workclass"].head()

0           State-gov
1    Self-emp-not-inc
2             Private
3             Private
4             Private
Name: workclass, dtype: object

In [13]:
df = pd.get_dummies(df, columns=["workclass"],dtype=int) #get_dummies:crea las columnas binarias automáticamente.

In [14]:
df.filter(like="workclass_").head()

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0


In [15]:
# Hallemos los valores que toma la columna "education":
df["education"].unique()

array(['Bachelors', 'High-school', 'Some-high-school', 'Masters',
       'Some-college', 'Middle-school', 'Doctorate', 'Some-middle-school',
       'Preschool', 'Elementary-school'], dtype=object)

In [16]:

df["education"].value_counts()

education
High-school           14968
Some-college          10028
Bachelors              7767
Some-high-school       4153
Masters                2590
Middle-school           843
Doctorate               575
Some-middle-school      468
Elementary-school       229
Preschool                73
Name: count, dtype: int64

In [17]:
# Apliquemos ordinal encoding a la columna "education":
from sklearn.preprocessing import OrdinalEncoder


In [18]:
orden= [
    "Preschool",
    "Elementary-school",
    "Some-middle-school",
    "Middle-school",
    "Some-high-school",
    "High-school",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate"
]


In [19]:
encoder = OrdinalEncoder(categories=[orden])

In [20]:
df["education"]=encoder.fit_transform(df[["education"]])

In [21]:
# Verifiquemos que la columna "education" tenga los valores apropiados:
df["education"].head(10)

0    7.0
1    7.0
2    5.0
3    4.0
4    7.0
5    8.0
6    4.0
7    5.0
8    8.0
9    7.0
Name: education, dtype: float64

In [22]:
# Apliquemos one-hot encoding a la columna "race":
df["race"].head()

0    White
1    White
2    White
3    Black
4    Black
Name: race, dtype: object

In [23]:
df=pd.get_dummies(df, columns=["race"] ,dtype=int)

In [24]:
df.filter(like="race_").head()

Unnamed: 0,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,1,0,0
4,0,0,1,0,0


In [25]:
# Apliquemos binary encoding a la columna "sex":
df["sex"].unique()

array(['Male', 'Female'], dtype=object)

In [26]:
df.filter(like="sex_").head()

0
1
2
3
4


In [27]:
import category_encoders as ce

In [28]:
encoder_Sex=ce.BinaryEncoder(cols=["sex"]) #Convierte la columna sex (que tiene texto) en valores binarios (0 y 1)
df_encoded= encoder_Sex.fit_transform(df) #aprende cómo codificar las categorías y luego aplica esa codificación al dataset, devolviendo el resultado listo para usar en modelos de machine learning

df = df_encoded.copy()  

In [29]:
# Apliquemos binary encoding a la etiqueta:
df["label"].unique()

array(['<=50K', '>50K'], dtype=object)

In [30]:
encoder_Label=ce.BinaryEncoder(cols=["label"])
df= encoder_Label.fit_transform(df)

In [31]:
# Veamos el dataset resultante:
df.head()

Unnamed: 0,age,education,sex_0,sex_1,hours_per_week,USA_born,label_0,label_1,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39.0,7.0,0,1,40.0,1.0,0,1,0,0,0,0,0,1,0,0,0,0,0,1
1,50.0,7.0,0,1,13.0,1.0,0,1,0,0,0,0,1,0,0,0,0,0,0,1
2,38.0,5.0,0,1,40.0,1.0,0,1,0,0,1,0,0,0,0,0,0,0,0,1
3,53.0,4.0,0,1,40.0,1.0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
4,28.0,7.0,1,0,40.0,0.0,0,1,0,0,1,0,0,0,0,0,0,1,0,0


In [32]:
# Veamos la nueva descripción del dataset:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41694 entries, 0 to 41715
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         41694 non-null  float64
 1   education                   41694 non-null  float64
 2   sex_0                       41694 non-null  int64  
 3   sex_1                       41694 non-null  int64  
 4   hours_per_week              41694 non-null  float64
 5   USA_born                    41694 non-null  float64
 6   label_0                     41694 non-null  int64  
 7   label_1                     41694 non-null  int64  
 8   workclass_Federal-gov       41694 non-null  int64  
 9   workclass_Local-gov         41694 non-null  int64  
 10  workclass_Private           41694 non-null  int64  
 11  workclass_Self-emp-inc      41694 non-null  int64  
 12  workclass_Self-emp-not-inc  41694 non-null  int64  
 13  workclass_State-gov         41694 no

In [33]:
# Verifiquemos el tipo de dato de cada columna:
df.dtypes

age                           float64
education                     float64
sex_0                           int64
sex_1                           int64
hours_per_week                float64
USA_born                      float64
label_0                         int64
label_1                         int64
workclass_Federal-gov           int64
workclass_Local-gov             int64
workclass_Private               int64
workclass_Self-emp-inc          int64
workclass_Self-emp-not-inc      int64
workclass_State-gov             int64
workclass_Without-pay           int64
race_Amer-Indian-Eskimo         int64
race_Asian-Pac-Islander         int64
race_Black                      int64
race_Other                      int64
race_White                      int64
dtype: object

In [34]:
df.isnull().sum()

age                           0
education                     0
sex_0                         0
sex_1                         0
hours_per_week                0
USA_born                      0
label_0                       0
label_1                       0
workclass_Federal-gov         0
workclass_Local-gov           0
workclass_Private             0
workclass_Self-emp-inc        0
workclass_Self-emp-not-inc    0
workclass_State-gov           0
workclass_Without-pay         0
race_Amer-Indian-Eskimo       0
race_Asian-Pac-Islander       0
race_Black                    0
race_Other                    0
race_White                    0
dtype: int64

In [35]:
# Carguemos el dataset a un nuevo archivo:
df.to_csv("dataset_preprocesado.csv", index=False)

In [None]:
df.to_excel("dataset_preprocesado.xlsx", index=False)