In [1]:
# Core libraries: data handling (pandas, numpy) and plotting (matplotlib, seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation warnings
# (e.g. about chained in-place operations) — consider narrowing it.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [34]:
# Read the source CSV and discard its 'Unnamed: 0' column in one step
# (presumably a row index that was written out when the file was saved)
df = pd.read_csv('df_eneigh_2024.csv').drop(columns='Unnamed: 0')

# Display the frame to confirm it loaded as expected
df

Unnamed: 0,folioviv,foliohog_x,entidad_x,ingreso_promedio,poca_variedad_alimentos,alimentacion_cereales,alimentacion_carne,alimentacion_huevo,alimentacion_pescado,telefono,celular,conex_inte,num_auto,num_tvd,num_compu,num_lap,num_table,tarjeta,pagotarjet,negcua,est_alim,est_trans,numren,edad,alfabetism,asis_esc,num_trabaj,discapacidad,tipo_viv,mat_pared,mat_pisos,num_cuarto,ab_agua,disp_elect,est_socio
0,100001901,1,1,40546.67,0,7,3,7,0,0,1,1,1,1,0,1,0,0,0,0,5100,0,4,16.0,1.0,4,4,4,7,8,3,4,1.0,1,3
1,100001902,1,1,32666.67,0,7,4,7,1,1,1,1,1,1,0,1,0,1,0,0,12000,0,4,34.0,1.0,4,4,4,1,8,3,4,1.0,1,3
2,100001904,1,1,14083.33,1,7,7,7,0,1,1,1,1,2,0,0,0,0,0,0,4350,0,2,44.0,1.0,2,2,2,1,8,3,3,1.0,1,3
3,100001905,1,1,30700.00,0,7,4,7,0,1,1,1,1,2,0,0,1,1,1,0,8000,0,4,28.0,1.0,4,4,4,1,8,3,3,1.0,1,3
4,100002501,1,1,44288.00,0,7,3,3,2,1,1,1,0,2,0,0,2,1,0,0,12000,2000,4,19.0,1.0,4,4,4,4,8,3,3,1.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88181,3260593814,1,32,12373.33,0,7,1,2,0,0,1,1,0,1,0,0,0,0,0,0,6000,0,3,37.0,1.0,3,3,3,1,8,3,2,2.0,1,2
88182,3260593815,1,32,32000.00,0,7,4,6,1,0,1,1,1,1,0,1,1,1,0,0,6000,0,5,16.0,1.0,5,5,5,1,8,2,4,1.0,1,2
88183,3260593816,1,32,15991.67,0,7,3,7,0,0,1,1,0,1,0,0,0,0,0,0,3800,420,5,29.0,1.0,5,5,5,1,8,3,3,1.0,1,2
88184,3260593817,1,32,1133.33,0,7,1,7,0,1,1,1,0,1,0,1,0,0,0,0,2800,0,2,54.0,1.0,2,2,2,1,8,2,4,2.0,1,2


Let's remember that our main variable is `est_socio`, which represents the socioeconomic stratum. This variable is the classification of housing in the country according to the socioeconomic characteristics of the inhabitants, as well as physical characteristics and equipment. It takes 4 values, which represent:
* 1 = Low socioeconomic stratum
* 2 = Lower-Middle socioeconomic stratum
* 3 = Upper-Middle socioeconomic stratum
* 4 = High socioeconomic stratum

To make further analysis easier, in this Jupyter notebook we will convert these `int` codes to `str` labels. In addition, we have some binary variables coded as 1 = true and 2 = false. We will recode these numbers as 0 = true and 1 = false to simplify further analysis.


In [35]:
# Convert the integer codes in `est_socio` to descriptive string labels.
# The result is assigned back to the column: calling
# `df['est_socio'].replace(..., inplace=True)` acts on a selected Series,
# which is deprecated and stops updating `df` under pandas Copy-on-Write.
stratum_labels = {1: 'Low stratum', 2: 'Lower-middle stratum',
                  3: 'Upper-middle stratum', 4: 'High stratum'}
df['est_socio'] = df['est_socio'].replace(stratum_labels)


# Recode binary variables from the 1/2 coding to 0/1.
# `df[binaries].replace(..., inplace=True)` modified a temporary copy of the
# selected columns, so `df` itself was never changed. The recode is now
# assigned back column by column.
binaries = ['poca_variedad_alimentos','telefono', 'celular', 'conex_inte','tarjeta','pagotarjet','negcua']
for col in binaries:
    # Only recode columns that really use the 1/2 coding (every value is 1 or 2
    # and at least one 2 is present). A column that already holds 0/1 is left
    # untouched, because mapping 1 -> 0 on it would wipe out its information.
    uses_one_two_coding = df[col].isin([1, 2]).all() and df[col].eq(2).any()
    if uses_one_two_coding:
        df[col] = df[col].map({1: 0, 2: 1})

# Transform ab_agua into int
df['ab_agua'] = df['ab_agua'].astype(int)


# Feature engineering starts here

# Total number of devices per household: sum of the num_compu, num_lap,
# num_table and num_tvd counts. The individual columns are dropped afterwards.
device_columns = ['num_compu', 'num_lap', 'num_table', 'num_tvd']
df['total_dispositivos'] = df['num_compu'] + df['num_lap'] + df['num_table'] + df['num_tvd']
df = df.drop(columns=device_columns)

# Rename the age column and store it as int
df = df.rename(columns={'edad':'edad_promedio'})
df['edad_promedio'] = df['edad_promedio'].astype(int)

df

Unnamed: 0,folioviv,foliohog_x,entidad_x,ingreso_promedio,poca_variedad_alimentos,alimentacion_cereales,alimentacion_carne,alimentacion_huevo,alimentacion_pescado,telefono,celular,conex_inte,num_auto,tarjeta,pagotarjet,negcua,est_alim,est_trans,numren,edad_promedio,alfabetism,asis_esc,num_trabaj,discapacidad,tipo_viv,mat_pared,mat_pisos,num_cuarto,ab_agua,disp_elect,est_socio,total_dispositivos
0,100001901,1,1,40546.67,0,7,3,7,0,0,1,1,1,0,0,0,5100,0,4,16,1.0,4,4,4,7,8,3,4,1,1,Upper-middle stratum,2
1,100001902,1,1,32666.67,0,7,4,7,1,1,1,1,1,1,0,0,12000,0,4,34,1.0,4,4,4,1,8,3,4,1,1,Upper-middle stratum,2
2,100001904,1,1,14083.33,1,7,7,7,0,1,1,1,1,0,0,0,4350,0,2,44,1.0,2,2,2,1,8,3,3,1,1,Upper-middle stratum,2
3,100001905,1,1,30700.00,0,7,4,7,0,1,1,1,1,1,1,0,8000,0,4,28,1.0,4,4,4,1,8,3,3,1,1,Upper-middle stratum,3
4,100002501,1,1,44288.00,0,7,3,3,2,1,1,1,0,1,0,0,12000,2000,4,19,1.0,4,4,4,4,8,3,3,1,1,Lower-middle stratum,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88181,3260593814,1,32,12373.33,0,7,1,2,0,0,1,1,0,0,0,0,6000,0,3,37,1.0,3,3,3,1,8,3,2,2,1,Lower-middle stratum,1
88182,3260593815,1,32,32000.00,0,7,4,6,1,0,1,1,1,1,0,0,6000,0,5,16,1.0,5,5,5,1,8,2,4,1,1,Lower-middle stratum,3
88183,3260593816,1,32,15991.67,0,7,3,7,0,0,1,1,0,0,0,0,3800,420,5,29,1.0,5,5,5,1,8,3,3,1,1,Lower-middle stratum,1
88184,3260593817,1,32,1133.33,0,7,1,7,0,1,1,1,0,0,0,0,2800,0,2,54,1.0,2,2,2,1,8,2,4,2,1,Lower-middle stratum,2


In [36]:
# Per-stratum subsets, taken before any of the adjustments below
df_low = df[df['est_socio'] == 'Low stratum']
df_low_mid = df[df['est_socio'] == 'Lower-middle stratum']
df_up_mid = df[df['est_socio'] == 'Upper-middle stratum']
df_high = df[df['est_socio'] == 'High stratum']

# Mean income per stratum (pre-adjustment), rounded to 2 decimals
for label, subset in [('Low:', df_low), ('Low-mid:', df_low_mid),
                      ('Up-mid:', df_up_mid), ('High:', df_high)]:
    print(label, round(subset['ingreso_promedio'].mean(), 2))


# Offsets added to each column, keyed by stratum label.
# NOTE(review): these shift feature values according to the target label
# (`est_socio`) — confirm this is intended before using the data for modelling.
# NOTE(review): this cell is not idempotent; re-running it applies the offsets
# again on top of the already shifted values.
stratum_offsets = {
    # Changes to average income
    'ingreso_promedio': {
        'Low stratum': -5000,
        'Lower-middle stratum': -1200,
        'Upper-middle stratum': 5000,
        'High stratum': 14000,
    },
    # Changes to the number of devices. The original cell applied +2 / +4 to
    # the upper-middle / high strata twice in a row; the net effect (+4 / +8)
    # is kept here. NOTE(review): confirm the double application was intended.
    # NOTE(review): the -1 can leave 'Low stratum' rows with a negative count.
    'total_dispositivos': {
        'Low stratum': -1,
        'Upper-middle stratum': 4,
        'High stratum': 8,
    },
    # Changes to the number of cars
    'num_auto': {
        'Lower-middle stratum': 1,
        'Upper-middle stratum': 2,
        'High stratum': 3,
    },
}

for column, offsets in stratum_offsets.items():
    for stratum, delta in offsets.items():
        df.loc[df['est_socio'] == stratum, column] += delta

# Remove negative incomes by flipping their sign (i.e. taking the absolute value)
condicion = (df['ingreso_promedio'] < 0)
df.loc[condicion, 'ingreso_promedio'] *= -1

df.head()

Low: 14733.95
Low-mid: 19779.63
Up-mid: 26364.59
High: 42127.97


Unnamed: 0,folioviv,foliohog_x,entidad_x,ingreso_promedio,poca_variedad_alimentos,alimentacion_cereales,alimentacion_carne,alimentacion_huevo,alimentacion_pescado,telefono,celular,conex_inte,num_auto,tarjeta,pagotarjet,negcua,est_alim,est_trans,numren,edad_promedio,alfabetism,asis_esc,num_trabaj,discapacidad,tipo_viv,mat_pared,mat_pisos,num_cuarto,ab_agua,disp_elect,est_socio,total_dispositivos
0,100001901,1,1,45546.67,0,7,3,7,0,0,1,1,3,0,0,0,5100,0,4,16,1.0,4,4,4,7,8,3,4,1,1,Upper-middle stratum,6
1,100001902,1,1,37666.67,0,7,4,7,1,1,1,1,3,1,0,0,12000,0,4,34,1.0,4,4,4,1,8,3,4,1,1,Upper-middle stratum,6
2,100001904,1,1,19083.33,1,7,7,7,0,1,1,1,3,0,0,0,4350,0,2,44,1.0,2,2,2,1,8,3,3,1,1,Upper-middle stratum,6
3,100001905,1,1,35700.0,0,7,4,7,0,1,1,1,3,1,1,0,8000,0,4,28,1.0,4,4,4,1,8,3,3,1,1,Upper-middle stratum,7
4,100002501,1,1,43088.0,0,7,3,3,2,1,1,1,1,1,0,0,12000,2000,4,19,1.0,4,4,4,4,8,3,3,1,1,Lower-middle stratum,4


Let's count the number of records we have for each class.

In [37]:
# Number of records in each socioeconomic stratum, most frequent first
stratum_counts = df['est_socio'].value_counts()
stratum_counts

est_socio
Lower-middle stratum    45741
Upper-middle stratum    18107
Low stratum             16015
High stratum             8323
Name: count, dtype: int64

Since we have an unbalanced number of records per class, we will apply SMOTE to our dataset. SMOTE (Synthetic Minority Over-sampling Technique) is a powerful tool in machine learning for handling unbalanced classes. Instead of duplicating records of the minority classes, SMOTE creates new, synthetic records based on already existing ones.

In [38]:
# Libraries needed for the oversampling step
from collections import Counter
from imblearn.over_sampling import SMOTE

# Target (stratum label) and feature matrix (every other column)
y = df['est_socio']
X = df.drop(columns='est_socio')

print('Before Smote:', Counter(y))

# Oversample with a fixed seed so the result is reproducible.
# NOTE(review): X still contains identifier-like columns (folioviv, foliohog_x)
# and coded categorical columns (e.g. entidad_x); SMOTE is fitted on all of
# them as-is — confirm that is acceptable.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X,y)

print('After SMOTE:',Counter(y_res))

# Rebuild a single frame: resampled features followed by the label column
resampled_target = y_res.reset_index(drop=True)
df = pd.concat([X_res, resampled_target], axis=1)

Before Smote: Counter({'Lower-middle stratum': 45741, 'Upper-middle stratum': 18107, 'Low stratum': 16015, 'High stratum': 8323})
After SMOTE: Counter({'Upper-middle stratum': 45741, 'Lower-middle stratum': 45741, 'High stratum': 45741, 'Low stratum': 45741})




We will also create a dictionary with the Mexican entity names and map them onto our dataframe to make the entity column easier to understand.

In [39]:
# Entity names listed in code order: the name at position i (counting from 1)
# belongs to entity code i
entity_names = [
    "Aguascalientes", "Baja_California", "Baja_California_Sur", "Campeche",
    "Coahuila", "Colima", "Chiapas", "Chihuahua",
    "Ciudad_de_Mexico", "Durango", "Guanajuato", "Guerrero",
    "Hidalgo", "Jalisco", "Mexico", "Michoacan",
    "Morelos", "Nayarit", "Nuevo_Leon", "Oaxaca",
    "Puebla", "Queretaro", "Quintana_Roo", "San_Luis_Potosi",
    "Sinaloa", "Sonora", "Tabasco", "Tamaulipas",
    "Tlaxcala", "Veracruz", "Yucatan", "Zacatecas",
]
entidades_dict = dict(enumerate(entity_names, start=1))

# Replace the numeric entity codes with the entity names
# (codes missing from the dictionary become NaN)
df['entidad_x'] = df['entidad_x'].map(entidades_dict)
df

Unnamed: 0,folioviv,foliohog_x,entidad_x,ingreso_promedio,poca_variedad_alimentos,alimentacion_cereales,alimentacion_carne,alimentacion_huevo,alimentacion_pescado,telefono,celular,conex_inte,num_auto,tarjeta,pagotarjet,negcua,est_alim,est_trans,numren,edad_promedio,alfabetism,asis_esc,num_trabaj,discapacidad,tipo_viv,mat_pared,mat_pisos,num_cuarto,ab_agua,disp_elect,total_dispositivos,est_socio
0,100001901,1,Aguascalientes,45546.670000,0,7,3,7,0,0,1,1,3,0,0,0,5100,0,4,16,1.0,4,4,4,7,8,3,4,1,1,6,Upper-middle stratum
1,100001902,1,Aguascalientes,37666.670000,0,7,4,7,1,1,1,1,3,1,0,0,12000,0,4,34,1.0,4,4,4,1,8,3,4,1,1,6,Upper-middle stratum
2,100001904,1,Aguascalientes,19083.330000,1,7,7,7,0,1,1,1,3,0,0,0,4350,0,2,44,1.0,2,2,2,1,8,3,3,1,1,6,Upper-middle stratum
3,100001905,1,Aguascalientes,35700.000000,0,7,4,7,0,1,1,1,3,1,1,0,8000,0,4,28,1.0,4,4,4,1,8,3,3,1,1,7,Upper-middle stratum
4,100002501,1,Aguascalientes,43088.000000,0,7,3,3,2,1,1,1,1,1,0,0,12000,2000,4,19,1.0,4,4,4,4,8,3,3,1,1,4,Lower-middle stratum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182959,1906196675,1,Nuevo_Leon,73350.898434,0,7,4,6,2,0,1,1,3,1,0,0,9543,175,5,33,1.0,5,5,5,1,8,3,4,1,1,8,Upper-middle stratum
182960,1104006155,1,Guanajuato,37596.111565,0,7,6,1,0,1,1,1,3,1,0,0,11448,0,5,25,1.0,5,5,5,1,8,3,4,1,1,7,Upper-middle stratum
182961,2307502024,1,Quintana_Roo,40875.684852,0,3,5,3,0,0,1,1,2,0,0,0,3922,672,2,63,1.0,2,2,2,1,7,2,3,1,1,6,Upper-middle stratum
182962,3200508979,1,Zacatecas,35692.479585,0,7,3,2,0,0,1,1,3,1,0,0,2751,135,1,27,1.0,1,1,1,3,8,3,3,1,1,5,Upper-middle stratum


In [40]:
# Count missing values per column
print('NaN Values')
print(df.isna().sum(),'\n\n')

# Count fully duplicated rows BEFORE removing them. Counting after the drop
# (as the cell did previously) always reports 0 and verifies nothing.
duplicated_rows = df.duplicated().sum()
print('Duplicated rows')
print(duplicated_rows)

# Remove the duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()
assert not df.duplicated().any(), "duplicated rows remain after drop_duplicates"


NaN Values
folioviv                   0
foliohog_x                 0
entidad_x                  0
ingreso_promedio           0
poca_variedad_alimentos    0
alimentacion_cereales      0
alimentacion_carne         0
alimentacion_huevo         0
alimentacion_pescado       0
telefono                   0
celular                    0
conex_inte                 0
num_auto                   0
tarjeta                    0
pagotarjet                 0
negcua                     0
est_alim                   0
est_trans                  0
numren                     0
edad_promedio              0
alfabetism                 0
asis_esc                   0
num_trabaj                 0
discapacidad               0
tipo_viv                   0
mat_pared                  0
mat_pisos                  0
num_cuarto                 0
ab_agua                    0
disp_elect                 0
total_dispositivos         0
est_socio                  0
dtype: int64 


Duplicated rows
0


In [41]:
# Show the frame again after the NaN and duplicate checks
df

Unnamed: 0,folioviv,foliohog_x,entidad_x,ingreso_promedio,poca_variedad_alimentos,alimentacion_cereales,alimentacion_carne,alimentacion_huevo,alimentacion_pescado,telefono,celular,conex_inte,num_auto,tarjeta,pagotarjet,negcua,est_alim,est_trans,numren,edad_promedio,alfabetism,asis_esc,num_trabaj,discapacidad,tipo_viv,mat_pared,mat_pisos,num_cuarto,ab_agua,disp_elect,total_dispositivos,est_socio
0,100001901,1,Aguascalientes,45546.670000,0,7,3,7,0,0,1,1,3,0,0,0,5100,0,4,16,1.0,4,4,4,7,8,3,4,1,1,6,Upper-middle stratum
1,100001902,1,Aguascalientes,37666.670000,0,7,4,7,1,1,1,1,3,1,0,0,12000,0,4,34,1.0,4,4,4,1,8,3,4,1,1,6,Upper-middle stratum
2,100001904,1,Aguascalientes,19083.330000,1,7,7,7,0,1,1,1,3,0,0,0,4350,0,2,44,1.0,2,2,2,1,8,3,3,1,1,6,Upper-middle stratum
3,100001905,1,Aguascalientes,35700.000000,0,7,4,7,0,1,1,1,3,1,1,0,8000,0,4,28,1.0,4,4,4,1,8,3,3,1,1,7,Upper-middle stratum
4,100002501,1,Aguascalientes,43088.000000,0,7,3,3,2,1,1,1,1,1,0,0,12000,2000,4,19,1.0,4,4,4,4,8,3,3,1,1,4,Lower-middle stratum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182959,1906196675,1,Nuevo_Leon,73350.898434,0,7,4,6,2,0,1,1,3,1,0,0,9543,175,5,33,1.0,5,5,5,1,8,3,4,1,1,8,Upper-middle stratum
182960,1104006155,1,Guanajuato,37596.111565,0,7,6,1,0,1,1,1,3,1,0,0,11448,0,5,25,1.0,5,5,5,1,8,3,4,1,1,7,Upper-middle stratum
182961,2307502024,1,Quintana_Roo,40875.684852,0,3,5,3,0,0,1,1,2,0,0,0,3922,672,2,63,1.0,2,2,2,1,7,2,3,1,1,6,Upper-middle stratum
182962,3200508979,1,Zacatecas,35692.479585,0,7,3,2,0,0,1,1,3,1,0,0,2751,135,1,27,1.0,1,1,1,3,8,3,3,1,1,5,Upper-middle stratum


In [42]:
# Write the processed frame to disk as CSV. The row index is written as well
# (pandas' default, spelled out here), so the file starts with an unnamed
# index column.
df.to_csv('df2_eneigh_2024.csv', index=True)