# Limpieza

In [22]:
from pathlib import Path

import pandas as pd

# Read the raw 2016 CSV into a DataFrame
df_2016 = pd.read_csv('../data/raw/2016.csv')

# Configure pandas to show more rows and columns
display_settings = {
    'display.max_rows': 100,
    'display.max_columns': None,
    'display.width': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Preview the first five rows as an initial look at the data
df_2016.head(n=5)


Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Lower Confidence Interval,Upper Confidence Interval,Economy (GDP per Capita),Family,Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity,Dystopia Residual
0,Denmark,Western Europe,1,7.526,7.46,7.592,1.44178,1.16374,0.79504,0.57941,0.44453,0.36171,2.73939
1,Switzerland,Western Europe,2,7.509,7.428,7.59,1.52733,1.14524,0.86303,0.58557,0.41203,0.28083,2.69463
2,Iceland,Western Europe,3,7.501,7.333,7.669,1.42666,1.18326,0.86733,0.56624,0.14975,0.47678,2.83137
3,Norway,Western Europe,4,7.498,7.421,7.575,1.57744,1.1269,0.79579,0.59609,0.35776,0.37895,2.66465
4,Finland,Western Europe,5,7.413,7.351,7.475,1.40598,1.13464,0.81091,0.57104,0.41004,0.25492,2.82596


In [23]:
# Drop columns that are not needed for the analysis.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a strict drop would raise KeyError.
columns_to_drop = ['Lower Confidence Interval', 'Upper Confidence Interval', 'Family', 'Dystopia Residual']

df_2016 = df_2016.drop(columns=columns_to_drop, errors='ignore')

# Preview the first rows after dropping the columns.
# The display options configured right after loading the CSV are still in
# effect, so they are not repeated here.
print("Después de eliminar columnas innecesarias:")
df_2016.head(5)

Después de eliminar columnas innecesarias:


Unnamed: 0,Country,Region,Happiness Rank,Happiness Score,Economy (GDP per Capita),Health (Life Expectancy),Freedom,Trust (Government Corruption),Generosity
0,Denmark,Western Europe,1,7.526,1.44178,0.79504,0.57941,0.44453,0.36171
1,Switzerland,Western Europe,2,7.509,1.52733,0.86303,0.58557,0.41203,0.28083
2,Iceland,Western Europe,3,7.501,1.42666,0.86733,0.56624,0.14975,0.47678
3,Norway,Western Europe,4,7.498,1.57744,0.79579,0.59609,0.35776,0.37895
4,Finland,Western Europe,5,7.413,1.40598,0.81091,0.57104,0.41004,0.25492


In [24]:
# Normalize column names to underscore-separated identifiers.
# Only columns whose name actually changes are listed; rename() returns a new
# frame, which is reassigned here instead of mutating the existing one in place.
df_2016 = df_2016.rename(columns={
    'Happiness Rank': 'Happiness_Rank',
    'Happiness Score': 'Happiness_Score',
    'Economy (GDP per Capita)': 'GDP_per_Capita',
    'Health (Life Expectancy)': 'Healthy_life_expectancy',
    'Trust (Government Corruption)': 'Perceptions_of_corruption',
})

# Preview the first rows after renaming.
# The display options configured right after loading the CSV are still in
# effect, so they are not repeated here.
print("\nPrimeras filas después de renombrar columnas:")
df_2016.head(5)


Primeras filas después de renombrar columnas:


Unnamed: 0,Country,Region,Happiness_Rank,Happiness_Score,GDP_per_Capita,Healthy_life_expectancy,Freedom,Perceptions_of_corruption,Generosity
0,Denmark,Western Europe,1,7.526,1.44178,0.79504,0.57941,0.44453,0.36171
1,Switzerland,Western Europe,2,7.509,1.52733,0.86303,0.58557,0.41203,0.28083
2,Iceland,Western Europe,3,7.501,1.42666,0.86733,0.56624,0.14975,0.47678
3,Norway,Western Europe,4,7.498,1.57744,0.79579,0.59609,0.35776,0.37895
4,Finland,Western Europe,5,7.413,1.40598,0.81091,0.57104,0.41004,0.25492


In [25]:
# Initial exploration: column names, non-null counts and dtypes
print("Información general del archivo 2016:")
df_2016.info()

# Summary statistics. describe() is left as the cell's last expression so the
# notebook renders it as a table, matching how the preview cells show head(),
# rather than the wrapped plain text that print() produces.
print("\nEstadísticas generales del archivo 2016:")
df_2016.describe()

Información general del archivo 2016:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Country                    157 non-null    object 
 1   Region                     157 non-null    object 
 2   Happiness_Rank             157 non-null    int64  
 3   Happiness_Score            157 non-null    float64
 4   GDP_per_Capita             157 non-null    float64
 5   Healthy_life_expectancy    157 non-null    float64
 6   Freedom                    157 non-null    float64
 7   Perceptions_of_corruption  157 non-null    float64
 8   Generosity                 157 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 11.2+ KB

Estadísticas generales del archivo 2016:
       Happiness_Rank  Happiness_Score  GDP_per_Capita  \
count      157.000000       157.000000      157.000000   
mean        78.980892

In [26]:
# Report how many missing values each column contains
print("\nValores nulos por columna:")
null_counts = df_2016.isna().sum()
print(null_counts)


Valores nulos por columna:
Country                      0
Region                       0
Happiness_Rank               0
Happiness_Score              0
GDP_per_Capita               0
Healthy_life_expectancy      0
Freedom                      0
Perceptions_of_corruption    0
Generosity                   0
dtype: int64


In [27]:
# Flag rows that repeat an earlier row, then count them
dup_mask = df_2016.duplicated()
duplicados = dup_mask.sum()
print(f"Filas duplicadas: {duplicados}")

# When at least one duplicate exists, print the first few duplicated rows
if duplicados:
    print("\nPrimeras filas duplicadas:")
    print(df_2016.loc[dup_mask].head())

Filas duplicadas: 0


In [None]:
# Write the cleaned 2016 data to disk without the index column.
# The destination folder is created first because to_csv does not create
# missing parent directories and would otherwise fail on a fresh checkout.
clean_path = Path('../data/clean/2016_clean.csv')
clean_path.parent.mkdir(parents=True, exist_ok=True)
df_2016.to_csv(clean_path, index=False)
print("archivo guardado exitosamente.")

archivo guardado exitosamente.


: 

# Análisis