In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Part I ##

# 2. Load the data
RUTA_HOGARES = "C:/Users/tomas/Documents/UdeSA/Cuarto Año/Primer Cuatri/Big Data/TP4/usu_hogar_T423.xlsx"
RUTA_INDIVIDUOS = "C:/Users/tomas/Documents/UdeSA/Cuarto Año/Primer Cuatri/Big Data/TP4/usu_individual_T423.xlsx"


def _solo_aglomerados(base):
    """Return the rows of ``base`` whose AGLOMERADO value is 32 or 33."""
    en_aglomerado = base['AGLOMERADO'].isin([32, 33])
    return base[en_aglomerado]


# Households table, restricted to the selected AGLOMERADO codes
hogares = _solo_aglomerados(pd.read_excel(RUTA_HOGARES))

# Individuals table, restricted to the same AGLOMERADO codes
individuos = _solo_aglomerados(pd.read_excel(RUTA_INDIVIDUOS))






























In [None]:
# Merge the individuals table with the households table on the household key.
# Columns present in both tables receive a suffix: '_x' for the households copy,
# '_y' for the individuals copy.
df = pd.merge(hogares, individuos, on=['CODUSU', 'NRO_HOGAR'], suffixes=('_x', '_y'))

# Keep only the households copy of every duplicated column.
df = df.drop(columns=[col for col in df.columns if col.endswith('_y')])

# Remove the literal '_x' suffix. str.rstrip('_x') is not used because it strips
# any trailing run of the characters '_' and 'x', which can eat part of the real
# column name (e.g. 'max_x' would become 'ma').
df.columns = [col[:-len('_x')] if col.endswith('_x') else col for col in df.columns]



In [None]:
# 3. Clean the dataset

# Categorical variable: MAS_500 becomes 1 where it equals 'S' and 0 everywhere else
df['MAS_500'] = df['MAS_500'].eq('S').astype(int)

# Missing values: count them per column, then drop the rows with no P47T and,
# after that, every column that still contains a missing value
missing_values = df.isna().sum()
df = df.dropna(subset=['P47T']).dropna(axis=1)

# Outliers / meaningless values: keep only rows with a strictly positive CH06
df = df.loc[df['CH06'] > 0]

cols = ['P47T', 'T_VI' , 'ITF', 'IPCF']

# One box plot per column, each on its own figure
for col in cols:
    plt.figure(figsize=(10, 6))
    df.boxplot(column=col)
    plt.title(col)
    plt.xticks(rotation=45)
    plt.show()



In [None]:
# 4. Variables used to predict poverty

# A household is identified by (CODUSU, NRO_HOGAR) -- the same pair used as the
# merge key between the households and individuals tables. Grouping on CODUSU
# alone would pool every row that shares a CODUSU, whatever its NRO_HOGAR.
claves_hogar = ['CODUSU', 'NRO_HOGAR']

# Drop the outputs of a previous execution so that re-running this cell does not
# produce '_x' / '_y' duplicates in the merges below.
df = df.drop(
    columns=['proporcion_jovenes', 'proporcion_trabaja', 'proporcion_leer'],
    errors='ignore',
)

casa = df.groupby(claves_hogar)
total_personas = casa.size()
grupo_hogar = [df[clave] for clave in claves_hogar]

# Share of children in the household (CH06 < 18)
child = df['CH06'].lt(18).groupby(grupo_hogar).sum()
proporcion_jovenes = (child / total_personas).reset_index(name='proporcion_jovenes')

# Share of household members who work (ESTADO == 1)
trabaja = df['ESTADO'].eq(1).groupby(grupo_hogar).sum()
proporcion_trabaja = (trabaja / total_personas).reset_index(name='proporcion_trabaja')

# Share of household members who can read and write (CH09 == 1)
leer_escribir = df['CH09'].eq(1).groupby(grupo_hogar).sum()
proporcion_leer = (leer_escribir / total_personas).reset_index(name='proporcion_leer')

# Attach the three household-level shares to every member row.
df = pd.merge(df, proporcion_jovenes, on=claves_hogar)
df = pd.merge(df, proporcion_trabaja, on=claves_hogar)
df = pd.merge(df, proporcion_leer, on=claves_hogar)



In [None]:
# 5. Descriptive statistics

# Columns summarised in this section
desc = [
    'ITF',
    'NIVEL_ED',
    'P21',
    'proporcion_jovenes',
    'proporcion_trabaja',
]

def format_stats(stats):
    """Format every numeric entry of ``stats`` as a string like ``1,234.57``.

    Parameters
    ----------
    stats : pandas.DataFrame or pandas.Series
        Table (or column) of summary statistics.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Same shape as ``stats``; numeric entries become strings with a thousands
        separator and two decimals, every other entry is returned unchanged.
    """
    def _fmt(x):
        if isinstance(x, (int, float, np.integer, np.floating)):
            return "{:,.2f}".format(x)
        return x

    if isinstance(stats, pd.DataFrame):
        # DataFrame.apply hands each column to the callable as a Series, so the
        # per-value formatting has to be mapped inside every column.
        return stats.apply(lambda columna: columna.map(_fmt))
    return stats.apply(_fmt)

# Print the describe() table of each selected column as a single formatted row,
# followed by a blank line.
for column in desc:
    encabezado = f"Estadistica descriptiva para '{column}':"
    fila = format_stats(df[column].describe().to_frame().T).to_string(index=False)
    print(encabezado, fila, "", sep="\n")



In [None]:
# 6. Relationship between variables: P47T against CH06

# Only rows with P47T below 1,000,000 are plotted
pr = df.loc[df['P47T'].lt(1000000)]

fig, ax = plt.subplots()
ax.scatter(pr['CH06'], pr['P47T'], alpha=0.5)
ax.set_title("Scatter Plot of CH06 and P47T")
ax.set_ylabel("P47T")
ax.set_xlabel("CH06")
ax.grid(True)
plt.show()



In [None]:
# 7. Energy needs (adult-equivalent units)

# Age brackets for pd.cut. Intervals are right-closed, so (-1, 0] is labelled
# 'less_than_1', (0, 1] is '1_year', ..., (17, 29] is '18_to_29' and
# (75, inf] is 'more_than_75'.
bins = [-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 29, 45, 60, 75, float('inf')]
labels = [
    'less_than_1', '1_year', '2_year', '3_year', '4_year', '5_year', '6_year', '7_year', '8_year', '9_year', '10_year',
    '11_year', '12_year', '13_year', '14_year', '15_year', '16_year', '17_year', '18_to_29', '30_to_45',
    '46_to_60', '61_to_75', 'more_than_75'
]

# Adult-equivalent coefficient for each age bracket, keyed by the CH04 code (1 or 2).
energy_needs = {
    'less_than_1': {1: 0.35, 2: 0.35},
    '1_year': {1: 0.37, 2: 0.37},
    '2_year': {1: 0.46, 2: 0.46},
    '3_year': {1: 0.51, 2: 0.51},
    '4_year': {1: 0.55, 2: 0.55},
    '5_year': {1: 0.60, 2: 0.60},
    '6_year': {1: 0.64, 2: 0.64},
    '7_year': {1: 0.66, 2: 0.66},
    '8_year': {1: 0.68, 2: 0.68},
    '9_year': {1: 0.69, 2: 0.69},
    '10_year': {1: 0.79, 2: 0.70},
    '11_year': {1: 0.82, 2: 0.72},
    '12_year': {1: 0.85, 2: 0.74},
    '13_year': {1: 0.90, 2: 0.76},
    '14_year': {1: 0.96, 2: 0.76},
    '15_year': {1: 1.00, 2: 0.77},
    '16_year': {1: 1.03, 2: 0.77},
    '17_year': {1: 1.04, 2: 0.77},
    '18_to_29': {1: 1.02, 2: 0.76},
    '30_to_45': {1: 1.00, 2: 0.77},
    '46_to_60': {1: 1.00, 2: 0.76},
    '61_to_75': {1: 0.83, 2: 0.67},
    'more_than_75': {1: 0.74, 2: 0.63}
}

# Age bracket of each person. It is kept as a plain object Series (not added to
# df) so that mapping labels to coefficients does not go through categorical
# semantics and no helper column has to be dropped afterwards.
age_group = pd.cut(df['CH06'], bins=bins, labels=labels).astype(object)

# Coefficient every row would get under each CH04 code (NaN when the age falls
# outside all brackets).
coef_por_sexo = {
    sex: age_group.map({label: valores[sex] for label, valores in energy_needs.items()}).astype(float)
    for sex in (1, 2)
}

# Pick the coefficient matching the row's CH04 code. Rows whose CH04 is neither
# 1 nor 2, or whose age is outside the brackets, get 0.
df['adulto_equiv'] = pd.Series(
    np.select(
        [df['CH04'].eq(1).to_numpy(), df['CH04'].eq(2).to_numpy()],
        [coef_por_sexo[1].to_numpy(), coef_por_sexo[2].to_numpy()],
        default=0.0,
    ),
    index=df.index,
).fillna(0.0)

# Household total of adult equivalents. The household key is (CODUSU, NRO_HOGAR),
# the same pair used to merge the households and individuals tables.
df['ad_equiv_hogar'] = df.groupby(['CODUSU', 'NRO_HOGAR'])['adulto_equiv'].transform('sum')

# Inspection view of the columns involved in the computation.
test = df[['CODUSU','CH04', 'CH06', 'adulto_equiv', 'ad_equiv_hogar' , 'ITF']]



In [None]:
# 8. Split the dataset and flag each row as poor / not poor

# Rows with a strictly positive ITF versus rows with ITF <= 0
reporta_ingreso = df['ITF'] > 0
respondieron = df.loc[reporta_ingreso].copy()
norespondieron = df.loc[df['ITF'] <= 0].copy()

# Income required by the household: its adult equivalents times 132853.3
# NOTE(review): 132853.3 is presumably the poverty-line value per adult
# equivalent for the survey period -- confirm its source.
respondieron['ingreso_necesario'] = 132853.3 * respondieron['ad_equiv_hogar']

# pobre = 1 when ITF is below the required income, 0 otherwise
respondieron['pobre'] = respondieron['ITF'].lt(respondieron['ingreso_necesario']).astype(int)

# Inspection view of the columns involved in the classification
test = respondieron[['CODUSU','CH04', 'CH06', 'adulto_equiv', 'ad_equiv_hogar' ,'PONDIH', 'ITF', 'ingreso_necesario', 'pobre']]



In [None]:
# 9. PONDIH
# Inspection view of the rows with ITF <= 0.
test = norespondieron[['CODUSU','CH04', 'CH06', 'PONDIH', 'ITF']]

# Keep one row per household. The household key is (CODUSU, NRO_HOGAR), the same
# pair used to merge the households and individuals tables; de-duplicating on
# CODUSU alone would collapse every row sharing a CODUSU into a single household.
hogar = respondieron.drop_duplicates(subset=['CODUSU', 'NRO_HOGAR'])

total_hogares = len(hogar)

hogares_pobres = hogar['pobre'].sum()

# Guard the empty case explicitly instead of dividing by zero.
proporcion_hogares_pobres = hogares_pobres / total_hogares if total_hogares else float('nan')

# NOTE(review): this is an unweighted share of households. The section is titled
# PONDIH but the PONDIH weights are never applied -- confirm whether the rate is
# meant to be weighted.
print('Proporcion de hogares pobres:', proporcion_hogares_pobres)



In [None]:
## Parte II ##

# 1. Funcion para evaluar metodo

def evaluar_metodo(modelo, X_train, X_test, y_train, y_test):
    """Fit ``modelo`` on the training split and evaluate it on the test split.

    Prints the confusion matrix, the ROC AUC and the accuracy, and shows the ROC
    curve in a new figure.

    Parameters
    ----------
    modelo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. When it also exposes
        ``predict_proba`` or ``decision_function``, that continuous output is
        used to build the ROC curve.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Binary labels of the training and test splits.

    Returns
    -------
    dict
        Keys ``"confusion_matrix"``, ``"accuracy"`` and ``"roc_auc"``.
    """
    modelo.fit(X_train, y_train)

    # Hard class predictions on the test split (confusion matrix and accuracy).
    y_pred = modelo.predict(X_test)

    # The ROC curve needs a continuous score: with hard 0/1 predictions it has a
    # single operating point and the reported AUC is not the model's real AUC.
    y_score = y_pred
    if hasattr(modelo, "predict_proba"):
        proba = modelo.predict_proba(X_test)
        # Column 1 is the probability of the second (larger) class label; it only
        # exists when both classes were present in the training split.
        if proba.ndim == 2 and proba.shape[1] == 2:
            y_score = proba[:, 1]
    elif hasattr(modelo, "decision_function"):
        y_score = modelo.decision_function(X_test)

    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Matriz de Confusión:\n", conf_matrix)

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    print("AUC del ROC:", roc_auc)  # AUC of ROC curve

    accuracy = accuracy_score(y_test, y_pred)
    print("Precisión (Accuracy):", accuracy)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='Curva ROC (área = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Tasa de Falsos Positivos (FPR)')
    plt.ylabel('Tasa de Verdaderos Positivos (TPR)')
    # Title names the model actually evaluated instead of always saying KNN.
    plt.title('Curva ROC para %s' % type(modelo).__name__)
    plt.legend(loc="lower right")
    plt.show()

    return {"confusion_matrix": conf_matrix, "accuracy": accuracy, "roc_auc": roc_auc}




In [None]:
# 2. Cross Validation

def cross_validation(modelo, k, X, y):
    """Evaluate ``modelo`` with k-fold cross validation.

    Parameters
    ----------
    modelo : estimator
        Model passed to ``evaluar_metodo`` on every fold.
    k : int
        Number of folds given to ``KFold``.
    X : array-like or pandas.DataFrame
        Feature matrix, one row per observation.
    y : array-like or pandas.Series
        Labels, aligned row by row with ``X``.

    Returns
    -------
    list of dict
        One ``evaluar_metodo`` result per fold, in fold order.
    """
    def _tomar_filas(datos, indices):
        # KFold yields positional indices. pandas objects need .iloc for that:
        # datos[indices] would be read as a label/column selection.
        if hasattr(datos, "iloc"):
            return datos.iloc[indices]
        return np.asarray(datos)[indices]

    kf = KFold(n_splits=k)
    resultados = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = _tomar_filas(X, train_index), _tomar_filas(X, test_index)
        y_train, y_test = _tomar_filas(y, train_index), _tomar_filas(y, test_index)

        resultado_iteracion = evaluar_metodo(modelo, X_train, X_test, y_train, y_test)
        resultados.append(resultado_iteracion)

    return resultados




In [None]:
# 3. Evalua Config
















In [None]:
## Part III ##


# 1. Preprocessing
# Columns excluded from the predictors. 'pobre' is computed from ITF and
# ad_equiv_hogar, so those columns in particular must not be fed to the models.
cols_ingreso = ['CODUSU', 'ITF', 'IPCF', 'PP06C', 'PP06D', 'PP08D1',  'P21', 'TOT_P12',
                'P47T', 'V2_M', 'V3_M', 'V4_M', 'V5_M', 'V8_M',
                'V9_M', 'V10_M', 'V11_M', 'V12_M', 'V18_M', 'V21_M', 'T_VI',
                'adulto_equiv', 'ad_equiv_hogar', 'CH05', 'PP09A_ESP',
                'DECINDR', 'ADECINDR', 'RDECINDR', 'PDECINDR', 'GDECINDR', 'IDECINDR',
                'DECOCUR','ADECOCUR','RDECOCUR','PDECOCUR','GDECOCUR','IDECOCUR',
                'DECIFR','ADECIFR','RDECIFR','PDECIFR','GDECIFR','IDECIFR','DECCFR',
                'ADECCFR','RDECCFR','PDECCFR','GDECCFR','IDECCFR'
                ]

# errors='ignore' tolerates listed columns that are no longer present.
df_res = respondieron.drop(columns = cols_ingreso + ['ingreso_necesario'], errors='ignore')
df_res = df_res.dropna(axis=1)
df_nores = norespondieron.drop(columns = cols_ingreso, errors = 'ignore')
df_nores = df_nores.dropna(axis=1)

X = df_res.drop(columns=['pobre'])
X['intercept'] = 1
y = df_res['pobre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Standardise the features before fitting either model. KNN is distance based,
# so unscaled columns with large ranges would dominate the neighbour search.
# The scaler is fitted on the training split only and then applied to both.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=3)

knn_test = evaluar_metodo(knn, X_train_scaled, X_test_scaled, y_train, y_test)

logit = LogisticRegression(penalty=None, max_iter=1000)

logit_test = evaluar_metodo(logit, X_train_scaled, X_test_scaled, y_train, y_test)

# Cross validation: the scaler lives inside a pipeline, so it is refitted on the
# training part of every fold and the held-out part never influences it.
knn_cv = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))

results_cross_validation = cross_validation(knn_cv, 5, X.values, y.values)