# Desafío 1 - Clasificación desde la Econometría
Autor: Walther Becks

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings


In [4]:
plt.style.use('seaborn') # gráficos estilo seaborn
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams["figure.dpi"] = 200

In [3]:
df = pd.read_csv('southafricanheart.csv')
df = df.drop('Unnamed: 0', axis = 1)
df.describe()

Unnamed: 0,sbp,tobacco,ldl,adiposity,typea,obesity,alcohol,age,chd
count,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0,462.0
mean,138.32684,3.635649,4.740325,25.406732,53.103896,26.044113,17.044394,42.816017,0.34632
std,20.496317,4.593024,2.070909,7.780699,9.817534,4.21368,24.481059,14.608956,0.476313
min,101.0,0.0,0.98,6.74,13.0,14.7,0.0,15.0,0.0
25%,124.0,0.0525,3.2825,19.775,47.0,22.985,0.51,31.0,0.0
50%,134.0,2.0,4.34,26.115,53.0,25.805,7.51,45.0,0.0
75%,148.0,5.5,5.79,31.2275,60.0,28.4975,23.8925,55.0,1.0
max,218.0,31.2,15.33,42.49,78.0,46.58,147.19,64.0,1.0


In [5]:
df['famhist'].value_counts()

Absent     270
Present    192
Name: famhist, dtype: int64

In [6]:
df['chd'].value_counts()

0    302
1    160
Name: chd, dtype: int64

## Desafío 2

In [7]:
df['famhist'] = df['famhist'].replace(['Absent','Present'],[0,1])
df['famhist'].value_counts()

0    270
1    192
Name: famhist, dtype: int64

In [8]:
model1 = smf.logit('chd ~ famhist', df).fit()

Optimization terminated successfully.
         Current function value: 0.608111
         Iterations 5


In [14]:
model1.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.057
Dependent Variable:,chd,AIC:,565.8944
Date:,2022-05-20 17:01,BIC:,574.1655
No. Observations:,462,Log-Likelihood:,-280.95
Df Model:,1,LL-Null:,-298.05
Df Residuals:,460,LLR p-value:,4.9371e-09
Converged:,1.0000,Scale:,1.0
No. Iterations:,5.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-1.1690,0.1431,-8.1687,0.0000,-1.4495,-0.8885
famhist,1.1690,0.2033,5.7514,0.0000,0.7706,1.5674


In [10]:
def inverse_logit(x):
    return 1 / (1 + np.exp(-x))

In [13]:
prob_famhist = inverse_logit(model1.params['Intercept'] + model1.params['famhist'])
prob_intercept = inverse_logit(model1.params['Intercept'])

In [30]:
print("- Cuando hay evidencia de antecedentes familiares, existe la probabilidad de tener una enfermedad cardiaca del: ", prob_famhist*100,"%")
print("- Cuando NO hay evidencia de antecedentes familiares, existe la probabilidad de tener una enfermedad cardiaca del: ", prob_intercept.round(2)*100,"%")
print("- Una persona con antecedentes familiares, tiene más probabilidad de tener una enfermedad caridaca en comparación a una persona sin antecedentes del: ",(prob_famhist-prob_intercept.round(2))*100,"%")

- Cuando hay evidencia de antecedentes familiares, existe la probabilidad de tener una enfermedad cardiaca del:  50.0 %
- Cuando NO hay evidencia de antecedentes familiares, existe la probabilidad de tener una enfermedad cardiaca del:  24.0 %
- Una persona con antecedentes familiares, tiene más probabilidad de tener una enfermedad caridaca en comparación a una persona sin antecedentes del:  26.0 %


### Estimación por LPM

In [31]:
model1_lpm = smf.ols('chd ~ famhist',df).fit()
model1_lpm.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.072
Dependent Variable:,chd,AIC:,593.1725
Date:,2022-05-20 17:15,BIC:,601.4437
No. Observations:,462,Log-Likelihood:,-294.59
Df Model:,1,F-statistic:,36.86
Df Residuals:,460,Prob (F-statistic):,2.66e-09
R-squared:,0.074,Scale:,0.2105

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.2370,0.0279,8.4893,0.0000,0.1822,0.2919
famhist,0.2630,0.0433,6.0713,0.0000,0.1778,0.3481

0,1,2,3
Omnibus:,768.898,Durbin-Watson:,1.961
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.778
Skew:,0.579,Prob(JB):,0.0
Kurtosis:,1.692,Condition No.:,2.0


## Desafío 3: Estimación completa

In [35]:
model2 = smf.logit('chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + obesity + alcohol + age', df).fit()
model2.summary2()

Optimization terminated successfully.
         Current function value: 0.510974
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.208
Dependent Variable:,chd,AIC:,492.14
Date:,2022-05-20 17:17,BIC:,533.4957
No. Observations:,462,Log-Likelihood:,-236.07
Df Model:,9,LL-Null:,-298.05
Df Residuals:,452,LLR p-value:,2.0548e-22
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.1507,1.3083,-4.7015,0.0000,-8.7149,-3.5866
sbp,0.0065,0.0057,1.1350,0.2564,-0.0047,0.0177
tobacco,0.0794,0.0266,2.9838,0.0028,0.0272,0.1315
ldl,0.1739,0.0597,2.9152,0.0036,0.0570,0.2909
adiposity,0.0186,0.0293,0.6346,0.5257,-0.0388,0.0760
famhist,0.9254,0.2279,4.0605,0.0000,0.4787,1.3720
typea,0.0396,0.0123,3.2138,0.0013,0.0154,0.0637
obesity,-0.0629,0.0442,-1.4218,0.1551,-0.1496,0.0238
alcohol,0.0001,0.0045,0.0271,0.9784,-0.0087,0.0089


In [55]:
result = model2.summary2().tables[1].sort_values(by='P>|z|', ascending = True)
result[result['P>|z|'] <= 0.005]

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.150721,1.30826,-4.701451,3e-06,-8.714863,-3.586578
famhist,0.92537,0.227894,4.06053,4.9e-05,0.478706,1.372034
age,0.045225,0.01213,3.728464,0.000193,0.021451,0.068999
typea,0.039595,0.01232,3.213823,0.00131,0.015448,0.063742
tobacco,0.079376,0.026603,2.983758,0.002847,0.027236,0.131517
ldl,0.173924,0.059662,2.915166,0.003555,0.056989,0.290859


In [39]:
model3 = smf.logit('chd ~ tobacco + ldl + famhist + typea + age', df).fit()
model3.summary2()

Optimization terminated successfully.
         Current function value: 0.514811
         Iterations 6


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.202
Dependent Variable:,chd,AIC:,487.6856
Date:,2022-05-20 17:22,BIC:,512.499
No. Observations:,462,Log-Likelihood:,-237.84
Df Model:,5,LL-Null:,-298.05
Df Residuals:,456,LLR p-value:,2.5537000000000002e-24
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.4464,0.9209,-7.0004,0.0000,-8.2513,-4.6416
tobacco,0.0804,0.0259,3.1057,0.0019,0.0297,0.1311
ldl,0.1620,0.0550,2.9470,0.0032,0.0543,0.2697
famhist,0.9082,0.2258,4.0228,0.0001,0.4657,1.3507
typea,0.0371,0.0122,3.0505,0.0023,0.0133,0.0610
age,0.0505,0.0102,4.9442,0.0000,0.0305,0.0705


In [1]:
print("Al eliminar las 4 variables que tienen P<|z| = 0.005 podemos observar que nuestro modelo decrece en un 6% de explicatividad, lo que es un buen trade-off")

Al eliminar las 4 variables que tienen P<|z| = 0.005 podemos observar que nuestro modelo decrece en un 6% de explicatividad, lo que es un buen trade-off


## Desafío 4: Estimación de perfiles

In [40]:
def parammean(param, dataframe=df,model=model3):
    return model.params[param] * dataframe[param].mean()

###  La probabilidad de tener una enfermedad coronaria para un individuo con características similares a la muestra.

In [56]:
pr_mean = model3.params['Intercept'] + parammean('tobacco') + parammean('ldl') + \
parammean('typea') + parammean('age') + parammean('famhist')
inverse_logit(pr_mean)

0.2937092748158695

### La probabilidad de tener una enfermedad coronaria para un individuo con altos niveles de lipoproteína de baja densidad, manteniendo todas las demás características constantes.

In [58]:
pr_ldl_max = model3.params['Intercept'] + parammean('tobacco') + model3.params['ldl']*df['ldl'].max() + \
parammean('typea') + parammean('age') + parammean('famhist')
inverse_logit(pr_ldl_max)

0.6980443104466209

### La probabilidad de tener una enfermedad coronaria para un individuo con bajos niveles de lipoproteína de baja densidad, manteniendo todas las demáscaracterísticas constantes.

In [57]:
pr_ldl_min = model3.params['Intercept'] + parammean('tobacco') + model3.params['ldl']*df['ldl'].min() + \
parammean('typea') + parammean('age') + parammean('famhist')
inverse_logit(pr_ldl_min)

0.18443595575404642

In [2]:
print("Podemos observar que altos niveles de lopoproteína de baja densidad muestra una probabilidad del 70% en la explicación vs un 29% con individuos con caracteristicas similares a la muestra, lo que hace esta variable crítica en el desempeño del modelo")

Podemos observar que altos niveles de lopoproteína de baja densidad muestra una probabilidad del 70% en la explicación vs un 29% con individuos con caracteristicas similares a la muestra, lo que hace esta variable crítica en el desempeño del modelo
