In [42]:
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import pandas as pd

# 1

In [27]:
X = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90]).reshape(-1, 1)
Y = np.array([420, 365, 285, 220, 176, 117, 69, 34, 5]).reshape(-1, 1)

De la librearía scikit-learn usamos LinearRegression, por minímos cuadrados tenemos que 

In [29]:
mls = LinearRegression()
mls.fit(X, Y)
print("beta_0 = ", mls.coef_)
print("beta_1 = ", mls.intercept_)

beta_0 =  [[-5.31333333]]
beta_1 =  [453.55555556]


Usando la librería statsmodels, tenemos que los intervalos de confianza son

In [30]:
X = sm.add_constant(X) 
model = sm.OLS(Y, X).fit()

intervalos_confianza = model.conf_int(alpha=0.05)  # 95% de confianza
print(intervalos_confianza)

[[419.51998893 487.59112219]
 [ -5.91816122  -4.70850544]]


# 2

In [31]:
X = np.array([2, 4, 6, 8, 10]).reshape(-1, 1)
Y = np.array([11.5, 10.2, 10.3, 9.68, 9.32]).reshape(-1, 1)

## (a)

In [33]:
mls = LinearRegression()
mls.fit(X, Y)
print("beta_0 = ", mls.coef_)
print("beta_1 = ", mls.intercept_)

beta_0 =  [[-0.244]]
beta_1 =  [11.664]


Utilizando la clase LinearRegresión de la librería scikit-learn

Obtenemos las predicciones $\hat{y}_i$

In [34]:
Y_pred = mls.predict(X)
print(Y_pred)

[[11.176]
 [10.688]
 [10.2  ]
 [ 9.712]
 [ 9.224]]


Calculando los residuos $E_i$

In [35]:
E = Y - Y_pred
print(E)

[[ 0.324]
 [-0.488]
 [ 0.1  ]
 [-0.032]
 [ 0.096]]


Calculando la desviación entadar de los residuos $\sigma$

In [36]:
E.std()

np.float64(0.2695774471279078)

## (b)

En clase vimos que $R^2$ se interpreta como la fracción de la varianza de la variable dependiente que es explicada por la variable independiente. Usando la función score de la clase LinearRegresion calculamos el $R^2$.

In [40]:
r2 = mls.score(X, Y)
r2

0.8676187700378899

# 3

## (a) y (b)

Utilizamos pandas para extraer los datos

In [46]:
df = pd.read_csv('./Prestige.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,education,income,women,prestige,census,type
0,gov.administrators,13.11,12351,11.16,68.8,1113,prof
1,general.managers,12.26,25879,4.02,69.1,1130,prof
2,accountants,12.77,9271,15.7,63.4,1171,prof
3,purchasing.officers,11.42,8865,9.11,56.8,1175,prof
4,chemists,14.62,8403,11.68,73.5,2111,prof


Usamos la función describe para saber con que columnas se puede calcular la regresión

In [47]:
df.describe()

Unnamed: 0,education,income,women,prestige,census
count,102.0,102.0,102.0,102.0,102.0
mean,10.738039,6797.901961,28.97902,46.833333,5401.77451
std,2.728444,4245.922227,31.724931,17.204486,2644.993215
min,6.38,611.0,0.0,14.8,1113.0
25%,8.445,4106.0,3.5925,35.225,3120.5
50%,10.54,5930.5,13.6,43.6,5135.0
75%,12.6475,8187.25,52.2025,59.275,8312.5
max,15.97,25879.0,97.51,87.2,9517.0


Vemos que las columnas 'education', 'income', 'women', 'census' son las numéricas

In [51]:
Y = df['prestige'].array.reshape(-1, 1)
for col in ['education', 'income', 'women', 'census']:
    X = df[col].array.reshape(-1, 1)
    mls = LinearRegression()
    mls.fit(X, Y)
    r2 = mls.score(X, Y)
    print(col, 'R^2 = ', r2)

education R^2 =  0.72280074071565
income R^2 =  0.5110901032277888
women R^2 =  0.014002980447122004
census R^2 =  0.4026032968245885


## (b)

Al ser el $R^2$ mayor que el de todas las demás variables, la variable mas correlacionada es la de 'education'