# Desafío 2 Regresiones desde Machine Learning
Autor: Walther Becks

In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
plt.style.use('seaborn') # gráficos estilo seaborn
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams["figure.dpi"] = 200

## Desafío 1: Prepare el ambiente de trabajo

In [6]:
df = pd.read_csv('boston.csv')
df = df.drop('Unnamed: 0', axis=1)
df.describe()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Desafío 2: División de la muestra

In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:,'crim' : 'lstat']
    ,df.loc[:,'medv']
    ,test_size=.33
    ,random_state=11138)

In [8]:
x_test.shape

(167, 13)

In [9]:
x_test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
284,0.00906,90.0,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85
135,0.55778,0.0,21.89,0,0.624,6.335,98.2,2.1107,4,437,21.2,394.67,16.96
346,0.06162,0.0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67
163,1.51902,0.0,19.58,1,0.605,8.375,93.9,2.162,5,403,14.7,388.45,3.32
303,0.1,34.0,6.09,0,0.433,6.982,17.7,5.4917,7,329,16.1,390.43,4.86


In [10]:
y_train.shape

(339,)

In [11]:
y_train.head()

75     21.4
169    22.3
164    22.7
225    50.0
42     25.3
Name: medv, dtype: float64

## Desafío 3: Generación de modelos

In [12]:
train_1 = linear_model.LinearRegression(fit_intercept=True,normalize=True)
train_2 = linear_model.LinearRegression(fit_intercept=False,normalize=False)

In [13]:
train_1.fit(x_train,y_train)
train_2.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None,
         normalize=False)

In [14]:
train_1_yhat = train_1.predict(x_test)
train_2_yhat = train_2.predict(x_test)

## Desafío 4: Obtención de métricas

In [19]:
def report_scores(model, y_test):
    print(f"Error cuadrático promedio: {mean_squared_error(y_test,model).round(1)}")
    print(f"R2: {r2_score(y_test,model).round(3)}")

In [20]:
report_scores(train_1_yhat,y_test)

Error cuadrático promedio: 27.6
R2: 0.676


In [21]:
report_scores(train_2_yhat,y_test)

Error cuadrático promedio: 30.0
R2: 0.648


## Desafío 5: Refactorización del modelo


In [54]:
def fetch_features(df, excluded_variable='medv'):
    cols = df.columns
    attr_name = []
    pearson_r = []
    for i in cols:
        if i != excluded_variable:
            attr_name.append(i)
            pearson_r.append(df[i].corr(df[excluded_variable]))
            feat = pd.DataFrame({'attr': attr_name,
                                'abscore': list(map(lambda x: abs(x), pearson_r))})
            feat = feat.set_index('attr')
            feat = feat.sort_values(by='abscore', ascending=False)

    return feat

In [59]:
fetch_features(df).iloc[0:6,:]

Unnamed: 0_level_0,abscore
attr,Unnamed: 1_level_1
lstat,0.737663
rm,0.69536
ptratio,0.507787
indus,0.483725
tax,0.468536
nox,0.427321


In [35]:
q = df.corr()['medv'].rename_axis('vars').reset_index(name = 'corr')
q['abs_corr'] = q['corr'].apply(lambda x: abs(x))
q.sort_values(by='abs_corr',ascending=False,inplace=True)
q.iloc[1:,]

Unnamed: 0,vars,corr,abs_corr
12,lstat,-0.737663,0.737663
5,rm,0.69536,0.69536
10,ptratio,-0.507787,0.507787
2,indus,-0.483725,0.483725
9,tax,-0.468536,0.468536
4,nox,-0.427321,0.427321
0,crim,-0.388305,0.388305
8,rad,-0.381626,0.381626
6,age,-0.376955,0.376955
1,zn,0.360445,0.360445


## Desafío 6: Refactorización del modelo predictivo

In [79]:
vars_x = fetch_features(df).iloc[0:6].index.tolist()
vars_x

['lstat', 'rm', 'ptratio', 'indus', 'tax', 'nox']

In [80]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(df.loc[:,vars_x]
                                                       ,df.loc[:,'medv']
                                                       ,test_size=.33
                                                       ,random_state = 11238)

In [81]:
retrain_1 = linear_model.LinearRegression(fit_intercept=True,normalize=True)
retrain_1.fit(x_train2,y_train2)
retrain_1_yhat = retrain_1.predict(x_test2)

In [82]:
report_scores(retrain_1_yhat, y_test2)

Error cuadrático promedio: 37.5
R2: 0.512


## Desafío 7: Predicción de casos

In [83]:
worst_neighbor = np.array([37.9, 12.6, 3.5, 27.7, 187, 0.87]).reshape(1,-1)
best_neighbor = np.array([1.73, 22, 8.7, 0.46, 711, 0.38]).reshape(1,-1)

In [84]:
retrain_1.predict(worst_neighbor)

array([59.97435265])

In [85]:
retrain_1.predict(best_neighbor)

array([118.55978502])

In [90]:
print("El peor vecindario tiene una predicción de un precio que es aproximadamente la mitad del mejor barrio")

El peor vecindario tiene una predicción de un precio que es aproximadamente la mitad del mejor barrio
