# Cross Validation

1. Reservar 1/k de los datos como muestra reservada.
2. Entrenar el modelo con los datos restantes
3. Aplicar (puntuar) el modelo a la retención de 1/k y registrar las métricas de evaluación del modelo.
4. Restaurar el primer 1/k de los datos y reservar el siguiente 1/k (excluyendo los registros que se seleccionaron la primera vez)
5. Repetir los pasos 2 y 3
6. Repetir hasta que cada registro se haya utilizado en la parte reservada
7. Promediar o combinar las métricas de evaluación del modelo.

In [9]:
# Reto: Seleccionar las caracteristicas que creen que son más importantes y hacer el proceso de regresión lineal y obtener métricas e interpretar.

In [10]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from dmba.featureSelection import stepwise_selection

from dmba.metric import AIC_score

In [11]:
csv = pd.read_csv("../Datasets/house_sales.csv",delimiter="\t")
csv.drop(["DocumentDate","ym","PropertyType","SalePrice"], axis=1, inplace=True)

In [12]:
predictors = ['SqFtLot','Bedrooms','SqFtFinBasement','Bathrooms','NewConstruction']
outcome='AdjSalePrice'

In [13]:
csv[predictors]

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [14]:
data = pd.get_dummies(csv[predictors],drop_first=True)
data

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [27]:
data['NewConstruction'] = [1 if d else 0 for d in data['NewConstruction']]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['NewConstruction'] = [1 if d else 0 for d in data['NewConstruction']]


Unnamed: 0,PropertyID,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,SqFtTotLiving,SqFtFinBasement,Bathrooms,Bedrooms,BldgGrade,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,1000102,405100,0.930836,300805.0,2,9373,2400,0,3.00,6,7,0,0,70000,229000,98002,0
2,1200013,404400,0.929228,1076162.0,1,20156,3764,1452,3.75,4,10,0,0,203000,590000,98166,1
3,1200019,425600,0.977941,761805.0,1,26036,2060,900,1.75,4,8,0,0,183000,275000,98166,0
4,2800016,418400,0.961397,442065.0,1,8618,3200,1640,3.75,5,7,0,0,104000,229000,98168,0
5,2800024,351600,0.807904,297065.0,1,8620,1720,0,1.75,4,7,0,0,104000,205000,98168,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27057,9842300710,318700,0.732307,443803.0,1,5468,1480,590,1.75,3,7,0,0,201000,172000,98126,0
27058,9845500010,433500,0.996094,1586196.0,1,23914,4720,910,4.50,4,11,0,1,703000,951000,98040,0
27061,9899200010,325300,0.747472,220744.0,1,11170,1070,0,1.00,4,6,0,0,92000,130000,98055,0
27062,9900000355,400600,0.920496,342207.0,1,6223,1345,0,2.00,3,7,0,0,103000,212000,98166,0


In [28]:
# Entrenamiento con los datos
model = sm.OLS(csv[outcome],data.assign(const=1))
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,3.8330000000000005e+28
Date:,"Thu, 05 May 2022",Prob (F-statistic):,0.0
Time:,18:32:06,Log-Likelihood:,340280.0
No. Observations:,22687,AIC:,-680500.0
Df Residuals:,22670,BIC:,-680400.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PropertyID,-2.925e-20,1.73e-19,-0.169,0.865,-3.68e-19,3.09e-19
zhvi_px,-8.132e-17,1.36e-14,-0.006,0.995,-2.67e-14,2.66e-14
zhvi_idx,2.486e-18,4.84e-17,0.051,0.959,-9.24e-17,9.74e-17
AdjSalePrice,1.0000,3.56e-15,2.81e+14,0.000,1.000,1.000
NbrLivingUnits,5.325e-11,3.2e-09,0.017,0.987,-6.22e-09,6.33e-09
SqFtLot,2.138e-16,1.77e-14,0.012,0.990,-3.46e-14,3.5e-14
SqFtTotLiving,-4.891e-14,1.31e-12,-0.037,0.970,-2.61e-12,2.51e-12
SqFtFinBasement,4.698e-14,1.37e-12,0.034,0.973,-2.63e-12,2.72e-12
Bathrooms,2.098e-11,1.06e-09,0.020,0.984,-2.07e-09,2.11e-09

0,1,2,3
Omnibus:,6586.968,Durbin-Watson:,0.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24952.166
Skew:,-1.416,Prob(JB):,0.0
Kurtosis:,7.286,Cond. No.,1.18e+21


In [19]:

def train(variables):
    if len(variables)==0:
        return None
    model = LinearRegression()
    model.fit(csv[variables],csv[outcome])
    return model

def score_model(model,variables):
    if len(variables)==0:
        return None # Como obtener el AIC_score cuando no hay variables
    return AIC_score(csv[outcome],model.predict(csv[variables]),model)

model,variables = stepwise_selection(csv.columns,train_model=train,score_model=score_model,verbose=True,direction='backward')

variables

Variables: PropertyID, zhvi_px, zhvi_idx, AdjSalePrice, NbrLivingUnits, SqFtLot, SqFtTotLiving, SqFtFinBasement, Bathrooms, Bedrooms, BldgGrade, YrBuilt, YrRenovated, TrafficNoise, LandVal, ImpsVal, ZipCode, NewConstruction
Start: score=-956257.48, constant
Step: score=-992251.84, remove YrBuilt
Step: score=-992251.84, unchanged None


['PropertyID',
 'zhvi_px',
 'zhvi_idx',
 'AdjSalePrice',
 'NbrLivingUnits',
 'SqFtLot',
 'SqFtTotLiving',
 'SqFtFinBasement',
 'Bathrooms',
 'Bedrooms',
 'BldgGrade',
 'YrRenovated',
 'TrafficNoise',
 'LandVal',
 'ImpsVal',
 'ZipCode',
 'NewConstruction']

In [32]:
data2 = csv[variables]
data2['NewConstruction'] = [1 if d else 0 for d in data2['NewConstruction']]

model = sm.OLS(csv[outcome],data2.assign(const=1))
results = model.fit()

results.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['NewConstruction'] = [1 if d else 0 for d in data2['NewConstruction']]


0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,3.8330000000000005e+28
Date:,"Thu, 05 May 2022",Prob (F-statistic):,0.0
Time:,18:35:23,Log-Likelihood:,340280.0
No. Observations:,22687,AIC:,-680500.0
Df Residuals:,22670,BIC:,-680400.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PropertyID,-2.925e-20,1.73e-19,-0.169,0.865,-3.68e-19,3.09e-19
zhvi_px,-8.132e-17,1.36e-14,-0.006,0.995,-2.67e-14,2.66e-14
zhvi_idx,2.486e-18,4.84e-17,0.051,0.959,-9.24e-17,9.74e-17
AdjSalePrice,1.0000,3.56e-15,2.81e+14,0.000,1.000,1.000
NbrLivingUnits,5.325e-11,3.2e-09,0.017,0.987,-6.22e-09,6.33e-09
SqFtLot,2.138e-16,1.77e-14,0.012,0.990,-3.46e-14,3.5e-14
SqFtTotLiving,-4.891e-14,1.31e-12,-0.037,0.970,-2.61e-12,2.51e-12
SqFtFinBasement,4.698e-14,1.37e-12,0.034,0.973,-2.63e-12,2.72e-12
Bathrooms,2.098e-11,1.06e-09,0.020,0.984,-2.07e-09,2.11e-09

0,1,2,3
Omnibus:,6586.968,Durbin-Watson:,0.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24952.166
Skew:,-1.416,Prob(JB):,0.0
Kurtosis:,7.286,Cond. No.,1.18e+21
