In [13]:
import pandas as pd
import statsmodels.api as sm
from dmba import stepwise_selection, AIC_score
from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv("./datasets/insurance.csv")
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [4]:
cleaned_data = pd.get_dummies(data, drop_first=True)

cleaned_data

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


In [5]:
cleaned_data = cleaned_data.rename(columns={'sex_male':'sex','smoker_yes':'smoker'})

In [6]:
cleaned_data.to_csv("./datasets/insurance_cleaned.csv", index=False)

In [7]:
outcome = 'charges'
predictors = ['age','bmi','children','smoker']

insurance_full_lm = sm.OLS(cleaned_data[outcome],cleaned_data[predictors])
results = insurance_full_lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,charges,R-squared (uncentered):,0.872
Model:,OLS,Adj. R-squared (uncentered):,0.872
Method:,Least Squares,F-statistic:,2277.0
Date:,"Wed, 31 Aug 2022",Prob (F-statistic):,0.0
Time:,19:04:45,Log-Likelihood:,-13629.0
No. Observations:,1338,AIC:,27270.0
Df Residuals:,1334,BIC:,27290.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,197.6732,11.589,17.058,0.000,174.939,220.407
bmi,28.0102,15.948,1.756,0.079,-3.276,59.296
children,240.5490,144.750,1.662,0.097,-43.414,524.512
smoker,2.331e+04,433.801,53.732,0.000,2.25e+04,2.42e+04

0,1,2,3
Omnibus:,277.987,Durbin-Watson:,2.07
Prob(Omnibus):,0.0,Jarque-Bera (JB):,640.617
Skew:,1.14,Prob(JB):,7.7899999999999995e-140
Kurtosis:,5.509,Cond. No.,126.0


## Cross validation
Se refiere a validar nuestro model en base a una parte de los datos (muestra) que no se usa para el entrenamiento.

El algoritmo más común es:
- K-fold cross validation

Se utiliza cuando hay muchos registros y también cuando existe incertidumbre.

Reto o actividad: Implementar k-fold

## Model selection y stepwise regression


In [8]:
car_data = pd.read_csv("./datasets/carprice.csv")
car_data = car_data.set_index("car_ID")

predictors = [
    #'car_ID', 
    'symboling', 
    #'CarName', 
    'fueltype', 
    'aspiration',
    'doornumber', 
    'carbody', 
    'drivewheel', 
    'enginelocation', 
    'wheelbase',
    'carlength', 
    'carwidth', 
    'carheight', 
    'curbweight', 
    'enginetype',
    'cylindernumber', 
    'enginesize', 
    'fuelsystem', 
    'boreratio', 
    'stroke',
    'compressionratio', 
    'horsepower',
    'peakrpm', 
    'citympg', 
    'highwaympg'
]

outcome = "price"


car_data['cylindernumber']=car_data['cylindernumber'].replace({'four':4,'six':6,'five':5,'three':4,'twelve':12,'two':2,'eight':8})
car_data['doornumber']=car_data['doornumber'].replace({'four':4,'two':2})

#car_data['enginetype']=car_data['enginetype'].replace({'dohc':1, 'ohcv':2, 'ohc':3, 'l':4, 'rotor':5, 'ohcf':6, 'dohcv':7})

#car_data['fuelsystem']=car_data['fuelsystem'].replace({'mpfi':1, '2bbl':2, 'mfi':3, '1bbl':4, 'spfi':5, '4bbl':6, 'idi':7, 'spdi':8})

#car_data['carbody']=car_data['carbody'].replace({'convertible':1, 'hatchback':2, 'sedan':3, 'wagon':4, 'hardtop':5})

# Reemplazar los valores
cleaned_car_data = pd.get_dummies(car_data[predictors], drop_first=True)
cleaned_car_data[outcome] = car_data[outcome]
cleaned_car_data = cleaned_car_data.rename(columns={"fueltype_gas":"fueltype","aspiration_turbo":"turbo"})
#cleaned_car_data
cleaned_car_data

Unnamed: 0_level_0,symboling,doornumber,wheelbase,carlength,carwidth,carheight,curbweight,cylindernumber,enginesize,boreratio,...,enginetype_ohcv,enginetype_rotor,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi,price
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,2,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,1,0,0,13495.0
2,3,2,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,1,0,0,16500.0
3,1,2,94.5,171.2,65.5,52.4,2823,6,152,2.68,...,1,0,0,0,0,0,1,0,0,16500.0
4,2,4,99.8,176.6,66.2,54.3,2337,4,109,3.19,...,0,0,0,0,0,0,1,0,0,13950.0
5,2,4,99.4,176.6,66.4,54.3,2824,5,136,3.19,...,0,0,0,0,0,0,1,0,0,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201,-1,4,109.1,188.8,68.9,55.5,2952,4,141,3.78,...,0,0,0,0,0,0,1,0,0,16845.0
202,-1,4,109.1,188.8,68.8,55.5,3049,4,141,3.78,...,0,0,0,0,0,0,1,0,0,19045.0
203,-1,4,109.1,188.8,68.9,55.5,3012,6,173,3.58,...,1,0,0,0,0,0,1,0,0,21485.0
204,-1,4,109.1,188.8,68.9,55.5,3217,6,145,3.01,...,0,0,0,0,1,0,0,0,0,22470.0


In [9]:
cleaned_car_data.columns

Index(['symboling', 'doornumber', 'wheelbase', 'carlength', 'carwidth',
       'carheight', 'curbweight', 'cylindernumber', 'enginesize', 'boreratio',
       'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg',
       'highwaympg', 'fueltype', 'turbo', 'carbody_hardtop',
       'carbody_hatchback', 'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd',
       'drivewheel_rwd', 'enginelocation_rear', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'fuelsystem_2bbl', 'fuelsystem_4bbl',
       'fuelsystem_idi', 'fuelsystem_mfi', 'fuelsystem_mpfi',
       'fuelsystem_spdi', 'fuelsystem_spfi', 'price'],
      dtype='object')

In [10]:
predictors = cleaned_car_data.columns
predictors = predictors.drop("price")
outcome = "price"

car_lm = sm.OLS(cleaned_car_data[outcome],cleaned_car_data[predictors])

results = car_lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.926
Model:,OLS,Adj. R-squared:,0.91
Method:,Least Squares,F-statistic:,56.45
Date:,"Wed, 31 Aug 2022",Prob (F-statistic):,1.82e-76
Time:,19:04:45,Log-Likelihood:,-1865.6
No. Observations:,205,AIC:,3807.0
Df Residuals:,167,BIC:,3934.0
Df Model:,37,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
symboling,344.4416,249.763,1.379,0.170,-148.658,837.542
doornumber,153.2709,315.604,0.486,0.628,-469.817,776.358
wheelbase,94.6250,104.395,0.906,0.366,-111.480,300.730
carlength,-47.0405,52.770,-0.891,0.374,-151.223,57.142
carwidth,612.8443,232.329,2.638,0.009,154.164,1071.524
carheight,270.2873,135.872,1.989,0.048,2.039,538.536
curbweight,4.0771,1.817,2.244,0.026,0.490,7.664
cylindernumber,1186.2134,691.166,1.716,0.088,-178.335,2550.762
enginesize,125.9415,27.158,4.637,0.000,72.324,179.559

0,1,2,3
Omnibus:,52.601,Durbin-Watson:,1.482
Prob(Omnibus):,0.0,Jarque-Bera (JB):,171.966
Skew:,1.017,Prob(JB):,4.55e-38
Kurtosis:,6.999,Cond. No.,782000.0


In [14]:
y = cleaned_car_data[outcome]
X = cleaned_car_data[predictors]

In [17]:
def train_model(variables):
    if len(variables)==0:
        return None
    
    model = LinearRegression()
    model.fit(X[variables],y)
    return model

def score_model(model, variables):
    if len(variables)==0:
        return AIC_score(y,[y.mean()]*len(y), model, df=1)
    
    return AIC_score(y, model.predict(X[variables]), model)


best_model, best_variables = stepwise_selection(X.columns,train_model,score_model,verbose=True)

Variables: symboling, doornumber, wheelbase, carlength, carwidth, carheight, curbweight, cylindernumber, enginesize, boreratio, stroke, compressionratio, horsepower, peakrpm, citympg, highwaympg, fueltype, turbo, carbody_hardtop, carbody_hatchback, carbody_sedan, carbody_wagon, drivewheel_fwd, drivewheel_rwd, enginelocation_rear, enginetype_dohcv, enginetype_l, enginetype_ohc, enginetype_ohcf, enginetype_ohcv, enginetype_rotor, fuelsystem_2bbl, fuelsystem_4bbl, fuelsystem_idi, fuelsystem_mfi, fuelsystem_mpfi, fuelsystem_spdi, fuelsystem_spfi
Start: score=4268.94, constant
Step: score=3974.82, add enginesize
Step: score=3948.19, add drivewheel_rwd
Step: score=3927.72, add enginelocation_rear
Step: score=3879.82, add carwidth
Step: score=3871.14, add peakrpm
Step: score=3867.03, add enginetype_ohcv
Step: score=3860.03, add stroke
Step: score=3849.06, add boreratio
Step: score=3836.56, add enginetype_rotor
Step: score=3828.80, add turbo
Step: score=3820.20, add enginetype_ohc
Step: score=

In [18]:
best_variables

['enginesize',
 'drivewheel_rwd',
 'enginelocation_rear',
 'carwidth',
 'peakrpm',
 'enginetype_ohcv',
 'stroke',
 'boreratio',
 'enginetype_rotor',
 'turbo',
 'enginetype_ohc',
 'carbody_hatchback',
 'fuelsystem_spdi',
 'carbody_hardtop',
 'carbody_wagon',
 'carbody_sedan',
 'carheight',
 'enginetype_ohcf',
 'enginetype_dohcv']

In [19]:
best_model_stats = sm.OLS(cleaned_car_data[outcome],cleaned_car_data[best_variables])

results = best_model_stats.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.974
Model:,OLS,Adj. R-squared (uncentered):,0.971
Method:,Least Squares,F-statistic:,365.8
Date:,"Wed, 31 Aug 2022",Prob (F-statistic):,2.7099999999999998e-136
Time:,19:17:15,Log-Likelihood:,-1894.8
No. Observations:,205,AIC:,3828.0
Df Residuals:,186,BIC:,3891.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
enginesize,198.0054,9.867,20.068,0.000,178.540,217.471
drivewheel_rwd,1928.1551,646.194,2.984,0.003,653.344,3202.967
enginelocation_rear,8003.4700,2178.048,3.675,0.000,3706.618,1.23e+04
carwidth,153.0289,119.127,1.285,0.201,-81.985,388.043
peakrpm,1.1579,0.450,2.571,0.011,0.269,2.047
enginetype_ohcv,-4760.5481,1141.113,-4.172,0.000,-7011.736,-2509.360
stroke,-5033.6682,811.217,-6.205,0.000,-6634.037,-3433.299
boreratio,-5349.1112,1171.042,-4.568,0.000,-7659.342,-3038.880
enginetype_rotor,1.183e+04,1669.249,7.089,0.000,8540.703,1.51e+04

0,1,2,3
Omnibus:,30.256,Durbin-Watson:,1.489
Prob(Omnibus):,0.0,Jarque-Bera (JB):,202.812
Skew:,0.122,Prob(JB):,9.119999999999999e-45
Kurtosis:,7.867,Cond. No.,82900.0


In [None]:
# Retirar fuel_system, 