In [1]:
# codigo concentrado del proyecto

In [2]:
# Importamos librerias
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

In [47]:
def estandarizacion_informacion(data):
    # eliminamos los NA que determinamos
    df = data
    df = df.dropna(subset=['Electrical'])
    # rellenamos los NA con 0
    df = df.fillna(0)

    
    # Funcion para obtener si la casa ah sido remodelada o no
    df['Remodelacion'] = df.YearRemodAdd == df.YearBuilt
    df['Remodelacion'] = df['Remodelacion'].astype('str')
    df['Remodelacion'] = df['Remodelacion'].replace({'True':0,'False':1})

    # Funcion para la obtencion de los años de construccion de la casa y garage
    df['AñosConstCasa'] = 2021 - df['YearBuilt'] 
    df['AñosConstGar']= 2021 - df['GarageYrBlt']
    df['AñosConstGar'] = df['AñosConstGar'].replace({2021:0})

        
    # Obtencion de dummies
    dummies = ['MSZoning','Street','Alley','LandContour','LotConfig','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','Heating','CentralAir','Electrical','GarageType','PavedDrive','MiscFeature','SaleType','SaleCondition']
    df = pd.get_dummies(data=df,columns=dummies)

    # Tratamiento de datos de calificacion a numero
    # Revisando las descripciones en el diccionario, vemos que para todas estas variables califican alguna de las caracteristicas de la casa
    # con lo que nos permite asignarle una calificacion de menor a mayor, dependiendo de cada una de las variables
    columnas_valores_reempl = ['BsmtFinType2','HeatingQC','KitchenQual','Functional','FireplaceQu','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','LotShape','Utilities','LandSlope','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1']
    diccionario = {'Reg':3,'IR1':2,'IR2':1,'IR3':0,'Typ':8,'Min1':7,'Min2':6,'Mod':5,'Maj1':4,'Maj2':3,'Sev':2,'Sal':1,'AllPub':4,'NoSewr':3,'NoSeWa':2,'ELO':1,'Gtl':2,'Mod':1,'Sev':0,'Fin':3,'RFn':2,'Unf':1,'NA':0,'Gd':4,'Av':3,'Mn':2,'No':1,'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0,'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1}

    for columna in columnas_valores_reempl:
        df[str(columna)] = df[str(columna)].replace(diccionario)

    # Asegurarnos de que los datos estén en formato numero
    for i in df.columns:
        df[i].astype('int64')
    
    # Reduccion del numero de columnas
    # Baño:
    # para reducir un poco el numero de columnas, haremos algunas agrupaciones
    # Sumaremos el numero total de baños completos
    df['TotBaños'] = df['BsmtFullBath']+df['FullBath']
    # Sumaremos el numero total de medios baños
    df['TotMedBaño']=df['BsmtHalfBath']+df['HalfBath']
    # El sotano tiene varias medidas, por lo que trataremos de crear una sola variable que las pueda explicar todas
    # vamos a multiplicar la calificion del sotano por la de tamaño correspondiente
    df['Bsmt1Cal'] = df['BsmtFinType1']*df['BsmtFinSF1']
    df['Bsmt2Cal'] = df['BsmtFinType2']*df['BsmtFinSF2']
    # por ultimo, vamos a sacar un promedio de calificacion del sotano, dividiendolo entre el total de tamaño del sotano
    df['BsmtCalTot'] = ((df['Bsmt1Cal']+df['Bsmt2Cal']+df['BsmtQual']+df['BsmtCond']+df['BsmtExposure'])/5)/(df['TotalBsmtSF'])

    # Garage:
    # Ahora Trabajamos con la de Garage
    df['GaragCal'] = ((df['GarageFinish']+df['GarageQual']+df['GarageCond'])/3)/df['GarageArea']

    # Cocina:
    # Creamos la calificacion de la cocina
    df['KitchCal'] = df['KitchenAbvGr']*df['KitchenQual']

    # Creamos la calificacion de Chimenea
    df['FireplCal'] = df['Fireplaces']+df['FireplaceQu']

    # Creamos la calificacion de la Alberca
    df['PoolCal'] = (df['PoolArea']*df['PoolQC'])/2
    
    # Nos aseguramos de no tener valores nulos
    df['BsmtCalTot'] = df['BsmtCalTot'].replace(np.nan,0)
    df['GaragCal']=df['GaragCal'].replace(np.nan,0)
    
    # Pendiente eliminar estos datos
    #['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2','TotalBsmtSF']
    #Columnas a eliminar restantes de todos los procesos anteriores.
    df = df.drop(['Id','KitchenAbvGr','PoolArea','PoolQC','KitchenQual','Fireplaces','FireplaceQu','GarageFinish','GarageQual','GarageCond','GarageArea','MSSubClass','MoSold','YrSold','YearRemodAdd','YearBuilt','GarageYrBlt','BsmtFullBath','FullBath','BsmtHalfBath','HalfBath','TotRmsAbvGrd','BsmtUnfSF','LowQualFinSF','GrLivArea'],axis=1)
    
    return df

def regresion_lineal(df):
    # generamos las variables
    
    # Pendiente separar los datos
    y = df['SalePrice']
    x1 = df.drop('SalePrice',axis=1)

    # Agregamos la constante
    X = sm.add_constant(x1)

    # Generamos el modelo
    linreg_stats = sm.OLS(y, X).fit()

    return linreg_stats.summary()

In [48]:
df = pd.read_csv('train.csv')


In [49]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [50]:
df = estandarizacion_informacion(df)

In [51]:
df

Unnamed: 0,LotFrontage,LotArea,LotShape,Utilities,LandSlope,OverallQual,OverallCond,MasVnrArea,ExterQual,ExterCond,...,SaleCondition_Partial,TotBaños,TotMedBaño,Bsmt1Cal,Bsmt2Cal,BsmtCalTot,GaragCal,KitchCal,FireplCal,PoolCal
0,65.0,8450,3,4,2,7,5,196.0,3,2,...,0,3,1,4236,0,0.991121,0.003650,3,0,0.0
1,80.0,9600,3,4,2,6,8,0.0,2,2,...,0,2,1,4890,0,0.776228,0.004348,2,3,0.0
2,68.0,11250,2,4,2,7,5,162.0,3,2,...,0,3,1,2916,0,0.635435,0.003289,3,3,0.0
3,60.0,9550,2,4,2,7,5,0.0,2,2,...,0,2,0,1080,0,0.287302,0.002596,3,4,0.0
4,84.0,14260,2,4,2,8,5,350.0,3,2,...,0,3,1,3930,0,0.687860,0.002392,3,3,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,3,4,2,6,5,0.0,2,2,...,0,2,1,0,0,0.001259,0.004348,2,3,0.0
1456,85.0,13175,3,4,2,6,6,119.0,2,2,...,0,3,0,3950,489,0.576524,0.003333,2,4,0.0
1457,66.0,9042,3,4,2,7,9,0.0,4,3,...,0,2,0,1650,0,0.287500,0.007937,3,5,0.0
1458,68.0,9717,3,4,2,5,6,0.0,2,2,...,0,2,0,294,3087,0.628386,0.006944,3,0,0.0


In [52]:
regresion_lineal(df)

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.918
Model:,OLS,Adj. R-squared:,0.906
Method:,Least Squares,F-statistic:,74.26
Date:,"Fri, 22 Jan 2021",Prob (F-statistic):,0.0
Time:,14:36:39,Log-Likelihood:,-16707.0
No. Observations:,1459,AIC:,33800.0
Df Residuals:,1267,BIC:,34810.0
Df Model:,191,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.455e+04,1.1e+04,-4.968,0.000,-7.61e+04,-3.3e+04
LotFrontage,16.1453,23.751,0.680,0.497,-30.450,62.740
LotArea,0.5568,0.095,5.882,0.000,0.371,0.742
LotShape,-624.8191,1452.367,-0.430,0.667,-3474.129,2224.491
Utilities,1.882e+04,1.36e+04,1.381,0.168,-7918.944,4.56e+04
LandSlope,-1100.1575,3640.458,-0.302,0.763,-8242.147,6041.833
OverallQual,8176.4507,1056.764,7.737,0.000,6103.252,1.02e+04
OverallCond,5388.6833,848.331,6.352,0.000,3724.394,7052.972
MasVnrArea,32.1318,6.023,5.335,0.000,20.315,43.949

0,1,2,3
Omnibus:,443.139,Durbin-Watson:,1.89
Prob(Omnibus):,0.0,Jarque-Bera (JB):,10635.21
Skew:,0.849,Prob(JB):,0.0
Kurtosis:,16.117,Cond. No.,1.35e+17
