In [189]:
# Importamos las librerias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_validate

from sklearn.impute import SimpleImputer

In [190]:
# Esta linea permite que los graficos sean renderizados directamente en nuestro Notebook
## Carguemos en un pandas dataframe nuestra base de datos
%matplotlib inline

# 1 Importar los datasets

In [191]:
train = pd.read_csv('../../house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../../house-prices-advanced-regression-techniques/test.csv')
y_test = pd.read_csv('../../house-prices-advanced-regression-techniques/sample_submission.csv')

In [192]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [193]:
train["train"] = 1
y_train = train["SalePrice"]
test["train"] = 0 

In [194]:
combined_datasets = pd.concat([train, test], axis=0)

In [195]:
print(train.shape)
print(test.shape)
print(combined_datasets.shape)

(1460, 82)
(1459, 81)
(2919, 82)


## Data processing

In [196]:
def clean_null_values(df):
    total = df.isnull().sum().sort_values(ascending=False)
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    
    #dealing with missing data
    df = df.drop((missing_data[missing_data['Total'] > 3]).index,1)

    df.isnull().sum().max() #just checking that there's no missing data missing...
    
    return df

def transform_numeric_and_categoric_values(df, important_features):
    num = df.dtypes != "object"
    num_cols = [c for c in num.index if num[c]]

    num_train = df[num_cols].copy()
    aux_train = num_train["train"]
    num_train = num_train[important_features]
    num_train = np.log1p(num_train)
    num_train["train"] = aux_train
    
    cat = df.dtypes == "object"
    
    cat_cols = [c for c in cat.index if cat[c]]
    cat_cols
    
    cat_train = df[cat_cols]
    cat_train_dummies = pd.get_dummies(cat_train, drop_first=True)
    df = pd.concat([num_train, cat_train_dummies], axis = 1)
    return df

def imputer_for_null_values(df):
    imputer = SimpleImputer(strategy="median")
    imputer.fit(df)
    X_trans = imputer.transform(df)
    return X_trans

In [197]:
# getted with function
important_features = ['LotArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF']

In [198]:
combined_datasets = clean_null_values(combined_datasets)
combined_datasets = transform_numeric_and_categoric_values(combined_datasets,important_features)

In [199]:
train_df = combined_datasets[combined_datasets["train"] == 1]
test_df = combined_datasets[combined_datasets["train"] == 0]
train_df.drop(["train"], axis=1, inplace=True)
test_df.drop(["train"], axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [200]:
train_df = imputer_for_null_values(train_df)
test_df = imputer_for_null_values(test_df)

In [201]:
from sklearn.ensemble import GradientBoostingRegressor
ensemble = GradientBoostingRegressor()
estimator = GradientBoostingRegressor(max_depth=8, min_samples_leaf=50,
                          min_samples_split=500, n_estimators=360,
                          random_state=10, subsample=0.8)
estimator.fit(train_df, y_train)

GradientBoostingRegressor(max_depth=8, min_samples_leaf=50,
                          min_samples_split=500, n_estimators=360,
                          random_state=10, subsample=0.8)

In [202]:
estimator.score(test_df,y_test["SalePrice"])

-18.3183494719876