In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#--------------------------------------------------
# Create some syntetic data
size = 10000

Year = np.random.randint(1998, high=2021, size=size, dtype=int)
LotArea = np.random.randint(50, high=2001, size=size, dtype=int)
nFlor = np.random.randint(1, high=11, size=size, dtype=int)
BedAbv = np.random.normal(loc=50.0, scale=5.0, size=size)
county = np.random.randint(1, high=51, size=size, dtype=int)

# Syntetic categorical data
locs = np.array(['north', 'south', 'center','east','west'])
Location = np.tile(locs, int(size/5))
np.random.shuffle(Location)

neigh = np.array(['Croxton','Croxton East','Dennis','Merri','Northcote South','Ruckers Hill','Westgarth',
'Preston','Bell','Darebin Park','Gilberton','Northland','Preston Lake','Regent','Regent West','South Preston',
'Sylvester','Reservoir','Gilbank','Regent','Reservoir North','Ruthven','Summerhill','Thornbury','Thornbury North'])
Neighborhood = np.tile(neigh, int(size/25))
np.random.shuffle(Location)

#--------------------------------------------------
# Define a function to obtain prices for later testing the model
def fun_price(y,lot,flor,bed,county):    
    return np.sin(y/2020)*lot*flor + bed - county/2

def col_price(row):
    y = row.Year
    flor = row.nFlor
    lot = row.LotArea
    bed = row.BedAbv
    county = row.county
    return int(fun_price(y,lot,flor,bed,county))

home_data = pd.DataFrame({'Year':Year, 'LotArea':LotArea, 'nFlor':nFlor,'BedAbv':BedAbv, 
                          'county':county, 'Location':Location,'Neighborhood':Neighborhood})
home_data['Price'] = home_data.apply(col_price,axis=1)
#--------------------------------------------------
# Syntetics NaN
iy = [10,100,120,1400]
inf = [123,234]
for i in iy:
    home_data.Year.iloc[i] = np.nan
for i in inf:
    home_data.nFlor.iloc[i] = np.nan
    
home_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,Location,Neighborhood,Price
0,2015.0,1898,2.0,42.653353,25,west,Croxton,3219
1,2017.0,794,1.0,46.266266,50,south,Croxton East,688
2,2000.0,1740,2.0,45.756677,49,west,Dennis,2930
3,2000.0,1043,10.0,50.615652,48,east,Merri,8746
4,2016.0,1119,2.0,46.949269,11,south,Northcote South,1922
...,...,...,...,...,...,...,...,...
9995,1999.0,373,9.0,44.973070,3,south,Reservoir North,2849
9996,2011.0,373,8.0,54.609748,7,north,Ruthven,2554
9997,2004.0,484,4.0,51.388192,36,west,Summerhill,1654
9998,2015.0,1823,7.0,43.907255,23,south,Thornbury,10753


In [28]:
##################################################
#######    Prepare data
##################################################

#--------------------------------------------------------------------------------------------------
# Remove rows with missing target, separate target from predictors
X_full = home_data
X_full.dropna(axis=0, subset=['Price'], inplace=True)

y = X_full.Price
X_full.drop(['Price'], axis=1, inplace=True)

#--------------------------------------------------------------------------------------------------
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

#--------------------------------------------------------------------------------------------------
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

#--------------------------------------------------------------------------------------------------
# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = numerical_cols + categorical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

X_train

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,Location
7389,2008.0,1713,2.0,47.764157,31,east
9275,2008.0,472,3.0,50.789410,15,west
2995,2016.0,1714,3.0,50.324580,40,south
5316,2009.0,194,6.0,55.203330,42,east
356,2020.0,1010,5.0,43.348395,27,north
...,...,...,...,...,...,...
9225,2010.0,956,7.0,47.297350,42,west
4859,1998.0,505,4.0,56.829622,24,center
3264,2003.0,1057,10.0,47.087097,48,south
9845,2006.0,1384,10.0,45.917230,9,center


In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

#--------------------------------------------------------------
# Preprocessing for numerical data (fill NaNs)
numerical_transformer = SimpleImputer(strategy='constant')

#--------------------------------------------------------------
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#--------------------------------------------------------------
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


#--------------------------------------------------------------
#--------------------------------------------------------------
# Define model
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])
#--------------------------------------------------------------
# Preprocessing of training data, fit model 
my_pipeline_model.fit(X_train, y_train)
# Preprocessing of validation data, get predictions
preds = my_pipeline_model.predict(X_valid)



#--------------------------------------------------------------
# X_train keeps its structure!
#--------------------------------------------------------------



print('MAE:', mean_absolute_error(y_valid, preds))

MAE: 15.963484999999993
