In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Create some syntetic data
size = 10000

Year = np.random.randint(1998, high=2021, size=size, dtype=int)
LotArea = np.random.randint(50, high=2001, size=size, dtype=int)
nFlor = np.random.randint(1, high=11, size=size, dtype=int)
BedAbv = np.random.normal(loc=50.0, scale=5.0, size=size)
county = np.random.randint(1, high=51, size=size, dtype=int)

# Define a function to obtain prices for later testing the model
def fun_price(y,lot,flor,bed,county):    
    return np.sin(y/2020)*lot*flor + bed - county/2

def col_price(row):
    y = row.Year
    flor = row.nFlor
    lot = row.LotArea
    bed = row.BedAbv
    county = row.county
    return int(fun_price(y,lot,flor,bed,county))

home_data = pd.DataFrame({'Year':Year, 'LotArea':LotArea, 'nFlor':nFlor, 'BedAbv':BedAbv, 'county':county})
home_data['Price'] = home_data.apply(col_price,axis=1)
home_data

Unnamed: 0,Year,LotArea,nFlor,BedAbv,county,Price
0,2008,1405,10,51.211164,21,11818
1,2015,114,3,52.584165,1,339
2,2002,1230,9,50.226015,40,9291
3,2006,1934,7,54.193594,1,11394
4,2012,1125,3,47.380961,4,2878
...,...,...,...,...,...,...
9995,1999,1039,10,62.714423,34,8729
9996,2008,270,7,46.424750,20,1620
9997,2010,335,5,52.347775,24,1445
9998,2019,218,1,49.300942,46,209


In [12]:
iy = [10,100,120,1400]
inf = [123,234]
for i in iy:
    home_data.Year.iloc[i] = np.nan
for i in inf:
    home_data.nFlor.iloc[i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [17]:
print(home_data.shape)
print(home_data.isnull().sum())

(10000, 6)
Year       4
LotArea    0
nFlor      2
BedAbv     0
county     0
Price      0
dtype: int64


In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Regression Tree Model Fit
y = home_data.Price
feature_columns = ['Year', 'LotArea', 'nFlor', 'BedAbv', 'county']
X = home_data[feature_columns]
# split data into training and validation data, for both features and target
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state = 1)

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [23]:
# 1. Drop columns with missing values
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Fill in the lines below: drop columns in training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)

print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE (Drop columns with missing values):
2130.636296


In [27]:
# 2. Imputation

from sklearn.impute import SimpleImputer

# Fill in the lines below: imputation
# Imputation: “mean”, “median”, “most_frequent”, “constant”
# SimpleImputer(missing_values=np.nan, strategy='mean')
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE (Imputation):
15.042824
