In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

data = pd.read_csv('melb_data.csv')


def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [39]:
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [40]:
y = data.Price
X = data.drop(['Price'], axis=1)

In [41]:
num_features = X.select_dtypes(exclude='object')
categorical_features = X.select_dtypes(include='object')

In [42]:
data.shape

(13580, 21)

# Missing values numerical

In [43]:
num_features.isnull().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

In [44]:
man_inputed = num_features.copy()

In [45]:
miss_columns = [col for col in num_features.columns if num_features[col].isnull().any()]

In [46]:
for col in miss_columns:
    man_inputed[col].fillna(man_inputed[col].mean(), inplace=True)

In [47]:
man_inputed.isnull().sum()

Rooms            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
Lattitude        0
Longtitude       0
Propertycount    0
dtype: int64

In [48]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
autoimpute = pd.DataFrame(imputer.fit_transform(num_features))

In [49]:
autoimpute.columns = num_features.columns

# Categorical variables

In [50]:
categorical_features = categorical_features.drop(['CouncilArea'], axis=1)

In [51]:
categorical_features.describe(include=['O'])

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,Regionname
count,13580,13580,13580,13580,13580,13580,13580
unique,314,13378,3,5,268,58,8
top,Reservoir,1/1 Clarendon St,h,S,Nelson,27/05/2017,Southern Metropolitan
freq,359,3,9449,9022,1565,473,4695


In [52]:
low_cardinality_cols = [col for col in categorical_features.columns if categorical_features[col].nunique() < 10]

In [53]:
fina_category = categorical_features[low_cardinality_cols]

# Encoding categorical

In [54]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for col in fina_category.columns:
    fina_category[col] = label_encoder.fit_transform(fina_category[col])
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [55]:
fina_category.head()

Unnamed: 0,Type,Method,Regionname
0,0,1,2
1,0,1,2
2,0,3,2
3,0,0,2
4,0,4,2


In [56]:
data_1 = pd.concat([autoimpute, fina_category], axis=1)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(data_1, y, train_size=0.8, test_size=0.2)

In [58]:
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(X_train, X_test, y_train, y_test))

MAE from Approach 1 (Drop categorical variables):
174743.82430044183


In [59]:
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [60]:
one_hot = categorical_features[low_cardinality_cols].copy()

In [61]:
one_hote_data = pd.DataFrame(OH_encoder.fit_transform(one_hot))

In [62]:
one_hote_data.index = one_hot.index

In [63]:
data_2 = pd.concat([one_hote_data, fina_category], axis=1)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(data_2, y, train_size=0.8, test_size=0.2)

In [65]:
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(X_train, X_test, y_train, y_test))

MAE from Approach 1 (Drop categorical variables):
324347.92176058254


# Pipeline

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('melb_data.csv')

y = data.Price
X = data.drop(['Price'], axis=1)

X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(X, y, train_size=0.8, 
                                                                          test_size=0.2, random_state=0)

categorical_cols = [col for col in X_train_full.columns 
                    if X_train_full[col].nunique() < 10 and X_train_full[col].dtype == 'object']
categorical_cols

['Type', 'Method', 'Regionname']

In [75]:
numerical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']]
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [80]:
my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


#### 1. Define Preprocessing Steps

In [81]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [82]:
numerical_transformer = SimpleImputer(strategy = 'median')
categorical_transformer = Pipeline(steps=[('impoter', SimpleImputer(strategy = 'most_frequent')),
                                         ('oneht', OneHotEncoder(handle_unknown='ignore'))])

In [83]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    
    ])

In [84]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

In [85]:
from sklearn.metrics import mean_absolute_error
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', model)])

In [88]:
# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train_full)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid_full, preds)
print('MAE:', score)

MAE: 161879.65212497368
