In [130]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [131]:
df = pd.read_csv("melb_data.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [132]:
x = df.drop('Price', axis=1)
y = df['Price']

In [133]:
x.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Method', 'SellerG', 'Date',
       'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude',
       'Regionname', 'Propertycount'],
      dtype='object')

In [134]:
x = x.fillna(method='bfill')
x.isnull().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
YearBuilt           0
CouncilArea      1367
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [135]:
import category_encoders as ce
encoder = ce.OrdinalEncoder()
x = encoder.fit_transform(x)

In [136]:
from sklearn.model_selection import train_test_split

# Split the dataset into 30% test and 70% training
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [137]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(x_train, y_train)

LinearRegression()

In [138]:
from sklearn.metrics import r2_score
y_pred = lr.predict(x_test)
print("r2 score: ",r2_score(y_test, y_pred))
print(lr.score(x_train, y_train))

r2 score:  0.5478171085800607
0.5691280102063156


In [139]:
from sklearn.feature_selection import SequentialFeatureSelector
sfs = SequentialFeatureSelector(lr, n_features_to_select=2, direction='backward')
sfs.fit(x_train, y_train)

SequentialFeatureSelector(direction='backward', estimator=LinearRegression(),
                          n_features_to_select=2)

In [140]:
feature_names = x.columns
print('The most important features based on random forest classifier:')
for feature_list_index in sfs.get_support(indices=True):
    print('- ' + feature_names[feature_list_index])

The most important features based on random forest classifier:
- Rooms
- Distance


In [141]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
selector = SelectFromModel(estimator=RandomForestRegressor())
selector.fit(x_train, y_train)

SelectFromModel(estimator=RandomForestRegressor())

In [142]:
print('The most important features based on random forest classifier:')
for feature_list_index in selector.get_support(indices=True):
    print('- ' + feature_names[feature_list_index])

The most important features based on random forest classifier:
- Rooms
- Type
- Distance
- Postcode
- Bathroom
- Landsize


In [143]:
selector.threshold_

0.05

In [144]:
x_imp_train = selector.transform(x_train)
x_imp_test = selector.transform(x_test)

# Print the results
print('Number of features before transformation: {}'.format(x_train.shape[1]))
print('Number of features after transformation: {}'.format(x_imp_train.shape[1]))

Number of features before transformation: 20
Number of features after transformation: 6


In [145]:
from sklearn.metrics import accuracy_score

In [146]:
rfr_full = RandomForestRegressor()

rfr_full.fit(x_train, y_train)

# Make predictions
pred_full = rfr_full.predict(x_test)

# Generate accuracy score
print("r2 score: ",r2_score(y_test, pred_full))

r2 score:  0.7697417426254691


In [147]:
rfr_lim = RandomForestRegressor()

# Train the classifier with limited features
rfr_lim.fit(x_imp_train, y_train)

# Make predictions
pred_lim = rfr_lim.predict(x_imp_test)

print("r2 score: ",r2_score(y_test, pred_lim))

r2 score:  0.7293941076689135


In [148]:
from mlxtend.feature_selection import SequentialFeatureSelector
sel = SequentialFeatureSelector(estimator=lr,k_features=(1,20), forward=True)
sel.fit(x_train, y_train)

SequentialFeatureSelector(estimator=LinearRegression(), k_features=(1, 20))

In [149]:
sel.k_feature_names_

('Suburb',
 'Address',
 'Rooms',
 'Type',
 'Method',
 'SellerG',
 'Distance',
 'Postcode',
 'Bathroom',
 'Car',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Regionname')

In [150]:
sel.k_score_

0.5645593409742389

In [151]:
sel.k_feature_idx_

(0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 14, 16, 17, 18)

In [160]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
efs = ExhaustiveFeatureSelector(estimator=LinearRegression(), min_features=1, max_features=2, print_progress=True)
efs = efs.fit(x_train, y_train)
print(efs.best_feature_names_)

Features: 210/210

UnboundLocalError: local variable 'best_subset' referenced before assignment

In [156]:
efs.best_feature_names_