In [1]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging 

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')

# Config

In [12]:
# columns to read

RANDOM_SEED = 42

DF_COLUMNS = ['id',
              'date',
              'month',
              'year',
              'quarter',
             'price',
             'postcode',
             'type',
             'new_build',
             'land',
             'primary_address',
             'secondary_address',
             'street',
             'latitude',
             'longitude',
             'grid_ref',
             'county',
             'district',
             'ward',
             'constituency',
             'region',
             'middle_layer_super_output_area',
             'postcode_area',
             'postcode_district',
             'nearest_station',
             'distance_to_station',
             'police_force',
             'water_company',
             'average_income',
             'sewage_company',
             'travel_to_work_area',
             'rural_urban',
             'altitude',
             'region_name',
             'area_code',
             'adjusted_price']

# categorical variables with NA in train set
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['primary_address', 'street', 'water_company']


CATEGORICAL_VARS_WITH_NA_MISSING = ['secondary_address', 'sewage_company']


# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = []


TEMPORAL_VARS = []


# variables to log transform
NUMERICALS_LOG_VARS = []


# onehot encode
ONE_HOT_VARS = ['type']

# categorical variables to encode
CATEGORICAL_VARS = [
            'district',
            'constituency',
            'postcode_district',]


FEATURES = ['district',
            'constituency',
            'postcode_district',
            'average_income',
            'type',
            'year',
           'price']

DROP_VARS = list(set(DF_COLUMNS) - set(FEATURES))

In [15]:
DROP_VARS.remove('price')

In [11]:
DROP_VARS

['ward',
 'latitude',
 'nearest_station',
 'water_company',
 'altitude',
 'adjusted_price',
 'quarter',
 'county',
 'area_code',
 'longitude',
 'middle_layer_super_output_area',
 'date',
 'distance_to_station',
 'street',
 'id',
 'region_name',
 'rural_urban',
 'secondary_address',
 'sewage_company',
 'primary_address',
 'postcode',
 'new_build',
 'grid_ref',
 'postcode_area',
 'police_force',
 'travel_to_work_area',
 'region',
 'price',
 'land',
 'month']

In [21]:
import yaml

dct_str = {'features': FEATURES }
print(yaml.dump(dct_str))

features:
- district
- constituency
- postcode_district
- average_income
- type
- year



In [22]:
# load dataset
data = pd.read_csv('../zanasonic/datasets/processed/pp_nottinghamshire.csv', usecols=DF_COLUMNS)

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(380989, 36)


Unnamed: 0,id,price,date,postcode,type,new_build,land,primary_address,secondary_address,street,latitude,longitude,grid_ref,county,district,ward,constituency,region,middle_layer_super_output_area,postcode_area,postcode_district,nearest_station,distance_to_station,police_force,water_company,average_income,sewage_company,travel_to_work_area,rural_urban,altitude,region_name,area_code,adjusted_price,year,month,quarter
0,{D489CA47-975B-40E3-B03C-E14855D19F1E},53000,1995-01-03,NG16 3GW,D,N,F,47,,CASTLE STREET,53.013391,-1.295037,SK473464,Nottinghamshire,Broxtowe,Eastwood Hilltop,Ashfield,East Midlands,Eastwood East,NG,NG16,Langley Mill,2.47545,Nottinghamshire,Severn Trent,39600.0,,Nottingham,Urban minor conurbation,106.0,Broxtowe,E07000172,237813,1995,1,1
1,{4B099008-EA0C-454B-9370-5C5C863E5794},28000,1995-01-03,NG8 6HL,T,N,F,67,,LINDFIELD ROAD,52.979664,-1.212121,SK529427,Nottinghamshire,Nottingham,Aspley,Nottingham North,East Midlands,Broxtowe & Cinderhill,NG,NG8,Bulwell,2.46669,Nottinghamshire,Severn Trent,25000.0,,Nottingham,Urban minor conurbation,88.0,City of Nottingham,E06000018,128563,1995,1,1
2,{8356569B-3F11-41DF-9529-B35DF5B3F745},67000,1995-01-03,NG23 6NX,D,N,F,GREENSLEEVES,,CASTLE HILL,53.169733,-0.820201,SK789642,Nottinghamshire,Newark and Sherwood,Sutton-on-Trent,Newark,East Midlands,"Muskham, Sutton on Trent & Walesby",NG,NG23,Collingham,5.45494,Nottinghamshire,Severn Trent,47800.0,,Lincoln,Rural village,17.0,Newark and Sherwood,E07000175,292041,1995,1,1
3,{1DD76457-0C65-41E2-AF97-6BEBC8E4B63C},33500,1995-01-03,NG8 4NA,S,N,F,20,,BURNSIDE ROAD,52.961959,-1.231961,SK516407,Nottinghamshire,Nottingham,Bilborough,Nottingham North,East Midlands,Bilborough South,NG,NG8,Ilkeston,4.64723,Nottinghamshire,Severn Trent,31900.0,,Nottingham,Urban minor conurbation,68.0,City of Nottingham,E06000018,154141,1995,1,1
4,{2678B602-4C74-462F-9ABE-249F7C8A19D9},45000,1995-01-03,NG19 0BT,S,N,F,109,,CLIPSTONE ROAD WEST,53.154774,-1.157879,SK564622,Nottinghamshire,Mansfield,Holly,Mansfield,East Midlands,Forest Town & Newlands,NG,NG19,Mansfield Town,3.0476,Nottinghamshire,Severn Trent,38300.0,,Mansfield,Urban city and town,123.0,Mansfield,E07000174,187467,1995,1,1


In [23]:

# Split the data - train, validation and test
train_set, test_set = train_test_split(data,
                                       test_size=0.30,
                                       random_state=RANDOM_SEED)

test_set, validation_set = train_test_split(test_set,
                                            test_size=0.20,
                                            random_state=RANDOM_SEED)

# # load the pre-selected features
# # ==============================

X_train = train_set.drop('price', axis = 1)
X_test = validation_set.drop('price', axis = 1)

y_train = train_set['price']
y_test = validation_set['price']

logging.info(f"Training shape: {train_set.shape}")
logging.info(f"Validation shape: {validation_set.shape}")
logging.info(f"Test shape: {test_set.shape}")

# Save the split files
# train_set.to_csv("../data/processed/train.csv", index=False)
# validation_set.to_csv("../data/processed/validation.csv", index=False)
# test_set.to_csv("../data/processed/test.csv", index=False)

2021-07-17 20:54:35,999:INFO:Training shape: (266692, 36)
2021-07-17 20:54:36,000:INFO:Validation shape: (22860, 36)
2021-07-17 20:54:36,001:INFO:Test shape: (91437, 36)


In [24]:
# set up the pipeline
transform_pipeline = Pipeline([

    # ===== IMPUTATION =====
    # impute categorical variables with string missing
    ('missing_imputation', CategoricalImputer(
        imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)),

    ('frequent_imputation', CategoricalImputer(
        imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)),

    # add missing indicator
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    # impute numerical variables with the mean
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA
    )),

    ('drop_features', DropFeatures(features_to_drop=DROP_VARS)),

    # ==== VARIABLE TRANSFORMATION =====
    #('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    ('one_hot_encode', OneHotEncoder(top_categories=None, variables=ONE_HOT_VARS, drop_last=True)),

    # == CATEGORICAL ENCODING
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=3, variables=CATEGORICAL_VARS
    )),

    # encode categorical and discrete variables using the target mean
    ('categorical_encoder', OrdinalEncoder(
         encoding_method='ordered', variables=CATEGORICAL_VARS)),
     ('scaler', MinMaxScaler())#,
     #('model', RandomForestRegressor())
#     ('selector', SelectKBest(score_func=f_regression, k=7))
    
])

In [None]:
DROP_VARS

In [25]:
transform_pipeline.fit(X_train, y_train)

Pipeline(steps=[('missing_imputation',
                 CategoricalImputer(variables=['secondary_address',
                                               'sewage_company'])),
                ('frequent_imputation',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['primary_address', 'street',
                                               'water_company'])),
                ('missing_indicator', AddMissingIndicator(variables=[])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='me...
                                                'travel_to_work_area', 'region',
                                                'land', 'month'])),
                ('one_hot_encode',
                 OneHotEncoder(drop_last=True, variables=['type'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=3, tol=0.01,
                                  variables=['district

In [26]:
X_train.columns

Index(['id', 'date', 'postcode', 'type', 'new_build', 'land',
       'primary_address', 'secondary_address', 'street', 'latitude',
       'longitude', 'grid_ref', 'county', 'district', 'ward', 'constituency',
       'region', 'middle_layer_super_output_area', 'postcode_area',
       'postcode_district', 'nearest_station', 'distance_to_station',
       'police_force', 'water_company', 'average_income', 'sewage_company',
       'travel_to_work_area', 'rural_urban', 'altitude', 'region_name',
       'area_code', 'adjusted_price', 'year', 'month', 'quarter'],
      dtype='object')

In [28]:
X_train = transform_pipeline.transform(X_train)
X_test = transform_pipeline.transform(X_test)

In [32]:
X_train.shape

(266692, 9)

In [18]:
# evaluate the model:
# ====================

# make predictions for train set
pred = transform_pipeline.predict(X_train)


# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()



train mse: 1973859429
train rmse: 44428
train r2: 0.7497090708946375



In [19]:
# make predictions for test set
pred = transform_pipeline.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

print('Average house price: ', int(y_train.median()))

test mse: 2238030752
test rmse: 47307
test r2: 0.7182510918084004

Average house price:  110000


In [None]:
feature_importance = transform_pipeline.named_steps['model'].feature_importances_

In [None]:
feature_importance.shape

In [None]:
X_train.shape

In [None]:
# what are scores for the features
features = X_train.columns
scores = list(feature_importance)

feature_column = []
feature_score = []
for i in range(len(scores)):
    feature_column.append(features[i])
    feature_score.append(scores[i])
df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)

In [None]:
df_feature.sort_values(by='score', ascending=False)

In [None]:
sorted(zip(feature_importance, features), reverse=True)

In [None]:
transform_pipeline_2.fit(X_train, y_train)

In [None]:
scores = list(transform_pipeline.named_steps['selector'].scores_)


In [None]:
scores_2 = list(transform_pipeline_2.named_steps['selector'].scores_)

In [None]:
X_train.columns[transform_pipeline.named_steps['selector'].get_support()].to_list()

In [None]:
# what are scores for the features
features = X_train.columns
feature_column = []
feature_score = []
for i in range(len(scores)):
    feature_column.append(features[i])
    feature_score.append(scores[i])
df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)

In [None]:
df_feature.sort_values(by='score')

In [None]:
X_train_2 = transform_pipeline_2.transform(X_train)
X_test_2 = transform_pipeline_2.transform(X_test)

# create scaler
scaler = MinMaxScaler()

#  fit  the scaler to the train set
scaler.fit(X_train_2)

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(
    scaler.transform(X_train_2),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test_2),
    columns=X_train.columns
)

In [None]:
X_train.head()

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from matplotlib import pyplot as plt


# # feature selection
def select_features(X_train, y_train):
    # configure to select all features
    fs = SelectKBest(score_func=f_regression, k=7)
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
#     X_train_fs = fs.transform(X_train)
    # transform test input data
#     X_test_fs = fs.transform(X_test)
    return fs

# features = list(X_train)
fs = select_features(X_train, y_train)

selected_features = X_train.columns[fs.get_support()].to_list()


scores_2 = list(fs.scores_)


# what are scores for the features
features = list(X_train.columns)
feature_column_2 = []
feature_score_2 = []
for i in range(len(scores_2)):
    feature_column.append(features[i])
    feature_score.append(scores_2[i])
df_feature_2 = pd.DataFrame({'score': feature_score_2}, index=feature_column_2)
df_feature_2
# import yaml

# dct_str = {'features': selected_features }
# print(dct_str)
# stream = open("../data/processed/features.yaml", "w")
# yaml.dump(dct_str, stream)


# # feature selection
# X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# new_features = X_train.columns[fs.get_support()]
# df_feature = pd.DataFrame({'feature': new_features})
# df_feature.to_csv("../data/processed/selected_features.csv", index=False)

# what are scores for the features
# feature_column = []
# feature_score = []
# for i in range(len(fs.scores_)):
#     feature_column.append(features[i])
#     feature_score.append(fs.scores_[i])
# df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)
# df_feature[['score']] = df_feature[['score']].apply(pd.to_numeric)
# df_feature = df_feature.sort_values(by='score')
# df_feature[df_feature['score'] > 3000].plot(kind='bar')
# plt.tight_layout()
# plt.show()
# plt.savefig('./reports/figures/feature_importance.png')
# df = df_feature[df_feature['score'] > 3000]
# df.index.names = ['feature']
# df.to_csv("./data/processed/selected_features.csv", index=True)

In [None]:
# what are scores for the features
features = list(X_train.columns)
feature_column_2 = []
feature_score_2 = []
for i in range(len(scores_2)):
    feature_column_2.append(features[i])
    feature_score_2.append(scores_2[i])
df_feature_2 = pd.DataFrame({'score': feature_score_2}, index=feature_column_2)
df_feature_2

In [None]:
df_feature_2.sort_values(by='score')

In [None]:
X_train.columns[fs.get_support()].to_list()

In [None]:
feature_column_2

In [None]:
X_train.columns

In [None]:
fs.scores_

In [None]:
import uuid
print (uuid.uuid4())

In [None]:
import os

## Get input ##
myfile= "../data/processed/features.yaml"

## Try to delete the file ##
try:
    os.remove(myfile)
except OSError as e:  ## if failed, report it back to the user ##
    print ("Error: %s - %s." % (e.filename, e.strerror))