In [3]:
# data manipulation and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging 

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')

# Config

In [1]:
# columns to read

RANDOM_SEED = 42


# # categorical variables with NA in train set
# CATEGORICAL_VARS_WITH_NA_FREQUENT = []


# CATEGORICAL_VARS_WITH_NA_MISSING = []


# # numerical variables with NA in train set
# NUMERICAL_VARS_WITH_NA = []


# TEMPORAL_VARS = []


# # variables to log transform
# NUMERICALS_LOG_VARS = []


# onehot encode
ONE_HOT_VARS = ['type']

# categorical variables to encode
CATEGORICAL_VARS = [
            'district',
            'constituency',
            'postcode_district',]


FEATURES = ['district',
            'constituency',
            'postcode_district',
            'average_income',
            'type',
            'year',
           'price']

DROP_VARS = []

In [None]:
DROP_VARS.remove('price')

In [None]:
DROP_VARS

In [None]:
X_test

In [None]:
import yaml

dct_str = {'features': FEATURES }
print(yaml.dump(dct_str))

In [4]:
# load dataset
data = pd.read_csv('../zanasonic/datasets/processed/pp_nottinghamshire.csv', usecols=FEATURES)

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(380989, 7)


Unnamed: 0,price,type,district,constituency,postcode_district,average_income,year
0,53000,D,Broxtowe,Ashfield,NG16,39600.0,1995
1,28000,T,Nottingham,Nottingham North,NG8,25000.0,1995
2,67000,D,Newark and Sherwood,Newark,NG23,47800.0,1995
3,33500,S,Nottingham,Nottingham North,NG8,31900.0,1995
4,45000,S,Mansfield,Mansfield,NG19,38300.0,1995


In [6]:

# Split the data - train, validation and test
train_set, test_set = train_test_split(data,
                                       test_size=0.25,
                                       random_state=RANDOM_SEED)

# test_set, validation_set = train_test_split(test_set,
#                                             test_size=0.20,
#                                             random_state=RANDOM_SEED)

# # load the pre-selected features
# # ==============================

X_train = train_set.drop('price', axis = 1)
X_test = test_set.drop('price', axis = 1)

y_train = train_set['price']
y_test = test_set['price']

logging.info(f"Training shape: {train_set.shape}")
# logging.info(f"Validation shape: {validation_set.shape}")
logging.info(f"Test shape: {test_set.shape}")

# Save the split files
# train_set.to_csv("../data/processed/train.csv", index=False)
# validation_set.to_csv("../data/processed/validation.csv", index=False)
# test_set.to_csv("../data/processed/test.csv", index=False)

2021-07-17 21:20:51,556:INFO:Training shape: (285741, 7)
2021-07-17 21:20:51,557:INFO:Test shape: (95248, 7)


In [8]:
# set up the pipeline
transform_pipeline = Pipeline([

    # ==== VARIABLE TRANSFORMATION =====
    #('log', LogTransformer(variables=NUMERICALS_LOG_VARS)),
    
    ('one_hot_encode', OneHotEncoder(top_categories=None, variables=ONE_HOT_VARS, drop_last=True)),

    # == CATEGORICAL ENCODING
    ('rare_label_encoder', RareLabelEncoder(
        tol=0.01, n_categories=3, variables=CATEGORICAL_VARS
    )),

    # encode categorical and discrete variables using the target mean
    ('categorical_encoder', OrdinalEncoder(
         encoding_method='ordered', variables=CATEGORICAL_VARS)),
     ('scaler', MinMaxScaler()),
     ('model', RandomForestRegressor())
    
])

In [None]:
DROP_VARS

In [9]:
transform_pipeline.fit(X_train, y_train)

Pipeline(steps=[('one_hot_encode',
                 OneHotEncoder(drop_last=True, variables=['type'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=3, tol=0.01,
                                  variables=['district', 'constituency',
                                             'postcode_district'])),
                ('categorical_encoder',
                 OrdinalEncoder(variables=['district', 'constituency',
                                           'postcode_district'])),
                ('scaler', MinMaxScaler()),
                ('model', RandomForestRegressor())])

In [None]:
X_train.columns

In [None]:
X_train = transform_pipeline.transform(X_train)
X_test = transform_pipeline.transform(X_test)

In [None]:
X_train.shape

In [10]:
# evaluate the model:
# ====================

# make predictions for train set
pred = transform_pipeline.predict(X_train)


# determine mse, rmse and r2
print('train mse: {}'.format(int(
    mean_squared_error(y_train, pred))))
print('train rmse: {}'.format(int(
    mean_squared_error(y_train, pred, squared=False))))
print('train r2: {}'.format(
    r2_score(y_train, pred)))
print()



train mse: 1979912832
train rmse: 44496
train r2: 0.7487038046423148



In [11]:
# make predictions for test set
pred = transform_pipeline.predict(X_test)

# determine mse, rmse and r2
print('test mse: {}'.format(int(
    mean_squared_error(y_test, pred))))
print('test rmse: {}'.format(int(
    mean_squared_error(y_test, pred, squared=False))))
print('test r2: {}'.format(
    r2_score(y_test, pred)))
print()

print('Average house price: ', int(y_train.median()))

test mse: 2300856678
test rmse: 47967
test r2: 0.7108408091626419

Average house price:  110000


In [None]:
feature_importance = transform_pipeline.named_steps['model'].feature_importances_

In [None]:
feature_importance.shape

In [None]:
X_train.shape

In [None]:
# what are scores for the features
features = X_train.columns
scores = list(feature_importance)

feature_column = []
feature_score = []
for i in range(len(scores)):
    feature_column.append(features[i])
    feature_score.append(scores[i])
df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)

In [None]:
df_feature.sort_values(by='score', ascending=False)

In [None]:
sorted(zip(feature_importance, features), reverse=True)

In [None]:
transform_pipeline_2.fit(X_train, y_train)

In [None]:
scores = list(transform_pipeline.named_steps['selector'].scores_)


In [None]:
scores_2 = list(transform_pipeline_2.named_steps['selector'].scores_)

In [None]:
X_train.columns[transform_pipeline.named_steps['selector'].get_support()].to_list()

In [None]:
# what are scores for the features
features = X_train.columns
feature_column = []
feature_score = []
for i in range(len(scores)):
    feature_column.append(features[i])
    feature_score.append(scores[i])
df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)

In [None]:
df_feature.sort_values(by='score')

In [None]:
X_train_2 = transform_pipeline_2.transform(X_train)
X_test_2 = transform_pipeline_2.transform(X_test)

# create scaler
scaler = MinMaxScaler()

#  fit  the scaler to the train set
scaler.fit(X_train_2)

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(
    scaler.transform(X_train_2),
    columns=X_train.columns
)

X_test = pd.DataFrame(
    scaler.transform(X_test_2),
    columns=X_train.columns
)

In [None]:
X_train.head()

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from matplotlib import pyplot as plt


# # feature selection
def select_features(X_train, y_train):
    # configure to select all features
    fs = SelectKBest(score_func=f_regression, k=7)
    # learn relationship from training data
    fs.fit(X_train, y_train)
    # transform train input data
#     X_train_fs = fs.transform(X_train)
    # transform test input data
#     X_test_fs = fs.transform(X_test)
    return fs

# features = list(X_train)
fs = select_features(X_train, y_train)

selected_features = X_train.columns[fs.get_support()].to_list()


scores_2 = list(fs.scores_)


# what are scores for the features
features = list(X_train.columns)
feature_column_2 = []
feature_score_2 = []
for i in range(len(scores_2)):
    feature_column.append(features[i])
    feature_score.append(scores_2[i])
df_feature_2 = pd.DataFrame({'score': feature_score_2}, index=feature_column_2)
df_feature_2
# import yaml

# dct_str = {'features': selected_features }
# print(dct_str)
# stream = open("../data/processed/features.yaml", "w")
# yaml.dump(dct_str, stream)


# # feature selection
# X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# new_features = X_train.columns[fs.get_support()]
# df_feature = pd.DataFrame({'feature': new_features})
# df_feature.to_csv("../data/processed/selected_features.csv", index=False)

# what are scores for the features
# feature_column = []
# feature_score = []
# for i in range(len(fs.scores_)):
#     feature_column.append(features[i])
#     feature_score.append(fs.scores_[i])
# df_feature = pd.DataFrame({'score': feature_score}, index=feature_column)
# df_feature[['score']] = df_feature[['score']].apply(pd.to_numeric)
# df_feature = df_feature.sort_values(by='score')
# df_feature[df_feature['score'] > 3000].plot(kind='bar')
# plt.tight_layout()
# plt.show()
# plt.savefig('./reports/figures/feature_importance.png')
# df = df_feature[df_feature['score'] > 3000]
# df.index.names = ['feature']
# df.to_csv("./data/processed/selected_features.csv", index=True)

In [None]:
# what are scores for the features
features = list(X_train.columns)
feature_column_2 = []
feature_score_2 = []
for i in range(len(scores_2)):
    feature_column_2.append(features[i])
    feature_score_2.append(scores_2[i])
df_feature_2 = pd.DataFrame({'score': feature_score_2}, index=feature_column_2)
df_feature_2

In [None]:
df_feature_2.sort_values(by='score')

In [None]:
X_train.columns[fs.get_support()].to_list()

In [None]:
feature_column_2

In [None]:
X_train.columns

In [None]:
fs.scores_

In [None]:
import uuid
print (uuid.uuid4())

In [None]:
import os

## Get input ##
myfile= "../data/processed/features.yaml"

## Try to delete the file ##
try:
    os.remove(myfile)
except OSError as e:  ## if failed, report it back to the user ##
    print ("Error: %s - %s." % (e.filename, e.strerror))