# Server-based deployment 

In [1]:
import numpy as np  # for manipulation
import pandas as pd  # for data loading
import urllib.request  # for data downloading
import tarfile  # for extracting data

from sklearn.model_selection import StratifiedShuffleSplit  # for splitting data
from sklearn.preprocessing import StandardScaler  # for scaling the attributes
from sklearn.preprocessing import OneHotEncoder  # for handling categorical features
from sklearn.impute import SimpleImputer   # for handling missing data
from sklearn.linear_model import LinearRegression  # for creating model
from sklearn.metrics import mean_squared_error, r2_score  # for evaluation

import pickle  # for saving

# Custom class for combined attributes
class CombinedAttributesAdder():
    """Replace four raw count columns with three per-unit ratio columns.

    The column positions are read from the module-level names ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``, which must be
    defined before ``transform`` is called.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return this instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` without the four raw columns, followed by the ratios.

        The appended columns are, in order: rooms per household, population
        per household, and bedrooms per room.
        """
        # Pull out the raw columns before they are removed from the array.
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        population = X[:, population_ix]
        bedrooms = X[:, bedrooms_ix]

        rooms_ratio = rooms / households
        population_ratio = population / households
        bedrooms_ratio = bedrooms / rooms

        raw_columns = [households_ix, rooms_ix, population_ix, bedrooms_ix]
        remaining = np.delete(X, raw_columns, 1)

        return np.c_[remaining, rooms_ratio, population_ratio, bedrooms_ratio]
    
# class for data preprocessing    
class data_preprocessing():
    """Preprocessing pipeline for the housing feature frame.

    Numeric columns are median-imputed, passed through
    ``CombinedAttributesAdder`` and standardised; the ``ocean_proximity``
    column is one-hot encoded. The two results are concatenated column-wise,
    numeric features first.

    Input frames must contain an ``ocean_proximity`` column; every other
    column is treated as numeric.
    """

    def __init__(self):
        self.imputer = SimpleImputer(strategy="median")
        self.attr_add = CombinedAttributesAdder()
        self.stdscale = StandardScaler()
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword. Try the current spelling first and fall
        # back so older installations keep working.
        try:
            self.ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            self.ohe = OneHotEncoder(sparse=False)

    @staticmethod
    def _split_columns(X):
        """Return ``(numeric_frame, categorical_frame)`` taken from ``X``."""
        return X.drop("ocean_proximity", axis=1), X[["ocean_proximity"]]

    def _numeric_features(self, house_num):
        """Impute ``house_num`` with the fitted imputer and add the ratios."""
        imputed = pd.DataFrame(
            self.imputer.transform(house_num),
            columns=house_num.columns,
            index=house_num.index,
        )
        return self.attr_add.transform(imputed.values)

    def _assemble(self, numeric, house_cat):
        """Scale ``numeric``, encode ``house_cat`` and join them column-wise."""
        scaled = self.stdscale.transform(numeric)
        encoded = self.ohe.transform(house_cat)
        return np.concatenate([scaled, encoded], axis=1)

    def fit(self, X):
        """Fit every preprocessing step on ``X`` and return ``X`` transformed.

        Unlike the scikit-learn convention, this returns the transformed
        training array rather than ``self``.
        """
        house_num, house_cat = self._split_columns(X)

        # handle missing data
        self.imputer.fit(house_num)
        numeric = self._numeric_features(house_num)

        # the scaler is fitted on the features after the ratios are added
        self.stdscale.fit(numeric)

        # handle categorical input feature
        self.ohe.fit(house_cat)

        return self._assemble(numeric, house_cat)

    def transform(self, X):
        """Transform ``X`` with the imputer, scaler and encoder fitted by ``fit``.

        Nothing is re-fitted here, so ``fit`` must have been called first.
        """
        house_num, house_cat = self._split_columns(X)
        return self._assemble(self._numeric_features(house_num), house_cat)

    def savefittedobject(self):
        """Pickle the imputer, scaler and encoder into the working directory.

        Writes ``houseimputer.pkl``, ``housescaler.pkl`` and
        ``houseohencoder.pkl``. ``attr_add`` is not saved.
        """
        targets = (
            ('houseimputer.pkl', self.imputer),
            ('housescaler.pkl', self.stdscale),
            ('houseohencoder.pkl', self.ohe),
        )
        for filename, fitted in targets:
            # ``with`` guarantees the handle is flushed and closed.
            with open(filename, 'wb') as out_file:
                pickle.dump(fitted, out_file)
    
# load the dataset
url = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz"
urllib.request.urlretrieve(url, "housing.tgz")
# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall() trusts the member paths of a downloaded archive;
# consider an extraction filter where the running Python version supports one.
with tarfile.open("housing.tgz") as tar:
    tar.extractall()
housing = pd.read_csv("housing.csv")

# split the data, stratified on binned median income
housing["income_cat"] = pd.cut(housing["median_income"],
                              bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                              labels=[1, 2, 3, 4, 5])
split = StratifiedShuffleSplit(test_size=0.2, random_state=42)
# NOTE(review): n_splits is left at its default, so this loop may run more
# than once; only the split from the last iteration is kept.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# assign the training data
# NOTE(review): "income_cat" is not dropped, so it stays in the feature set.
train_housing = strat_train_set.drop("median_house_value", axis=1)
train_housing_labels = strat_train_set["median_house_value"].copy()

# get the column indices to be used in getting additional attributes
# (these module-level names are read by CombinedAttributesAdder.transform)
col_names = ["total_rooms", "total_bedrooms", "population", "households"]
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    train_housing.columns.get_loc(c) for c in col_names] # get the column indices

# preprocess the training data
house_preprocess = data_preprocessing()
data_X_train = house_preprocess.fit(train_housing)

# create the model
lin_reg = LinearRegression()

# train the model
lin_reg.fit(data_X_train, train_housing_labels)

# evaluate the model on training dataset (MSE, RMSE, R^2)
housing_predictions = lin_reg.predict(data_X_train)
lin_mse = mean_squared_error(train_housing_labels, housing_predictions)
lin_r2 = r2_score(train_housing_labels, housing_predictions)
print("Performance for Train dataset: ", lin_mse, np.sqrt(lin_mse), lin_r2)

# assign the test data
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

# preprocess the test data (reuses the steps fitted on the training data)
data_X_test = house_preprocess.transform(X_test)

# test the trained model on test data
final_predictions = lin_reg.predict(data_X_test)

# evaluate the model on test dataset (MSE, RMSE, R^2)
final_mse = mean_squared_error(y_test, final_predictions)
# Score the predictions already made, the same way as for the training set,
# instead of predicting a second time through lin_reg.score().
final_r2 = r2_score(y_test, final_predictions)
print("Final performance evaluation: ", final_mse, np.sqrt(final_mse),
      final_r2)

Performance for Train dataset:  5001073666.967282 70718.26968306904 0.6241096275422756
Final performance evaluation:  5315171527.142865 72905.22290716122 0.6021304523441476


In [2]:
# saving model as pickle file
# ``with`` closes the file once the model is written, so the pickle is fully
# flushed to disk before anything tries to load it.
with open('houseregressionmodel.pkl', 'wb') as model_file:
    pickle.dump(lin_reg, model_file)

In [3]:
# saving imputer, scaler and onehotencoder
# (writes houseimputer.pkl, housescaler.pkl and houseohencoder.pkl
#  to the current working directory)
house_preprocess.savefittedobject()