In [13]:
# Location of the training CSV (presumably produced by the data-ingestion stage, judging by the path).
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run on another machine unless this is edited.
train_file_path = r"C:\Users\User\Machine-Learning-Project\housing\artifact\data_ingestion\2022-09-25-01-07-32\ingested_data\train\housing.csv"

In [14]:
import pandas as pd

In [15]:
df = pd.read_csv(train_file_path)

In [16]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [17]:
df.shape

(16512, 10)

In [18]:
# Target Column
df["median_house_value"]

0         72100.0
1        279600.0
2         82700.0
3        112500.0
4        238300.0
           ...   
16507    268500.0
16508     90400.0
16509    140400.0
16510    258100.0
16511     62700.0
Name: median_house_value, Length: 16512, dtype: float64

In [19]:
# Separate the features (x) from the target (y).
# y is selected with a list so it stays a one-column DataFrame rather than a Series.
x = df.drop(columns=["median_house_value"])
y = df[["median_house_value"]]

In [20]:
x.shape

(16512, 9)

In [21]:
y.shape

(16512, 1)

# Handling null values

In [23]:

x.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [24]:
# Keep only the numeric columns by removing the single categorical one.
# NOTE(review): this is built from df, so the target median_house_value is still included.
numerical_col = df.drop(columns=['ocean_proximity'])

In [26]:
numerical_col.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [27]:
# Fill missing values in every numeric column with that column's median
from sklearn.impute import SimpleImputer

simple_imputer = SimpleImputer(strategy="median")
# Learn the per-column medians first, then apply them; the transformed array is the cell output
simple_imputer.fit(numerical_col)
simple_imputer.transform(numerical_col)


array([[-1.2146e+02,  3.8520e+01,  2.9000e+01, ...,  7.0600e+02,
         2.1736e+00,  7.2100e+04],
       [-1.1723e+02,  3.3090e+01,  7.0000e+00, ...,  7.6800e+02,
         6.3373e+00,  2.7960e+05],
       [-1.1904e+02,  3.5370e+01,  4.4000e+01, ...,  3.0000e+02,
         2.8750e+00,  8.2700e+04],
       ...,
       [-1.2272e+02,  3.8440e+01,  4.8000e+01, ...,  1.7200e+02,
         3.1797e+00,  1.4040e+05],
       [-1.2270e+02,  3.8310e+01,  1.4000e+01, ...,  5.0100e+02,
         4.1964e+00,  2.5810e+05],
       [-1.2214e+02,  3.9970e+01,  2.7000e+01, ...,  1.9700e+02,
         3.1319e+00,  6.2700e+04]])

SimpleImputer (`sklearn.impute`): <u>SimpleImputer</u> replaces missing values in each column with a descriptive statistic of that column (e.g. mean, median, or most frequent value), or with a constant value.

In [29]:
simple_imputer.transform(numerical_col)

array([[-1.2146e+02,  3.8520e+01,  2.9000e+01, ...,  7.0600e+02,
         2.1736e+00,  7.2100e+04],
       [-1.1723e+02,  3.3090e+01,  7.0000e+00, ...,  7.6800e+02,
         6.3373e+00,  2.7960e+05],
       [-1.1904e+02,  3.5370e+01,  4.4000e+01, ...,  3.0000e+02,
         2.8750e+00,  8.2700e+04],
       ...,
       [-1.2272e+02,  3.8440e+01,  4.8000e+01, ...,  1.7200e+02,
         3.1797e+00,  1.4040e+05],
       [-1.2270e+02,  3.8310e+01,  1.4000e+01, ...,  5.0100e+02,
         4.1964e+00,  2.5810e+05],
       [-1.2214e+02,  3.9970e+01,  2.7000e+01, ...,  1.9700e+02,
         3.1319e+00,  6.2700e+04]])

In [30]:
simple_imputer.feature_names_in_

array(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'], dtype=object)

In [31]:
# median values for all the columns
simple_imputer.statistics_

array([-1.18510e+02,  3.42600e+01,  2.90000e+01,  2.11900e+03,
        4.33000e+02,  1.16400e+03,  4.08000e+02,  3.54155e+00,
        1.79500e+05])

In [32]:
# manually calculating the median value
x.longitude.median()

-118.51

# Creating custom features for in-depth data analysis

In [35]:
numerical_col.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [36]:
# These base classes let our custom class inherit the Scikit-learn methods
# get_params / set_params (BaseEstimator) and fit_transform (TransformerMixin)
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

The classes we import from sklearn.base are the glue that makes it all work. They are what allow our class to fit in with Scikit-learn’s pipelines and model selection tools. The BaseEstimator gives it the get_params and set_params methods that all Scikit-learn estimators require. The TransformerMixin gives it the fit_transform method.

In [80]:
# Columns from which additional features can be created

COLUMN_TOTAL_ROOMS = 'total_rooms'
COLUMN_TOTAL_BEDROOMS = 'total_bedrooms'
COLUMN_POPULATION = 'population'
COLUMN_HOUSEHOLD = 'households'


class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Append per-household / per-room ratio features to a 2-D numeric array.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    appends the following columns to the right of the input, in this order:

    * ``bedroom_per_room``          (only when ``add_bedrooms_per_room`` is True)
    * ``room_per_household``
    * ``population_per_household``
    """

    def __init__(self, add_bedrooms_per_room=True,
                 total_rooms_ix=3,
                 total_bedrooms_ix=4,
                 population_ix=5,
                 households_ix=6, columns=None):
        """
        FeatureGenerator Initialization

        add_bedrooms_per_room: bool, whether to also add the bedrooms-per-room ratio
        total_rooms_ix: int index number of total rooms column
        total_bedrooms_ix: int index number of bedrooms column
        population_ix: int index number of total population column
        households_ix: int index number of households column
        columns: optional sequence of column names (e.g. a list or a pandas
            Index). When given, the four ``*_ix`` arguments are ignored and the
            positions are looked up by name instead.

        Raises ValueError if ``columns`` is given but lacks one of the
        required column names.
        """
        self.columns = columns

        # Resolve positions by name when the column names are supplied.
        # list(...) lets this work for any sequence, not just a plain list.
        if self.columns is not None:
            column_names = list(self.columns)
            total_rooms_ix = column_names.index(COLUMN_TOTAL_ROOMS)
            total_bedrooms_ix = column_names.index(COLUMN_TOTAL_BEDROOMS)
            households_ix = column_names.index(COLUMN_HOUSEHOLD)
            population_ix = column_names.index(COLUMN_POPULATION)

        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.total_rooms_ix = total_rooms_ix
        self.total_bedrooms_ix = total_bedrooms_ix
        self.households_ix = households_ix
        self.population_ix = population_ix

    def fit(self, X, y=None):
        """Nothing to learn; present so the class works as a pipeline step."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the generated ratio columns appended on the right.

        X: 2-D array-like (rows x columns); converted to a NumPy array so that
            DataFrames and nested lists can be sliced positionally as well.
        y: ignored.
        """
        X = np.asarray(X)

        total_rooms = X[:, self.total_rooms_ix]
        households = X[:, self.households_ix]

        # Creating new columns
        room_per_household = total_rooms / households
        population_per_household = X[:, self.population_ix] / households

        if self.add_bedrooms_per_room:
            bedroom_per_room = X[:, self.total_bedrooms_ix] / total_rooms
            return np.c_[X, bedroom_per_room, room_per_household, population_per_household]

        return np.c_[X, room_per_household, population_per_household]

            
    

# Preparing processed_data.pkl file

In [81]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

Pipeline (`sklearn.pipeline`): the <u>Pipeline</u> utility helps to automate machine learning workflows. Pipelines work by allowing a linear sequence of data transforms to be chained together, culminating in a modeling process that can be evaluated.

ColumnTransformer (`sklearn.compose`): <u>ColumnTransformer</u> is a class in the scikit-learn machine learning library that allows you to selectively apply data preparation transforms to different columns.

For example, it allows you to apply a specific transform or sequence of transforms to just the numerical columns, and a separate sequence of transforms to just the categorical columns.

In [82]:
# Numeric branch: fill gaps with the median, append the ratio features, then standardise
numerical_feature_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('feature_generator', FeatureGenerator()),
    ('scaling', StandardScaler()),
])

In [83]:
# Categorical Feature Pipeline
# 1. fill missing categories with the most frequent value
# 2. one-hot encode; handle_unknown='ignore' encodes a category that was not
#    present at fit time as all zeros instead of raising during transform
# 3. scale without centring (with_mean=False), since the encoder output is
#    sparse by default and cannot be mean-centred
categorical_feature_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ('scaling', StandardScaler(with_mean=False))])


In [84]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [85]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [86]:
# Column groups handed to the ColumnTransformer: one list per pipeline.
# The target (median_house_value) is deliberately absent from both lists.
numerical_col = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

categorical_col = ['ocean_proximity']

In [89]:
# Send each column group through its own pipeline; the outputs are concatenated side by side
final_preprocessing = ColumnTransformer(transformers=[
    ('numerical_feature_pipeline', numerical_feature_pipeline, numerical_col),
    ('categorical_feature_pipeline', categorical_feature_pipeline, categorical_col),
])

In [90]:
final_preprocessing.fit_transform(df)

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  2.9869105 ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

# Transforming the test file 

In [91]:
# Location of the test CSV (presumably produced by the data-ingestion stage, judging by the path).
# NOTE(review): absolute, machine-specific Windows path — the notebook will not run on another machine unless this is edited.
test_file_path = r"C:\Users\User\Machine-Learning-Project\housing\artifact\data_ingestion\2022-09-25-01-07-32\ingested_data\test\housing.csv"

In [92]:
test_df = pd.read_csv(test_file_path)

In [93]:
final_preprocessing.transform(test_df)

array([[ 0.59229422, -0.71065803,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [-0.42180959, -0.35049119, -0.37006852, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56232071, -0.64985064,  0.5842485 , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.07211862, -0.56097831,  1.14093342, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.83208232, -0.93985512,  0.10708999, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50736927, -0.67791559,  0.5842485 , ...,  0.        ,
         0.        ,  0.        ]])

# Saving as a pickle file

In [95]:
import dill

In [96]:
# Serialise the fitted ColumnTransformer with dill.
# The relative file name means the file lands in the current working directory.
with open('preprocessing.pkl','wb') as prep_file:
    dill.dump(final_preprocessing,prep_file)

How to deserialize the preprocessing pickle file and load the object back in its original state

In [97]:
# Read the preprocessing object back from disk.
# NOTE(review): dill.load, like pickle.load, can execute arbitrary code while
# deserialising — only load files you created yourself or otherwise trust.
with open('preprocessing.pkl','rb') as file_obj:
    preprocessing_model_obj = dill.load(file_obj)

In [99]:
preprocessing_model_obj.transform(df)

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  2.9869105 ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

In [1]:
import os

In [2]:
os.getcwd()

'c:\\Users\\User\\Machine-Learning-Project\\notebook'

In [3]:
# Switch the working directory to the project root.
# NOTE(review): hard-coded absolute path; also this cell's execution count (In [3])
# is out of sequence with the cells above — confirm the intended run order.
os.chdir('c:\\Users\\User\\Machine-Learning-Project')