# Model Deployment Steps

In [36]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import set_config
set_config(display='diagram')

import time
# Record the start time
start_time = time.time()

In [37]:
data = pd.read_csv(r"C:\Users\dhami\Downloads\GitHub_Capstone_Project\CTREA-Dynamics\data\Pipeline_Dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884233 entries, 0 to 884232
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   884233 non-null  int64  
 1   List Year                    884233 non-null  int64  
 2   Assessed Value               884233 non-null  float64
 3   Sale Amount                  884233 non-null  float64
 4   Sales Ratio                  884233 non-null  float64
 5   Property Type                884233 non-null  object 
 6   Street Number                884233 non-null  int64  
 7   Minimum Estimated Occupancy  884233 non-null  int64  
 8   County                       884233 non-null  object 
 9   Reason Category              843753 non-null  object 
 10  year                         884233 non-null  int64  
 11  month                        884233 non-null  int64  
 12  day                          884233 non-null  int64  
dtyp

In [38]:
#define X and y
X = data.drop(columns=['Unnamed: 0','Sales Ratio','Sale Amount','month', 'day'], axis=1)
y = data['Sale Amount']

In [39]:
# Define the drop_columns function
# def drop_columns(X):
#     return X.drop(columns=columns_to_drop)

#applying log transformation 
def apply_log_transformation(X):
    X['Assessed Value'] = np.log1p(X['Assessed Value'])
    return X

        #creating new features for data comphresion
        def create_new_features(X):
            X['Assessed_Sales_Ratio'] = X['Assessed Value'] * X['Sales Ratio']
            #if you have more features and need to call same use same code
            #keep adding new feautres logic herein 
            return X

In [40]:
#columns_to_drop = ['month', 'day']
categorical_features = ['Property Type', 'County', 'Reason Category']

In [41]:
#create pipeline to drop with custom transformer
pipeline = Pipeline([
    #('feature_engineering', FunctionTransformer(create_new_features)),
    #step 1 : feature engineering
    ('log_transformation', FunctionTransformer(apply_log_transformation)),
    #step 2 : data transformation for machine learning model optimization
    #('drop_columns', FunctionTransformer(drop_columns)),
    #step 3 : identify features to drop 
    ('preprocessor', ColumnTransformer(
        transformers = [('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
        remainder = 'passthrough'
    )),
    #step 4 : numerical and categorical numerical encoding for numerical follow same step add line after categorical
    # Feature selection
    ('feature_selection', SelectKBest(f_regression, k=10)),
    #Regression Model 
    ('XGBoost_Regressor', XGBRegressor())
    #step 5 : Passing data through model for training and testing
])


In [42]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((707386, 8), (176847, 8), (707386,), (176847,))

In [43]:
# Apply transformations to target for test set
y_train_transformed = np.log1p(y_train)

# Fit the entire pipeline on the training data
pipeline.fit(X_train, y_train_transformed)

In [44]:
# Apply transformations to target for test set
y_test_transformed = np.log1p(y_test)

# Transform the test set using the pipeline
X_test_transformed = pipeline[:-1].transform(X_test)
# -1 will leave XGboost not to be applied for other steps in pipline will be done!!

# Make predictions on the transformed test set
y_pred_transformed = pipeline[-1].predict(X_test_transformed)

# Inverse transform predictions to get them back to the original scale
y_pred = np.expm1(y_pred_transformed)

# Evaluate the model
mse = mean_squared_error(y_test_transformed, y_pred_transformed)
mae = mean_absolute_error(y_test_transformed, y_pred_transformed)
r2 = r2_score(y_test_transformed, y_pred_transformed)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R2 Score: {r2}')

# Record the end time
end_time = time.time()
# Calculate the total time
total_time = end_time - start_time
# Print the total time
print(f"Total execution time: {total_time} seconds")

Mean Squared Error: 0.09792490478322377
Mean Absolute Error: 0.20223910664208916
R2 Score: 0.8546805840679981
Total execution time: 4.424556732177734 seconds


# Pickle 

In [45]:
import pickle

# Specify the file path where you want to save the pickle file
pickle_file_path = 'pipeline_model.pkl'

# Save the pipeline to a pickle file
with open(pickle_file_path, 'wb') as file:
    pickle.dump(pipeline, file)

print(f'Pipeline saved to {pickle_file_path}')

Pipeline saved to pipeline_model.pkl


In [46]:
ls

 Volume in drive C is OS
 Volume Serial Number is AE01-C01E

 Directory of C:\Users\dhami\Downloads\GitHub_Capstone_Project\CTREA-Dynamics\notebooks

31-01-2024  14:08    <DIR>          .
30-01-2024  16:53    <DIR>          ..
27-01-2024  01:24    <DIR>          .ipynb_checkpoints
30-01-2024  02:57    <DIR>          __pycache__
27-01-2024  02:54                68 abc.txt
25-01-2024  12:55         1,032,504 assessor_output.txt
25-01-2024  12:55         1,553,075 Assessor_unique_values.txt
25-01-2024  12:54           102,325 assesssor_real_estate_keywords.txt
04-01-2024  00:04           289,803 Data_Assessment.ipynb
30-01-2024  17:27            67,474 Data_encoding.ipynb
30-01-2024  19:59               114 demo.py
27-01-2024  02:56             2,768 Deployment_part_1.ipynb
31-01-2024  14:08            25,430 Deployment_Steps.ipynb
11-01-2024  01:26           854,547 EDA.ipynb
25-01-2024  12:55           115,850 final_assessor_output.txt
11-01-2024  01:06           169,232 Outliers_Treatm

In [47]:
#deserialisation 
reloaded_pickle = pickle.load(open('pipeline_model.pkl','rb'))

In [48]:
data[9:10]

Unnamed: 0.1,Unnamed: 0,List Year,Assessed Value,Sale Amount,Sales Ratio,Property Type,Street Number,Minimum Estimated Occupancy,County,Reason Category,year,month,day
9,9,2020,168900.0,352000.0,0.4798,Residential,39,5,Litchfield County,Other,2021,8,10


In [49]:
data.values[9]

array([9, 2020, 168900.0, 352000.0, 0.4798, 'Residential', 39, 5,
       'Litchfield County', 'Other', 2021, 8, 10], dtype=object)

In [50]:
# Assuming 'Sale Amount' is the name of the target variable column
input_data = [[9, 2020, 168900.0, 352000.0, 0.4798, 'Residential', 39, 5,
       'Litchfield County', 'Other', 2021, 8, 10]]

sale_amount = input_data[0][3]

# Create a DataFrame from the input list
input_df = pd.DataFrame(input_data, columns=['Unnamed: 0', 'List Year', 'Assessed Value', 'Sale Amount',
       'Sales Ratio', 'Property Type', 'Street Number',
       'Minimum Estimated Occupancy', 'County', 'Reason Category', 'year',
       'month', 'day'])

# Extract the target variable
target_variable = input_df['Sale Amount']

# Drop the target variable before using the pipeline
input_df = input_df.drop(columns=['Sale Amount'])

# Use the pipeline for prediction
prediction = reloaded_pickle.predict(input_df)

prediction = float(prediction)
prediction

12.606732368469238

In [51]:
#inverse of prediction for actual value
predict_value = np.expm1(prediction)
predict_value

298560.84266669047

In [52]:
sale_amount

352000.0

In [53]:
sa = np.log1p(sale_amount)
sa

12.77138929548529

In [54]:
#difference between true and preidction
error = sale_amount-predict_value
error

53439.15733330953

In [55]:
diff = sa - prediction
diff

0.16465692701605228

In [56]:
np.expm1(diff)

0.17898857019370834

In [57]:
np.log1p(error)

10.886317752317524

# Joblib

In [58]:
import joblib  
# For scikit-learn versions < 0.22
# For scikit-learn versions >= 0.22, use: from sklearn import joblib

# Assuming you have trained the pipeline and stored it in the variable 'pipeline'
joblib.dump(pipeline, 'regression_pipeline.joblib')

# Load the saved pipeline
reloaded_pipeline = joblib.load('regression_pipeline.joblib')

# Now, you can use 'reloaded_pipeline' to make predictions
y_pred = reloaded_pipeline.predict(X_test)

y_pred[:10]

array([9.418462, 9.26592 , 9.5395  , 9.430487, 9.493425, 9.33935 ,
       9.785561, 9.785561, 9.166654, 9.20828 ], dtype=float32)

In [59]:
np.expm1(y_pred)

array([12312.627, 10570.528, 13897.   , ..., 22252.613, 14927.105,
       34627.477], dtype=float32)

In [60]:
y_pred = reloaded_pipeline.predict(input_df)
y_pred

array([9.70481], dtype=float32)

In [61]:
170000.0-9780.757

160219.243

In [62]:
np.expm1(float(y_pred))

16395.286291298802

In [63]:
y_test_transformed.head(10)

364213    12.506181
685188    11.608245
245987    12.802161
702718    11.728045
40463     12.128117
246708    12.468056
321763    11.089821
333937    12.206078
803652    12.255344
561435    11.512935
Name: Sale Amount, dtype: float64

    The pickle is performing better than joblib!!

# Streamlit Application 

In [64]:
import streamlit as st

In [71]:
%%writefile demo.py
import streamlit as st
import streamlit as st
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

st.header('Machine Learning Regression Model')
list_year = st.selectbox('Property Listing Year',
                         [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
                          2015, 2016, 2017, 2018, 2019, 2020, 2021])
sale_recorded_year = st.selectbox('Property Sale Recorded Year',
                                  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
                                   2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])
assessed_value = st.number_input('Property Valuation in USD', min_value=0.0, value=10.0,
                                 placeholder="Type a number...")
sales_ratio = st.number_input('Sales Ratio', min_value=0.00, max_value=5.00, placeholder="Type a number...")
property_type = st.selectbox('Property Type', ['Commercial', 'Residential', 'Vacant Land', 'Miscellaneous',
                                               'Apartments', 'Industrial', 'Public Utility', 'Condo',
                                               'Two Family', 'Single Family', 'Four Family', 'Three Family'])
street_number = st.number_input('Street Number ', min_value=0, value=0, placeholder="Type a number...")
Minimum_Estimated_Occupancy = st.number_input('Minimum Estimated Occupancy', min_value=1, max_value=16,
                                              placeholder="Type a number...")
county = st.selectbox('County', ['New Haven County', 'Windham County', 'Hartford County',
                                 'Fairfield County', 'Litchfield County', 'Middlesex County',
                                 'New London County', 'Tolland County', 'Other'])
reason_sale = st.selectbox('Reason for sale', ['Not defined', ' Foreclosure', ' Other', ' Family',
                                               ' Change in Property', ' Plottage', ' Use Assessment', ' Tax',
                                               ' In Lieu Of Foreclosure', ' Two Towns', ' A Will',
                                               ' Portion of Property', ' Part Interest', ' Government Agency',
                                               ' Charitable Group', ' Court Order', ' Rehabilitation Deferred',
                                               ' Inter Corporation', ' Money and Personal Property',
                                               ' Non Buildable Lot', ' Correcting Deed', ' Deed Date',
                                               ' CRUMBLING FOUNDATION ASSESSMENT REDUCTION', ' Bankrupcy',
                                               ' Auction', ' Love and Affection', ' Personal Property Exchange',
                                               ' Zoning', ' Easement', ' Cemetery', ' No Consideration'])

button = st.button('Predict Sale Amount')

data = {'List Year': [list_year],
        'year': [sale_recorded_year],
        'Assessed Value': [assessed_value],
        'Sales Ratio': [sales_ratio],
        'Property Type': [property_type],
        'Street Number': [street_number],
        'Minimum Estimated Occupancy': [Minimum_Estimated_Occupancy],
        'County': [county],
        'Reason Category': [reason_sale]}

st.markdown('User Inputs')
input_data = pd.DataFrame(data)
st.dataframe(input_data.T)


# Define the drop_columns function
def drop_columns(input_data):
    return input_data.drop(columns=columns_to_drop)

#applying log transformation
def apply_log_transformation(input_data):
    input_data['Assessed Value'] = np.log1p(input_data['Assessed Value'])
    return input_data

columns_to_drop = ['Sales Ratio']
categorical_features = ['Property Type', 'County', 'Reason Category']

# deserialization
reloaded_pickle = pickle.load(open("C:\\Users\\dhami\\Downloads\\Cap_Data\\Real_notebooks\\pipeline_model.pkl", 'rb'))
# Use the pipeline for prediction
prediction = reloaded_pickle.predict(input_data)
prediction = float(prediction)
#inverse of prediction for actual value
predict_value = np.expm1(prediction)
st.write(predict_value)

Overwriting demo.py


In [72]:
#!wget -q -O - ipv4.icanhazip.com
#! streamlit run demo.py 
#& npx localtunnel --port 8501
#this works on google colab use ip address on turner for website access

^C


In [67]:
#run the streamlit command in local host in terminal thnen only it works like did in PyCharm 

# Final Streamlit code

In [73]:
%%writefile regression_model.py
import streamlit as st
import pandas as pd
import numpy as np
import pickle

st.header('Machine Learning Regression Model')
list_year = st.selectbox('Property Listing Year',
                         [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
                          2015, 2016, 2017, 2018, 2019, 2020, 2021])
sale_recorded_year = st.selectbox('Property Sale Recorded Year',
                                  [1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
                                   2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])
assessed_value = st.number_input('Property Valuation in USD', min_value=0.0, value=10.0,
                                 placeholder="Type a number...")
sales_ratio = st.number_input('Sales Ratio', min_value=0.00, max_value=5.00, placeholder="Type a number...")
property_type = st.selectbox('Property Type', ['Commercial', 'Residential', 'Vacant Land', 'Miscellaneous',
                                               'Apartments', 'Industrial', 'Public Utility', 'Condo',
                                               'Two Family', 'Single Family', 'Four Family', 'Three Family'])
street_number = st.number_input('Street Number ', min_value=0, value=0, placeholder="Type a number...")
Minimum_Estimated_Occupancy = st.number_input('Minimum Estimated Occupancy', min_value=1, max_value=16,
                                              placeholder="Type a number...")
county = st.selectbox('County', ['New Haven County', 'Windham County', 'Hartford County',
                                 'Fairfield County', 'Litchfield County', 'Middlesex County',
                                 'New London County', 'Tolland County', 'Other'])
reason_sale = st.selectbox('Reason for sale', ['Not defined', ' Foreclosure', ' Other', ' Family',
                                               ' Change in Property', ' Plottage', ' Use Assessment', ' Tax',
                                               ' In Lieu Of Foreclosure', ' Two Towns', ' A Will',
                                               ' Portion of Property', ' Part Interest', ' Government Agency',
                                               ' Charitable Group', ' Court Order', ' Rehabilitation Deferred',
                                               ' Inter Corporation', ' Money and Personal Property',
                                               ' Non Buildable Lot', ' Correcting Deed', ' Deed Date',
                                               ' CRUMBLING FOUNDATION ASSESSMENT REDUCTION', ' Bankrupcy',
                                               ' Auction', ' Love and Affection', ' Personal Property Exchange',
                                               ' Zoning', ' Easement', ' Cemetery', ' No Consideration'])

button = st.button('Predict Sale Amount')

data = {'List Year': [list_year],
        'year': [sale_recorded_year],
        'Assessed Value': [assessed_value],
        'Sales Ratio': [sales_ratio],
        'Property Type': [property_type],
        'Street Number': [street_number],
        'Minimum Estimated Occupancy': [Minimum_Estimated_Occupancy],
        'County': [county],
        'Reason Category': [reason_sale]}

st.markdown('User Inputs')
input_data = pd.DataFrame(data)
st.dataframe(input_data.T)

if button:
    # Define the drop_columns function
    def drop_columns(input_data):
        return input_data.drop(columns=columns_to_drop)

    # applying log transformation
    def apply_log_transformation(input_data):
        input_data['Assessed Value'] = np.log1p(input_data['Assessed Value'])
        return input_data

    columns_to_drop = ['Sales Ratio']
    categorical_features = ['Property Type', 'County', 'Reason Category']

    # deserialization
    reloaded_pickle = pickle.load(open("C:\\Users\\dhami\\Downloads\\Cap_Data\\Real_notebooks\\pipeline_model.pkl", 'rb'))
    # Use the pipeline for prediction
    prediction = reloaded_pickle.predict(input_data)
    prediction = float(prediction)
    # inverse of prediction for actual value
    predict_value = np.expm1(prediction)
    st.write('Predicted Sale Amount:', predict_value)


Writing regression_model.py


In [74]:
! streamlit run regression_model.py 

^C
