# Model Deployment Steps

In [35]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import set_config
set_config(display='diagram')

import time
# Record the start time
start_time = time.time()

In [36]:
data = pd.read_csv(r"C:\Users\dhami\Downloads\GitHub_Capstone_Project\CTREA-Dynamics\data\Pipeline_Dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884233 entries, 0 to 884232
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   884233 non-null  int64  
 1   List Year                    884233 non-null  int64  
 2   Assessed Value               884233 non-null  float64
 3   Sale Amount                  884233 non-null  float64
 4   Sales Ratio                  884233 non-null  float64
 5   Property Type                884233 non-null  object 
 6   Street Number                884233 non-null  int64  
 7   Minimum Estimated Occupancy  884233 non-null  int64  
 8   County                       884233 non-null  object 
 9   Reason Category              843753 non-null  object 
 10  year                         884233 non-null  int64  
 11  month                        884233 non-null  int64  
 12  day                          884233 non-null  int64  
dtyp

In [37]:
#define X and y
X = data.drop('Sale Amount', axis=1)
y = data['Sale Amount']

In [38]:
# Define the drop_columns function
def drop_columns(X):
    return X.drop(columns=columns_to_drop)

#applying log transformation 
def apply_log_transformation(X):
    X['Assessed Value'] = np.log1p(X['Assessed Value'])
    return X

        #creating new features for data comphresion
        def create_new_features(X):
            X['Assessed_Sales_Ratio'] = X['Assessed Value'] * X['Sales Ratio']
            #if you have more features and need to call same use same code
            #keep adding new feautres logic herein 
            return X

In [39]:
columns_to_drop = ['Unnamed: 0', 'Sales Ratio', 'month', 'day']
categorical_features = ['Property Type', 'County', 'Reason Category']

In [40]:
#create pipeline to drop with custom transformer
pipeline = Pipeline([
    #('feature_engineering', FunctionTransformer(create_new_features)),
    #step 1 : feature engineering
    ('log_transformation', FunctionTransformer(apply_log_transformation)),
    #step 2 : data transformation for machine learning model optimization
    ('drop_columns', FunctionTransformer(drop_columns)),
    #step 3 : identify features to drop 
    ('preprocessor', ColumnTransformer(
        transformers = [('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
        remainder = 'passthrough'
    )),
    #step 4 : numerical and categorical numerical encoding for numerical follow same step add line after categorical
    # Feature selection
    ('feature_selection', SelectKBest(f_regression, k=10)),
    #Regression Model 
    ('XGBoost_Regressor', XGBRegressor())
    #step 5 : Passing data through model for training and testing
])


In [41]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((707386, 12), (176847, 12), (707386,), (176847,))

In [42]:
# Apply transformations to target for test set
y_train_transformed = np.log1p(y_train)

# Fit the entire pipeline on the training data
pipeline.fit(X_train, y_train_transformed)

In [43]:
# Apply transformations to target for test set
y_test_transformed = np.log1p(y_test)

# Transform the test set using the pipeline
X_test_transformed = pipeline[:-1].transform(X_test)
# -1 will leave XGboost not to be applied for other steps in pipline will be done!!

# Make predictions on the transformed test set
y_pred_transformed = pipeline[-1].predict(X_test_transformed)

# Inverse transform predictions to get them back to the original scale
y_pred = np.expm1(y_pred_transformed)

# Evaluate the model
mse = mean_squared_error(y_test_transformed, y_pred_transformed)
mae = mean_absolute_error(y_test_transformed, y_pred_transformed)
r2 = r2_score(y_test_transformed, y_pred_transformed)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R2 Score: {r2}')

# Record the end time
end_time = time.time()
# Calculate the total time
total_time = end_time - start_time
# Print the total time
print(f"Total execution time: {total_time} seconds")

Mean Squared Error: 0.09792490478322377
Mean Absolute Error: 0.20223910664208916
R2 Score: 0.8546805840679981
Total execution time: 3.1186017990112305 seconds


# Pickle 

In [44]:
import pickle

# Specify the file path where you want to save the pickle file
pickle_file_path = 'pipeline_model.pkl'

# Save the pipeline to a pickle file
with open(pickle_file_path, 'wb') as file:
    pickle.dump(pipeline, file)

print(f'Pipeline saved to {pickle_file_path}')

Pipeline saved to pipeline_model.pkl


In [45]:
ls

 Volume in drive C is OS
 Volume Serial Number is AE01-C01E

 Directory of c:\Users\dhami\Downloads\GitHub_Capstone_Project\CTREA-Dynamics\notebooks

27-01-2024  02:44    <DIR>          .
07-01-2024  08:53    <DIR>          ..
27-01-2024  01:24    <DIR>          .ipynb_checkpoints
27-01-2024  02:52                68 abc.txt
25-01-2024  12:55         1,032,504 assessor_output.txt
25-01-2024  12:55         1,553,075 Assessor_unique_values.txt
25-01-2024  12:54           102,325 assesssor_real_estate_keywords.txt
04-01-2024  00:04           289,803 Data_Assessment.ipynb
24-01-2024  23:04            64,472 Data_encoding.ipynb
27-01-2024  02:52               114 demo.py
27-01-2024  02:54            27,911 Deployment_Steps.ipynb
11-01-2024  01:26           854,547 EDA.ipynb
25-01-2024  12:55           115,850 final_assessor_output.txt
11-01-2024  01:06           169,232 Outliers_Treatment.ipynb
27-01-2024  02:54           482,918 pipeline_model.pkl
27-01-2024  02:51         1,904,499 Regrees

In [46]:
#deserialisation 
reloaded_pickle = pickle.load(open('pipeline_model.pkl','rb'))

In [47]:
data[9:10]

Unnamed: 0.1,Unnamed: 0,List Year,Assessed Value,Sale Amount,Sales Ratio,Property Type,Street Number,Minimum Estimated Occupancy,County,Reason Category,year,month,day
9,9,2020,168900.0,352000.0,0.4798,Residential,39,5,Litchfield County,Other,2021,8,10


In [48]:
data.values[9]

array([9, 2020, 168900.0, 352000.0, 0.4798, 'Residential', 39, 5,
       'Litchfield County', 'Other', 2021, 8, 10], dtype=object)

In [49]:
# Assuming 'Sale Amount' is the name of the target variable column
input_data = [[9, 2020, 168900.0, 352000.0, 0.4798, 'Residential', 39, 5,
       'Litchfield County', 'Other', 2021, 8, 10]]

sale_amount = input_data[0][3]

# Create a DataFrame from the input list
input_df = pd.DataFrame(input_data, columns=['Unnamed: 0', 'List Year', 'Assessed Value', 'Sale Amount',
       'Sales Ratio', 'Property Type', 'Street Number',
       'Minimum Estimated Occupancy', 'County', 'Reason Category', 'year',
       'month', 'day'])

# Extract the target variable
target_variable = input_df['Sale Amount']

# Drop the target variable before using the pipeline
input_df = input_df.drop(columns=['Sale Amount'])

# Use the pipeline for prediction
prediction = reloaded_pickle.predict(input_df)

prediction = float(prediction)
prediction

  prediction = float(prediction)


12.606732368469238

In [50]:
#inverse of prediction for actual value
predict_value = np.expm1(prediction)
predict_value

298560.84266669047

In [51]:
sale_amount

352000.0

In [52]:
sa = np.log1p(sale_amount)
sa

12.77138929548529

In [53]:
#difference between true and preidction
error = sale_amount-predict_value
error

53439.15733330953

In [54]:
diff = sa - prediction
diff

0.16465692701605228

In [55]:
np.expm1(diff)

0.17898857019370834

In [56]:
np.log1p(error)

10.886317752317524

# Joblib

In [57]:
import joblib  
# For scikit-learn versions < 0.22
# For scikit-learn versions >= 0.22, use: from sklearn import joblib

# Assuming you have trained the pipeline and stored it in the variable 'pipeline'
joblib.dump(pipeline, 'regression_pipeline.joblib')

# Load the saved pipeline
reloaded_pipeline = joblib.load('regression_pipeline.joblib')

# Now, you can use 'reloaded_pipeline' to make predictions
y_pred = reloaded_pipeline.predict(X_test)

y_pred[:10]

array([9.418462, 9.26592 , 9.5395  , 9.430487, 9.493425, 9.33935 ,
       9.785561, 9.785561, 9.166654, 9.20828 ], dtype=float32)

In [58]:
np.expm1(y_pred)

array([12312.627, 10570.528, 13897.   , ..., 22252.613, 14927.105,
       34627.477], dtype=float32)

In [59]:
y_pred = reloaded_pipeline.predict(input_df)
y_pred

array([9.70481], dtype=float32)

In [60]:
170000.0-9780.757

160219.243

In [61]:
np.expm1(float(y_pred))

  np.expm1(float(y_pred))


16395.286291298802

In [62]:
y_test_transformed.head(10)

364213    12.506181
685188    11.608245
245987    12.802161
702718    11.728045
40463     12.128117
246708    12.468056
321763    11.089821
333937    12.206078
803652    12.255344
561435    11.512935
Name: Sale Amount, dtype: float64

    The pickle is performing better than joblib!!

# Streamlit Application 

In [63]:
import streamlit as st

In [64]:
%%writefile abc.txt 
#magic command
Welcome to PCGP DSML PROGRAMME 
Machine Learning 

Overwriting abc.txt


In [65]:
%%writefile demo.py
import streamlit as st
st.title('My first streamlit app')
st.header('Hello')
st.subheader('This a subheader')

Overwriting demo.py


In [66]:
#!streamlit run demo.py & npx localtunnel --port 8501

In [68]:
#!pip install streamlit
# app.py
import streamlit as st

def main():
    st.title("My Streamlit App")
    # Your Streamlit app code here

if __name__ == "__main__":
    main()

# !streamlit run demo.py


from IPython.display import IFrame

# Adjust the URL if your Streamlit app is running on a different port
url = "http://localhost:8501"
IFrame(src=url, width=1000, height=600)
