<h1><center>Supply Chain Management (Model Building) </center></h1>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the hold-out warehouse records and add a combined zone label
# (e.g. "North Zone 5") built from the two separate zone columns.
df = pd.read_csv('supply_test.csv').assign(
    Regional_zone=lambda frame: frame['zone'] + ' ' + frame['WH_regional_zone']
)

df.head()

Unnamed: 0.1,Unnamed: 0,Ware_house_ID,WH_Manager_ID,Location_type,WH_capacity_size,zone,WH_regional_zone,num_refill_req_l3m,transport_issue_l1y,Competitor_in_mkt,...,dist_from_hub,workers_num,wh_est_year,storage_issue_reported_l3m,temp_reg_mach,approved_wh_govt_certificate,wh_breakdown_l3m,govt_check_l3m,product_wg_ton,Regional_zone
0,16621,WH_116621,EID_66621,Rural,Large,North,Zone 5,5,0,3,...,156,30.0,2006.0,24,0,A,2,5,30132,North Zone 5
1,16622,WH_116622,EID_66622,Rural,Large,North,Zone 5,5,0,2,...,79,31.0,2019.0,5,1,C,2,24,6075,North Zone 5
2,16623,WH_116623,EID_66623,Rural,Small,North,Zone 6,3,0,3,...,70,41.0,2008.0,19,1,A+,5,9,24076,North Zone 6
3,16624,WH_116624,EID_66624,Rural,Mid,West,Zone 4,5,2,2,...,255,33.0,2017.0,9,1,A+,3,11,13092,West Zone 4
4,16625,WH_116625,EID_66625,Urban,Mid,North,Zone 4,6,0,4,...,205,20.0,1999.0,25,0,B,4,26,29071,North Zone 4


In [4]:
# Keep only the model features plus the target column.
# `.copy()` makes test_df an independent frame, so later assignments to it
# do not raise SettingWithCopyWarning and can never touch `df`.
test_df = df[['wh_est_year', 'storage_issue_reported_l3m', 'wh_breakdown_l3m', 'Location_type',
              'WH_capacity_size', 'wh_owner_type', 'approved_wh_govt_certificate', 'Regional_zone',
              'product_wg_ton']].copy()

# Missing-value count per column.
test_df.isnull().sum()

wh_est_year                     2685
storage_issue_reported_l3m         0
wh_breakdown_l3m                   0
Location_type                      0
WH_capacity_size                   0
wh_owner_type                      0
approved_wh_govt_certificate     206
Regional_zone                      0
product_wg_ton                     0
dtype: int64

In [5]:
# Most frequent value of each column that has missing entries.
# NOTE(review): these statistics are taken from the test split itself; if the
# training-time imputation values are available, prefer those to avoid leakage.
test_year = test_df['wh_est_year'].mode()[0]

test_certificate = test_df['approved_wh_govt_certificate'].mode()[0]

# fillna returns a new frame instead of writing through `.loc` on a frame that
# was sliced out of `df`, which avoids SettingWithCopyWarning and keeps this
# cell idempotent on re-run.
test_df = test_df.fillna({
    'wh_est_year': test_year,
    'approved_wh_govt_certificate': test_certificate,
})

In [6]:
# Training split: selected feature columns plus the `product_wg_ton` target.
train_df = pd.read_csv('train_df.csv')

train_df.head(5)

Unnamed: 0.1,Unnamed: 0,wh_est_year,storage_issue_reported_l3m,wh_breakdown_l3m,Location_type,WH_capacity_size,wh_owner_type,approved_wh_govt_certificate,Regional_zone,product_wg_ton
0,0,2000.0,13,5,Urban,Small,Rented,A,West Zone 6,17115
1,1,2000.0,4,3,Rural,Large,Company Owned,A,North Zone 5,5074
2,2,2000.0,17,6,Rural,Mid,Company Owned,A,South Zone 2,23137
3,3,2000.0,17,3,Rural,Mid,Rented,A+,North Zone 3,22115
4,4,2009.0,18,6,Rural,Large,Company Owned,C,North Zone 5,24071


## Training and Testing data

In [7]:
# Training features: everything except the target and the saved index column.
X_train = train_df.drop(columns=['product_wg_ton', 'Unnamed: 0'])

X_train.head(5)

Unnamed: 0,wh_est_year,storage_issue_reported_l3m,wh_breakdown_l3m,Location_type,WH_capacity_size,wh_owner_type,approved_wh_govt_certificate,Regional_zone
0,2000.0,13,5,Urban,Small,Rented,A,West Zone 6
1,2000.0,4,3,Rural,Large,Company Owned,A,North Zone 5
2,2000.0,17,6,Rural,Mid,Company Owned,A,South Zone 2
3,2000.0,17,3,Rural,Mid,Rented,A+,North Zone 3
4,2009.0,18,6,Rural,Large,Company Owned,C,North Zone 5


In [8]:
# Test features: everything except the target.
X_test = test_df.drop(columns=['product_wg_ton'])

X_test.head(5)

Unnamed: 0,wh_est_year,storage_issue_reported_l3m,wh_breakdown_l3m,Location_type,WH_capacity_size,wh_owner_type,approved_wh_govt_certificate,Regional_zone
0,2006.0,24,2,Rural,Large,Company Owned,A,North Zone 5
1,2019.0,5,2,Rural,Large,Company Owned,C,North Zone 5
2,2008.0,19,5,Rural,Small,Rented,A+,North Zone 6
3,2017.0,9,3,Rural,Mid,Rented,A+,West Zone 4
4,1999.0,25,4,Urban,Mid,Rented,B,North Zone 4


In [9]:
# Training target: product weight shipped (column `product_wg_ton`).
y_train = train_df.loc[:, 'product_wg_ton']

y_train.head(5)

0    17115
1     5074
2    23137
3    22115
4    24071
Name: product_wg_ton, dtype: int64

In [10]:
# Test target: product weight shipped (column `product_wg_ton`).
y_test = test_df.loc[:, 'product_wg_ton']

y_test.head(5)

0    30132
1     6075
2    24076
3    13092
4    29071
Name: product_wg_ton, dtype: int64

## Building Model

In [11]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import ipywidgets as widgets
from IPython.display import display
from math import sqrt

In [12]:
# Feature columns grouped by the preprocessing they receive.
categorical_features = [
    'Location_type',
    'WH_capacity_size',
    'wh_owner_type',
    'approved_wh_govt_certificate',
    'Regional_zone',
]
numerical_features = [
    'wh_est_year',
    'storage_issue_reported_l3m',
    'wh_breakdown_l3m',
]

# Categorical columns are one-hot encoded into a dense array; categories not
# seen during fit are ignored rather than raising at predict time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numerical columns are rescaled with min-max scaling.
numerical_transformer = Pipeline([
    ('scaler', MinMaxScaler()),
])

# Apply each transformer to its column group; any other column passes through unchanged.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
    ],
    remainder='passthrough',
)

In [13]:
# Fixed seed so the stochastic estimators give the same fit (and therefore the
# same metrics) on every Restart & Run All, and so that the separate metric
# cells below all score identical models.
SEED = 42

lr  = LinearRegression()
dt  = DecisionTreeRegressor(random_state=SEED)
rf  = RandomForestRegressor(random_state=SEED)
ada = AdaBoostRegressor(random_state=SEED)
xgb = XGBRegressor(random_state=SEED)

# (display name, estimator) pairs compared in the evaluation cells.
regressors = [('LinearRegression', lr), ('DecisionTreeRegressor', dt), ('RandomForestRegressor', rf),
              ('AdaBoostRegressor', ada), ('XGBRegressor', xgb)]

In [14]:
# R² on the test split for every candidate model, printed as a percentage.
print('r2_score \n')

for name, estimator in regressors:

    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', estimator)])

    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    score = r2_score(y_test, y_pred)

    print(f'{name}: {score * 100:.2f}%')

r2_score 

LinearRegression: 98.53%
DecisionTreeRegressor: 98.71%
RandomForestRegressor: 99.18%
AdaBoostRegressor: 97.87%
XGBRegressor: 99.23%


In [15]:
# Mean absolute error on the test split for every candidate model.
print('Mean Absolute Error \n')

for name, estimator in regressors:

    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', estimator)])

    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    abs_error = mean_absolute_error(y_test, y_pred)

    print(f'{name}: {abs_error}')

Mean Absolute Error 

LinearRegression: 1062.377147766323
DecisionTreeRegressor: 938.8122909635931
RandomForestRegressor: 786.0246776081146
AdaBoostRegressor: 1350.2652242071235
XGBRegressor: 768.1762157046228


In [16]:
# Root mean squared error on the test split for every candidate model.
print('Root Mean Squared Error \n')

for name, estimator in regressors:

    # Same preprocessing for every candidate; only the final estimator changes.
    pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', estimator)])

    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    squared_error = mean_squared_error(y_test, y_pred)

    # RMSE is the square root of the MSE.
    print(f'{name}: {sqrt(squared_error)}')

Root Mean Squared Error 

LinearRegression: 1414.0961910574904
DecisionTreeRegressor: 1331.2020364099951
RandomForestRegressor: 1056.0344885590955
AdaBoostRegressor: 1705.3928592161842
XGBRegressor: 1020.923764770598


**In conclusion, RandomForestRegressor and XGBRegressor stand out as the top choices for modeling the relationship between the provided features and the weight of product shipped. Their consistently high performance across all three evaluation metrics (R², MAE and RMSE) makes them strong candidates for this regression task.**

**We have selected the XGBRegressor as the preferred model for our regression task.**

In [17]:
# Final model: the shared preprocessing followed by the selected XGBRegressor.
model = Pipeline(steps=[
              ('preprocessor', preprocessor),
              ('XGBRegressor', xgb)])

# Fit explicitly. Without this the pipeline only predicts because
# `preprocessor` and `xgb` happen to have been fitted in place by the
# evaluation cells above, which is hidden state that breaks if those cells
# change or are skipped.
model.fit(X_train, y_train)

# Alias kept because the original cell also bound this pipeline to `pipeline`.
pipeline = model

In [18]:
def prediction(model, X_train):
    """Display an input form and predict the product weight for the entered warehouse.

    Renders one widget per model feature, a "Make Prediction" button and an
    output area. Each click builds a single-row DataFrame from the current
    widget values, passes it to ``model.predict`` and prints the result in the
    output area.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict`` and accept a DataFrame with the columns
        ``wh_est_year``, ``storage_issue_reported_l3m``, ``wh_breakdown_l3m``,
        ``Location_type``, ``WH_capacity_size``, ``wh_owner_type``,
        ``approved_wh_govt_certificate`` and ``Regional_zone``.
    X_train : pandas.DataFrame
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    None
        The form and the predictions are shown through the notebook display.
    """
    # The default label width clips the longer descriptions; 'initial' lets
    # each label take the width it needs.
    label_style = {'description_width': 'initial'}

    # Numeric inputs.
    a = widgets.IntText(value=0, description='Establishment year:', style=label_style)

    b = widgets.IntText(value=0, description='No. of Storage issues in last 3 months:', style=label_style)

    c = widgets.IntText(value=0, description='No. of Breakdowns in last 3 months:', style=label_style)

    # Categorical inputs. `rows` is a Select/SelectMultiple option, not a
    # Dropdown trait, so it is not passed here.
    d = widgets.Dropdown(options=['Urban', 'Rural'], description='Location Type:', style=label_style)

    e = widgets.Dropdown(options=['Small', 'Large', 'Mid'], description='Capacity size:', style=label_style)

    f = widgets.Dropdown(options=['Rented', 'Company Owned'], description='Owner type:', style=label_style)

    g = widgets.Dropdown(options=['A+', 'A', 'B+', 'B', 'C'], description='Govt. approved Certificate:',
                         style=label_style)

    h = widgets.Dropdown(options=['West', 'North', 'South', 'East'], description=' Zone: ', style=label_style)

    i = widgets.Dropdown(options=['Zone 1', 'Zone 2', 'Zone 3', 'Zone 4', 'Zone 5', 'Zone 6'],
                         description='Regional Zone:', style=label_style)

    display(widgets.VBox([a, b, c, d, e, f, g, h, i]))

    # Everything printed (or raised) inside the click handler is routed here,
    # so it shows up below the button regardless of the notebook front end.
    result_area = widgets.Output()

    def make_prediction(_):
        """Button callback: predict from the current widget values."""
        # The model expects the zone and regional zone joined as "<zone> <regional zone>".
        rz = f"{h.value} {i.value}"

        user_df = pd.DataFrame({
            'wh_est_year': [a.value],
            'storage_issue_reported_l3m': [b.value],
            'wh_breakdown_l3m': [c.value],
            'Location_type': [d.value],
            'WH_capacity_size': [e.value],
            'wh_owner_type': [f.value],
            'approved_wh_govt_certificate': [g.value],
            'Regional_zone': [rz]
        })

        with result_area:
            predicted = model.predict(user_df)

            print(f"Weight of Product to be shipped : {predicted[0]} ton")

    button = widgets.Button(description='Make Prediction')
    button.on_click(make_prediction)
    display(button)
    display(result_area)

In [19]:
# Show the interactive form backed by the selected model.
prediction(model=model, X_train=X_train)

VBox(children=(IntText(value=0, description='Establishment year:'), IntText(value=0, description='No. of Stora…

Button(description='Make Prediction', style=ButtonStyle())

Weight of Product to be shipped : 15663.8603515625 ton
