In [359]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [360]:
df = pd.read_csv("garments_worker_productivity.csv")

#EDA

In [361]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

In [362]:
df.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382


Data Preprocessing

*   **department:** standardized the spelling from (sweing,finishing ,finishing) to (sewing,finishing)
*   **wip:** (Work in Progress) the nan values are treated as zeros, so all missing values are imputed with 0
*   **date:** changed the data type to datetime then seperated into individual columns (date, month, year).
*   **quarter:** Original column has quarters 1-5 which does not make sense so a new quarter column is made from the month column. The quarter column is not used in modeling because all this data is recorded on the first quarter, it does not give variation.
*   **Outliers in the data:**did not fix because the models used are tree-based
*   **Year:** Dropped because it only contains one value which is 2015

In [363]:
df['wip'] = df['wip'].fillna(0)

In [364]:
print(df.isnull().mean() * 100)

date                     0.0
quarter                  0.0
department               0.0
day                      0.0
team                     0.0
targeted_productivity    0.0
smv                      0.0
wip                      0.0
over_time                0.0
incentive                0.0
idle_time                0.0
idle_men                 0.0
no_of_style_change       0.0
no_of_workers            0.0
actual_productivity      0.0
dtype: float64


In [366]:
print(df['department'].unique())
print(df['date'].unique())
print(df['quarter'].unique())

['sweing' 'finishing ' 'finishing']
['1/1/2015' '1/3/2015' '1/4/2015' '1/5/2015' '1/6/2015' '1/7/2015'
 '1/8/2015' '1/10/2015' '1/11/2015' '1/12/2015' '1/13/2015' '1/14/2015'
 '1/15/2015' '1/17/2015' '1/18/2015' '1/19/2015' '1/20/2015' '1/21/2015'
 '1/22/2015' '1/24/2015' '1/25/2015' '1/26/2015' '1/27/2015' '1/28/2015'
 '1/29/2015' '1/31/2015' '2/1/2015' '2/2/2015' '2/3/2015' '2/4/2015'
 '2/5/2015' '2/7/2015' '2/8/2015' '2/9/2015' '2/10/2015' '2/11/2015'
 '2/12/2015' '2/14/2015' '2/15/2015' '2/16/2015' '2/17/2015' '2/18/2015'
 '2/19/2015' '2/22/2015' '2/23/2015' '2/24/2015' '2/25/2015' '2/26/2015'
 '2/28/2015' '3/1/2015' '3/2/2015' '3/3/2015' '3/4/2015' '3/5/2015'
 '3/7/2015' '3/8/2015' '3/9/2015' '3/10/2015' '3/11/2015']
['Quarter1' 'Quarter2' 'Quarter3' 'Quarter4' 'Quarter5']


In [367]:
df['date'] = pd.to_datetime(df['date'])

In [368]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   date                   1197 non-null   datetime64[ns]
 1   quarter                1197 non-null   object        
 2   department             1197 non-null   object        
 3   day                    1197 non-null   object        
 4   team                   1197 non-null   int64         
 5   targeted_productivity  1197 non-null   float64       
 6   smv                    1197 non-null   float64       
 7   wip                    1197 non-null   float64       
 8   over_time              1197 non-null   int64         
 9   incentive              1197 non-null   int64         
 10  idle_time              1197 non-null   float64       
 11  idle_men               1197 non-null   int64         
 12  no_of_style_change     1197 non-null   int64         
 13  no_

In [369]:
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

In [370]:
df.drop(['date'], axis=1, inplace=True)

In [371]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   quarter                1197 non-null   object 
 1   department             1197 non-null   object 
 2   day                    1197 non-null   int32  
 3   team                   1197 non-null   int64  
 4   targeted_productivity  1197 non-null   float64
 5   smv                    1197 non-null   float64
 6   wip                    1197 non-null   float64
 7   over_time              1197 non-null   int64  
 8   incentive              1197 non-null   int64  
 9   idle_time              1197 non-null   float64
 10  idle_men               1197 non-null   int64  
 11  no_of_style_change     1197 non-null   int64  
 12  no_of_workers          1197 non-null   float64
 13  actual_productivity    1197 non-null   float64
 14  month                  1197 non-null   int32  
 15  year

In [372]:
# derive quarter from the month column
dividing = [
    df['month'].isin([1,2,3]),
    df['month'].isin([4,5,6]),
    df['month'].isin([7,8,9]),
    df['month'].isin([10,11,12])
]

options = [1,2,3,4]
df['new_quarter'] = np.select(dividing, options)
df.drop(['quarter'], axis=1, inplace=True)

In [373]:
print(df['month'].unique())
print(df['new_quarter'].unique())
print(df['year'].unique())

[1 2 3]
[1]
[2015]


In [374]:
#Dropped because there is no variation in the data, only has 1 value
df.drop(['new_quarter', 'year'], axis=1, inplace=True)

In [375]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   department             1197 non-null   object 
 1   day                    1197 non-null   int32  
 2   team                   1197 non-null   int64  
 3   targeted_productivity  1197 non-null   float64
 4   smv                    1197 non-null   float64
 5   wip                    1197 non-null   float64
 6   over_time              1197 non-null   int64  
 7   incentive              1197 non-null   int64  
 8   idle_time              1197 non-null   float64
 9   idle_men               1197 non-null   int64  
 10  no_of_style_change     1197 non-null   int64  
 11  no_of_workers          1197 non-null   float64
 12  actual_productivity    1197 non-null   float64
 13  month                  1197 non-null   int32  
dtypes: float64(6), int32(2), int64(5), object(1)
memory usag

In [376]:
df['department'].replace('sweing', 'sewing', inplace=True)
df['department'].replace('finishing ', 'finishing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['department'].replace('sweing', 'sewing', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['department'].replace('finishing ', 'finishing', inplace=True)


In [377]:
df['department'].unique()

array(['sewing', 'finishing'], dtype=object)

In [378]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   department             1197 non-null   object 
 1   day                    1197 non-null   int32  
 2   team                   1197 non-null   int64  
 3   targeted_productivity  1197 non-null   float64
 4   smv                    1197 non-null   float64
 5   wip                    1197 non-null   float64
 6   over_time              1197 non-null   int64  
 7   incentive              1197 non-null   int64  
 8   idle_time              1197 non-null   float64
 9   idle_men               1197 non-null   int64  
 10  no_of_style_change     1197 non-null   int64  
 11  no_of_workers          1197 non-null   float64
 12  actual_productivity    1197 non-null   float64
 13  month                  1197 non-null   int32  
dtypes: float64(6), int32(2), int64(5), object(1)
memory usag

In [379]:
X = df.drop('actual_productivity', axis=1)
y = df['actual_productivity']

#Pipeline

In [380]:
from sklearn.model_selection import train_test_split

In [381]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [382]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   department             1197 non-null   object 
 1   day                    1197 non-null   int32  
 2   team                   1197 non-null   int64  
 3   targeted_productivity  1197 non-null   float64
 4   smv                    1197 non-null   float64
 5   wip                    1197 non-null   float64
 6   over_time              1197 non-null   int64  
 7   incentive              1197 non-null   int64  
 8   idle_time              1197 non-null   float64
 9   idle_men               1197 non-null   int64  
 10  no_of_style_change     1197 non-null   int64  
 11  no_of_workers          1197 non-null   float64
 12  month                  1197 non-null   int32  
dtypes: float64(5), int32(2), int64(5), object(1)
memory usage: 112.3+ KB


In [383]:
num_columns = ['team', 'targeted_productivity', 'smv', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men', 'no_of_style_change', 'no_of_workers', 'month']
cat_columns = ['department']

In [384]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

In [385]:
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ])

category_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop ='first', handle_unknown ='ignore')),
    ])


In [386]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', category_transformer, cat_columns)])

In [387]:
preprocessor

In [419]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Random Forest Regressor
*best model out of the three*

In [394]:
from sklearn.ensemble import RandomForestRegressor

estimator2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])
estimator2.fit(X_train, y_train)


In [395]:
y_pred = estimator2.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print(f"MAE:", mean_absolute_error(y_test, y_pred))
print(f"RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

MSE: 0.011274017348087808
R²: 0.5754060234645721
MAE: 0.06696062184049113
RMSE: 0.1061791756800165


MSE: 0.011227425166301763
R²: 0.5771607448854464
MAE: 0.06784419290390839
RMSE: 0.10595954495137172

# XGBoost Regressor

In [396]:
from xgboost import XGBRegressor

In [397]:
estimator_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, random_state=42))
])
estimator_xgb.fit(X_train, y_train)

In [398]:
y_pred = estimator_xgb.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print(f"MAE:", mean_absolute_error(y_test, y_pred))
print(f"RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

MSE: 0.012352116927832835
R²: 0.534803408306919
MAE: 0.06954879979341756
RMSE: 0.11114007795495212


#Hyperparamter Tuning

In [399]:
from sklearn.model_selection import GridSearchCV

In [412]:
param_rfr = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 5, 10]
}

grid_rfr = GridSearchCV(estimator2, param_grid=param_rfr, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_rfr.fit(X_train, y_train)


Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [413]:
print("Best Parameters:", grid_rfr.best_params_)
print("Best CV Score (negative MSE):", grid_rfr.best_score_)


Best Parameters: {'regressor__max_depth': 10, 'regressor__n_estimators': 200}
Best CV Score (negative MSE): -0.0168732760118963


In [414]:
best_rfr = grid_rfr.best_estimator_

In [416]:
y_pred = best_rfr.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print(f"MAE:", mean_absolute_error(y_test, y_pred))
print(f"RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

MSE: 0.011227425166301763
R²: 0.5771607448854464
MAE: 0.06784419290390839
RMSE: 0.10595954495137172


The model shows good accuracy, reflected by the low values of MSE, MAE, and RMSE, and it explains about 58% of the variance in the data.

Pickling

In [417]:
import pickle

In [418]:
with open('trained_model.pkl', 'wb') as file:
    pickle.dump(best_rfr, file)