## Configuration 

In [8]:
#libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [9]:
## Load the four Excel workbooks, in the order the sections below explore them
density = pd.read_excel('../data/DensityReports_500k.xlsx')    # packaging reports
attributes = pd.read_excel('../data/ProductAttributes.xlsx')   # product attributes
scorecard = pd.read_excel('../data/SupplierScorecard.xlsx')    # supplier scorecard
incidents = pd.read_excel('../data/HistoricalIncidents.xlsx')  # historical incidents

## Density Report

This dataset is the central repository used to evaluate and optimize the packaging process at 
FashionWorld Retail. It contains 500,000 records of packaging reports, with each record representing 
an operational recommendation for how a product should be packaged and a final evaluation of the 
packaging quality.


Key Variables: 
- ReportID: A unique identifier for each report. 
- SupplierName: The name of the supplier involved (three suppliers are considered: SupplierA, SupplierB, and SupplierC).
- DateOfReport: The date on which the report was generated, ranging between January 1, 2023, and June 30, 2024. 
- GarmentType: The type of garment, categorized into 8 defined groups (e.g., Shirt, Pants, Jacket, Dress, Skirt, Suit, Coat, Sweater).
- Material: The material of the product, selected from 3 options (Cotton, Polyester, Wool). 
- ProductReference: A unique product code constructed based on the GarmentType and Material. This field serves as a key to link the report with detailed product attributes. 
- ProposedUnitsPerCarton: The recommended number of products per carton designed to optimize packaging efficiency. This value reflects operational considerations and may include some variability. 
- ProposedFoldingMethod: The recommended method for folding the product, classified into three categories (“Method1”, “Method2”, and “Method3”). 
- ProposedLayout: The proposed type of box or layout, now categorized into five distinct options (LayoutA, LayoutB, LayoutC, LayoutD, LayoutE) representing diverse logistical solutions. 
- PackagingQuality: The final label indicating the quality of the packaging, recorded as either “Good” or “Bad”, based on operational criteria linked to product attributes and logistics conditions. 
- DataLabeled: An indicator that confirms every record includes a quality label, which is 
essential for training and evaluating predictive models.

In [10]:
# Preview the first rows of the density reports
density.head()

Unnamed: 0,ReportID,SupplierName,DateOfReport,GarmentType,Material,ProductReference,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality,DataLabeled
0,1,SuplierB,2023-09-27,Dress,Cotton,DRE-COT-1992,7,Method1,LayoutA,Good,True
1,2,SupplierB,2024-05-30,Coat,Polyester,COA-POL-8528,12,Method2,LayoutB,Bad,True
2,3,SupplierC,2023-07-25,Sweater,Wool,SWE-WOO-2045,14,Method1,LayoutD,Bad,True
3,4,SupplierC,2023-06-16,Dress,Polyester,DRE-POL-3008,17,Method1,LayoutE,Good,True
4,5,SupplierA,2024-06-07,Shirt,Polyester,SHI-POL-5449,20,Method2,LayoutB,Good,True


In [11]:
# Column dtypes and non-null counts for the density reports
density.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   ReportID                500000 non-null  int64         
 1   SupplierName            500000 non-null  object        
 2   DateOfReport            500000 non-null  datetime64[ns]
 3   GarmentType             500000 non-null  object        
 4   Material                500000 non-null  object        
 5   ProductReference        489923 non-null  object        
 6   ProposedUnitsPerCarton  500000 non-null  int64         
 7   ProposedFoldingMethod   500000 non-null  object        
 8   ProposedLayout          490047 non-null  object        
 9   PackagingQuality        500000 non-null  object        
 10  DataLabeled             500000 non-null  bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(7)
memory usage: 38.6+ MB


In [12]:
# Summary statistics for the numeric and datetime columns of the density reports
# NOTE(review): in the saved output ProposedUnitsPerCarton ranges from -20 to 200;
# negative unit counts look like data errors — confirm and handle during cleaning.
density.describe()

Unnamed: 0,ReportID,DateOfReport,ProposedUnitsPerCarton
count,500000.0,500000,500000.0
mean,250000.5,2023-09-30 16:10:08.025600768,15.105572
min,1.0,2023-01-01 00:00:00,-20.0
25%,125000.75,2023-05-17 00:00:00,8.0
50%,250000.5,2023-10-01 00:00:00,12.0
75%,375000.25,2024-02-15 00:00:00,17.0
max,500000.0,2024-06-30 00:00:00,200.0
std,144337.711635,,21.782909


In [13]:
# Missing values per column in the density reports
density.isnull().sum()

ReportID                      0
SupplierName                  0
DateOfReport                  0
GarmentType                   0
Material                      0
ProductReference          10077
ProposedUnitsPerCarton        0
ProposedFoldingMethod         0
ProposedLayout             9953
PackagingQuality              0
DataLabeled                   0
dtype: int64

In [14]:
# How many distinct supplier labels appear in the density reports?
suppliers = density['SupplierName'].unique()
print(f'There are {len(suppliers)} suppliers in the dataset.')
# Print each label on its own line so that spelling variants are easy to spot
print('The suppliers are:')
for supplier in suppliers:
    print(supplier)
    

There are 6 suppliers in the dataset.
The suppliers are:
SuplierB
SupplierB
SupplierC
SupplierA
SuplierA
SuplierC


In [15]:
# The raw labels mix a misspelled prefix ("Suplier") with the correct one ("Supplier").
# Map every observed spelling onto a single canonical label per supplier.
# NOTE(review): the scorecard and incidents previews in this notebook show labels
# without a space (e.g. 'SupplierB'); the same normalization would have to be applied
# to those frames before joining on SupplierName — confirm.
replacements = {
    'SuplierA': 'Supplier A',
    'SupplierA': 'Supplier A',
    'SuplierB': 'Supplier B',
    'SupplierB': 'Supplier B',
    'SuplierC': 'Supplier C',
    'SupplierC': 'Supplier C',
}
density['SupplierName'] = density['SupplierName'].replace(replacements)

# Distinct labels that remain after the clean-up
updated_suppliers = density['SupplierName'].unique()

# Sanity check: report how many labels are left and list them one per line
print(f'There are {len(updated_suppliers)} suppliers in the dataset.')
print('The suppliers are:')
for supplier in updated_suppliers:
    print(supplier)


There are 3 suppliers in the dataset.
The suppliers are:
Supplier B
Supplier C
Supplier A


In [16]:
# Re-check missing values per column after unifying the supplier names
density.isnull().sum()

ReportID                      0
SupplierName                  0
DateOfReport                  0
GarmentType                   0
Material                      0
ProductReference          10077
ProposedUnitsPerCarton        0
ProposedFoldingMethod         0
ProposedLayout             9953
PackagingQuality              0
DataLabeled                   0
dtype: int64

In [17]:
# Look at the missing values more closely: keep one frame per affected column
# (ProductReference and ProposedLayout) so the rows can be inspected separately.
nulls_product_ref = density.loc[density['ProductReference'].isna()]
nulls_proposed_layout = density.loc[density['ProposedLayout'].isna()]



In [18]:
# Rows of the density reports whose ProductReference is missing
# NOTE(review): the saved output includes a ProposedFoldingMethod of 'Unknown', which is
# not one of the three documented methods — worth handling during cleaning.
nulls_product_ref

Unnamed: 0,ReportID,SupplierName,DateOfReport,GarmentType,Material,ProductReference,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality,DataLabeled
40,41,Supplier A,2024-06-27,Sweater,Polyester,,8,Method2,LayoutE,Good,True
52,53,Supplier C,2023-10-27,Jacket,Cotton,,14,Method3,LayoutA,Good,True
152,153,Supplier A,2023-12-08,Sweater,Polyester,,15,Method3,LayoutA,Good,True
162,163,Supplier C,2023-07-03,Skirt,Polyester,,11,Method3,LayoutC,Good,True
179,180,Supplier A,2023-01-02,Coat,Wool,,13,Method1,LayoutD,Bad,True
...,...,...,...,...,...,...,...,...,...,...,...
499733,499734,Supplier B,2024-01-24,Coat,Polyester,,7,Method2,LayoutB,Bad,True
499755,499756,Supplier C,2023-10-26,Coat,Polyester,,15,Method1,LayoutC,Good,True
499896,499897,Supplier C,2023-12-21,Shirt,Wool,,6,Unknown,LayoutA,Good,True
499923,499924,Supplier C,2023-05-16,Sweater,Polyester,,6,Method3,LayoutB,Good,True


In [19]:
# Rows of the density reports whose ProposedLayout is missing
# NOTE(review): the saved output includes a negative ProposedUnitsPerCarton (-12) —
# confirm how such values should be treated.
nulls_proposed_layout

Unnamed: 0,ReportID,SupplierName,DateOfReport,GarmentType,Material,ProductReference,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality,DataLabeled
29,30,Supplier A,2023-09-26,Pants,Wool,PAN-WOO-1587,9,Method1,,Bad,True
98,99,Supplier A,2024-06-02,Coat,Wool,COA-WOO-4651,5,Method2,,Bad,True
126,127,Supplier C,2024-03-02,Jacket,Wool,JAC-WOO-3479,18,Method2,,Bad,True
153,154,Supplier B,2023-11-08,Dress,Cotton,DRE-COT-4766,-12,Method1,,Good,True
259,260,Supplier B,2023-12-19,Coat,Polyester,COA-POL-3402,13,Method3,,Good,True
...,...,...,...,...,...,...,...,...,...,...,...
499781,499782,Supplier A,2023-01-05,Pants,Wool,PAN-WOO-3845,9,Method1,,Good,True
499798,499799,Supplier A,2024-05-19,Dress,Wool,DRE-WOO-8924,8,Method2,,Bad,True
499834,499835,Supplier A,2024-03-13,Sweater,Polyester,SWE-POL-5162,19,Method3,,Good,True
499870,499871,Supplier C,2023-10-23,Jacket,Cotton,JAC-COT-1362,19,Method3,,Good,True


## Product Attributes

This dataset provides detailed information for approximately 5,000 unique products. It is essential for 
understanding how specific product attributes impact packaging quality.

Key Variables: 
- ProductReference: A unique identifier for each product, which aligns with the ProductReference in DensityReports. 
- GarmentType: The type of garment. 
- Material: The material of the product. 
- ProductName: A descriptive name for the product. 
- Size: The size of the product. 
- Collection: The collection to which the product belongs (e.g., Summer or Winter). 
- Weight: The weight of the product, an attribute that may influence packaging decisions.

In [20]:
# Preview the first rows of the product attributes
attributes.head()

Unnamed: 0,ProductReference,GarmentType,Material,ProductName,Size,Collection,Weight
0,DRE-POL-3051,Dress,Polyester,Dress Polyester 3051,XXL,Winter,0.59
1,SWE-COT-8247,Sweater,Cotton,Sweater Cotton 8247,S,Summer,0.31
2,SKI-WOO-2650,Skirt,Wool,Skirt Wool 2650,M,Winter,1.52
3,SUI-POL-3201,Suit,Polyester,Suit Polyester 3201,XL,Winter,1.38
4,SUI-WOO-4038,Suit,Wool,Suit Wool 4038,XXL,Summer,1.1


In [21]:
# Column dtypes and non-null counts for the product attributes
attributes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ProductReference  5000 non-null   object 
 1   GarmentType       5000 non-null   object 
 2   Material          5000 non-null   object 
 3   ProductName       5000 non-null   object 
 4   Size              5000 non-null   object 
 5   Collection        5000 non-null   object 
 6   Weight            5000 non-null   float64
dtypes: float64(1), object(6)
memory usage: 273.6+ KB


In [22]:
# Summary statistics for the numeric columns of the product attributes
attributes.describe()

Unnamed: 0,Weight
count,5000.0
mean,1.092194
std,0.519381
min,0.2
25%,0.64
50%,1.1
75%,1.55
max,2.0


In [23]:
# Missing values per column in the product attributes
attributes.isnull().sum()

ProductReference    0
GarmentType         0
Material            0
ProductName         0
Size                0
Collection          0
Weight              0
dtype: int64

## Supplier Scorecard

This dataset captures the performance metrics of the suppliers. It enables the evaluation of supplier 
efficiency and consistency in meeting packaging standards, which may influence the overall quality of 
packaging. 

Key Variables: 
- SupplierName: The supplier’s name, which is used to link supplier performance with the packaging reports. 
- ReportDate: The date when the supplier performance was evaluated. 
- AdherenceScore: A score reflecting the supplier’s adherence to the established packaging standards. 
- NumberOfReminders: The number of reminders sent to the supplier regarding deviations from packaging guidelines. 
- OnTimeDelivery: An indicator of whether the supplier delivered on time, typically recorded as “Yes” or “No”. 
- CostSavings: The cost savings achieved through effective packaging practices. 

In [24]:
# Preview the first rows of the supplier scorecard
scorecard.head()

Unnamed: 0,SupplierName,ReportDate,AdherenceScore,NumberOfReminders,OnTimeDelivery,CostSavings
0,SupplierB,2024-05-30,70,0,Yes,3349.32
1,SupplierB,2023-04-23,54,5,Yes,4006.58
2,SupplierC,2023-02-01,99,1,Yes,737.58
3,SupplierC,2024-05-05,69,5,No,3211.89
4,SupplierB,2023-10-03,68,0,Yes,2302.41


In [25]:
# Count the scorecard rows whose ReportDate is 2023-06-29
scorecard[scorecard['ReportDate'] == '2023-06-29'].shape[0]

23

In [26]:
# Column dtypes and non-null counts for the supplier scorecard
scorecard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   SupplierName       10000 non-null  object        
 1   ReportDate         10000 non-null  datetime64[ns]
 2   AdherenceScore     10000 non-null  int64         
 3   NumberOfReminders  10000 non-null  int64         
 4   OnTimeDelivery     10000 non-null  object        
 5   CostSavings        10000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 468.9+ KB


In [27]:
# Summary statistics for the numeric and datetime columns of the supplier scorecard
scorecard.describe()

Unnamed: 0,ReportDate,AdherenceScore,NumberOfReminders,CostSavings
count,10000,10000.0,10000.0,10000.0
mean,2023-09-30 12:24:02.880000256,75.0915,2.5182,2754.948713
min,2023-01-01 00:00:00,50.0,0.0,500.47
25%,2023-05-18 00:00:00,62.0,1.0,1629.685
50%,2023-10-01 00:00:00,75.0,3.0,2757.685
75%,2024-02-14 00:00:00,88.0,4.0,3886.5875
max,2024-06-30 00:00:00,100.0,5.0,4998.54
std,,14.703141,1.706212,1296.358335


In [28]:
# Missing values per column in the supplier scorecard
scorecard.isnull().sum()

SupplierName         0
ReportDate           0
AdherenceScore       0
NumberOfReminders    0
OnTimeDelivery       0
CostSavings          0
dtype: int64

## Historical Incidents

This dataset documents historical incidents related to packaging issues. The recorded information 
provides insight into past problems, which may help in understanding and predicting which packaging 
operations fail to meet standards.

Key Variables: 
- IncidentID: A unique identifier for each incident. 
- DateOfIncident: The date when the incident occurred. 
- SupplierName: The supplier involved in the incident, linking to the relevant performance data. 
- ProductReference: The product code associated with the incident, used to connect with ProductAttributes and DensityReports. 
- IssueDescription: A description of the encountered problem (e.g., packaging error, damaged product). 
- ResolutionStatus: The current status regarding the resolution of the incident. 
- CostImpact: The economic impact associated with the incident.

In [29]:
# Preview the first rows of the historical incidents
incidents.head()

Unnamed: 0,IncidentID,DateOfIncident,SupplierName,ProductReference,IssueDescription,ResolutionStatus,CostImpact
0,1,2023-06-29,SupplierC,SHI-POL-7646,Incorrect units,In Progress,516.77
1,2,2023-03-25,SupplierA,DRE-POL-2824,Incorrect units,In Progress,368.65
2,3,2024-03-16,SupplierB,SKI-WOO-4511,Damaged product,In Progress,784.42
3,4,2023-03-22,SupplierB,COA-WOO-7770,Damaged product,In Progress,770.32
4,5,2023-08-14,SupplierC,SHI-POL-5504,Packaging error,In Progress,140.48


In [30]:
# Count the incidents whose DateOfIncident is 2023-06-29.
# Only the last expression of a cell is displayed, so the filtered frame that used to be
# built on a separate first line was computed and then discarded; it has been removed.
incidents[incidents['DateOfIncident'] == '2023-06-29'].shape[0]

24

In [31]:
# Column dtypes and non-null counts for the historical incidents
incidents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   IncidentID        10000 non-null  int64         
 1   DateOfIncident    10000 non-null  datetime64[ns]
 2   SupplierName      10000 non-null  object        
 3   ProductReference  10000 non-null  object        
 4   IssueDescription  10000 non-null  object        
 5   ResolutionStatus  10000 non-null  object        
 6   CostImpact        10000 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 547.0+ KB


In [32]:
# Summary statistics for the numeric and datetime columns of the historical incidents
incidents.describe()

Unnamed: 0,IncidentID,DateOfIncident,CostImpact
count,10000.0,10000,10000.0
mean,5000.5,2023-09-30 12:15:24.480000,547.69505
min,1.0,2023-01-01 00:00:00,100.02
25%,2500.75,2023-05-17 00:00:00,320.7725
50%,5000.5,2023-09-28 12:00:00,545.04
75%,7500.25,2024-02-15 00:00:00,768.785
max,10000.0,2024-06-30 00:00:00,999.89
std,2886.89568,,259.216493


In [33]:
# Missing values per column in the historical incidents
incidents.isnull().sum()

IncidentID          0
DateOfIncident      0
SupplierName        0
ProductReference    0
IssueDescription    0
ResolutionStatus    0
CostImpact          0
dtype: int64