### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.covariance import EmpiricalCovariance
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

### Globals

In [3]:
# Folder holding the untouched source workbooks; the trailing slash is part of
# the value so file names can be appended directly.
original_data_path = '../data/original/'

# One path per source workbook, built from the folder above.
density_reports_filepath = f'{original_data_path}DensityReports.xlsx'
historical_incidents_filepath = f'{original_data_path}HistoricalIncidents.xlsx'
product_attributes_filepath = f'{original_data_path}ProductAttributes.xlsx'
supplier_scorecard_filepath = f'{original_data_path}SupplierScorecard.xlsx'

### Load Data

In [4]:
# Read the three workbooks used in this notebook, in a fixed order, and bind
# each resulting DataFrame to its own name.
density_reports, product_attributes, supplier_scorecard = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        density_reports_filepath,
        product_attributes_filepath,
        supplier_scorecard_filepath,
    )
)


## Data Preparation

### Basic Transformations

In [5]:
# Count how often each raw PackagingQuality label occurs, to expose
# inconsistent spellings or casing before cleaning.
density_reports['PackagingQuality'].value_counts()

PackagingQuality
Good         397338
Bad           97017
GOOD           1914
bad            1868
Uncertain      1863
Name: count, dtype: int64

In [6]:
# Show the distinct labels exactly as they appear in the raw data.
print(density_reports['PackagingQuality'].unique())

# Upper-case every label so that variants differing only by case collapse
# into a single category.
packaging_quality_upper = density_reports['PackagingQuality'].str.upper()
density_reports['PackagingQuality'] = packaging_quality_upper

# Show the distinct labels again, now taken from the normalised column.
unique_packaging_quality = packaging_quality_upper.unique()
print(unique_packaging_quality)

['Good' 'Bad' 'GOOD' 'Uncertain' 'bad']
['GOOD' 'BAD' 'UNCERTAIN']


In [7]:
# Count how often each raw SupplierName spelling occurs, to see which
# variants need to be merged.
density_reports['SupplierName'].value_counts()

SupplierName
SupplierA    138607
SupplierB    110723
SupplierC     74490
SupplierD     53416
SupplierE     40662
SupplierF     32845
SupplierG     25510
SupplierH     13744
supplierA      1721
SupllierC      1714
supplierh      1674
SPLF           1642
SuplA          1629
SuppB          1623
Name: count, dtype: int64

In [8]:
# Distinct supplier spellings before standardisation.
unique_supplier = density_reports['SupplierName'].unique()
print(unique_supplier)

# Canonical supplier code -> every spelling that should collapse into it.
supplier_spellings = {
    'A': ('SupplierA', 'supplierA', 'SuplA'),
    'B': ('SupplierB', 'SuppB'),
    'C': ('SupplierC', 'SupllierC'),
    'D': ('SupplierD',),
    'E': ('SupplierE',),
    'F': ('SupplierF', 'SPLF'),
    'G': ('SupplierG',),
    'H': ('supplierh', 'SupplierH'),
}

# Invert the table into the spelling -> code lookup that Series.map expects.
supplier_mappings = {
    spelling: code
    for code, spellings in supplier_spellings.items()
    for spelling in spellings
}

# Translate each name; a name with no entry comes back as NaN from map() and
# is then restored to its original value by fillna().
raw_supplier_names = density_reports['SupplierName']
density_reports['SupplierName'] = raw_supplier_names.map(supplier_mappings).fillna(raw_supplier_names)

# Distinct supplier codes after standardisation.
unique_supplier = density_reports['SupplierName'].unique()
print(unique_supplier)

['SupplierA' 'SupplierC' 'SupplierD' 'SupplierB' 'supplierA' 'SupplierF'
 'SupplierE' 'supplierh' 'SupplierH' 'SuppB' 'SupplierG' 'SupllierC'
 'SPLF' 'SuplA']
['A' 'C' 'D' 'B' 'F' 'E' 'H' 'G']


In [9]:
# Re-count SupplierName after the mapping step to check the merged totals
# per supplier code.
density_reports['SupplierName'].value_counts()

SupplierName
A    141957
B    112346
C     76204
D     53416
E     40662
F     34487
G     25510
H     15418
Name: count, dtype: int64

In [10]:
# Count how often each raw ProposedFoldingMethod value occurs, to see which
# variants need to be merged.
density_reports['ProposedFoldingMethod'].value_counts()

ProposedFoldingMethod
Method2     218201
Method1     157652
Method3     114363
Methd1        2450
FoldX         2426
Method_2      2394
Name: count, dtype: int64

In [11]:
# Distinct folding-method spellings (including NaN, if present) before cleaning.
unique_fold_method = density_reports['ProposedFoldingMethod'].unique()
print(unique_fold_method)

# Canonical folding method -> every spelling that should collapse into it.
fold_method_spellings = {
    'Method1': ('Method1', 'Methd1'),
    'Method2': ('Method2', 'Method_2'),
    'Method3': ('Method3',),
    'MethodX': ('FoldX',),
}

# Invert the table into the spelling -> canonical lookup that Series.map expects.
fold_method_mappings = {
    spelling: canonical
    for canonical, spellings in fold_method_spellings.items()
    for spelling in spellings
}

# Standardise ProposedFoldingMethod; a value with no entry comes back as NaN
# from map() and is then restored to its original value by fillna().
raw_fold_methods = density_reports['ProposedFoldingMethod']
density_reports['ProposedFoldingMethod'] = raw_fold_methods.map(fold_method_mappings).fillna(raw_fold_methods)

# Rows whose folding method is still missing are removed from the frame.
density_reports = density_reports.dropna(subset=['ProposedFoldingMethod'])

# Distinct folding methods after cleaning.
unique_fold_method = density_reports['ProposedFoldingMethod'].unique()
print(unique_fold_method)

['Method2' 'Method1' 'Method3' 'Method_2' nan 'Methd1' 'FoldX']
['Method2' 'Method1' 'Method3' 'MethodX']


In [12]:
# Re-count ProposedFoldingMethod after the mapping step to check the merged
# totals per method.
density_reports['ProposedFoldingMethod'].value_counts()

ProposedFoldingMethod
Method2    220595
Method1    160102
Method3    114363
MethodX      2426
Name: count, dtype: int64

In [13]:
# Count how often each raw ProposedLayout value occurs, to see which variants
# need to be merged.
density_reports['ProposedLayout'].value_counts()

ProposedLayout
LayoutB    179446
LayoutC    138517
LayoutA     83629
LayoutD     64326
LayoutE     21318
LayC         2651
Box9         2564
LayoutX      2537
layouta      2498
Name: count, dtype: int64

In [14]:
# Distinct layout spellings before cleaning.
unique_layout = density_reports['ProposedLayout'].unique()
print(unique_layout)

# Canonical layout -> every spelling that should collapse into it.
layout_spellings = {
    'LayoutA': ('LayoutA', 'layouta'),
    'LayoutB': ('LayoutB',),
    'LayoutC': ('LayoutC', 'LayC'),
    'LayoutD': ('LayoutD',),
    'LayoutE': ('LayoutE',),
    'LayoutX': ('LayoutX', 'Box9'),
}

# Invert the table into the spelling -> canonical lookup that Series.map expects.
layout_mappings = {
    spelling: canonical
    for canonical, spellings in layout_spellings.items()
    for spelling in spellings
}

# Standardise ProposedLayout; a value with no entry comes back as NaN from
# map() and is then restored to its original value by fillna().
raw_layouts = density_reports['ProposedLayout']
density_reports['ProposedLayout'] = raw_layouts.map(layout_mappings).fillna(raw_layouts)

# Rows whose layout is still missing are removed from the frame.
density_reports = density_reports.dropna(subset=['ProposedLayout'])

# Distinct layouts after cleaning.
unique_layout = density_reports['ProposedLayout'].unique()
print(unique_layout)

['LayoutC' 'LayoutB' 'LayoutA' 'LayoutD' 'LayoutE' 'LayoutX' 'Box9'
 'layouta' 'LayC']
['LayoutC' 'LayoutB' 'LayoutA' 'LayoutD' 'LayoutE' 'LayoutX']


In [15]:
# Re-count ProposedLayout after the mapping step to check the merged totals
# per layout.
density_reports['ProposedLayout'].value_counts()

ProposedLayout
LayoutB    179446
LayoutC    141168
LayoutA     86127
LayoutD     64326
LayoutE     21318
LayoutX      5101
Name: count, dtype: int64

In [16]:
# List the distinct GarmentType values to check for inconsistent spellings.
unique_garment_type = density_reports['GarmentType'].unique()
print(unique_garment_type)

['Pants' 'T-Shirt' 'Shirt' 'Coat' 'Dress' 'Blouse' 'Suit' 'Hoodie' 'Skirt'
 'Jacket' 'Shorts' 'Sweater']


In [17]:
# List the distinct Material values to check for inconsistent spellings.
unique_material = density_reports['Material'].unique()
print(unique_material)

['Polyester' 'Denim' 'Cotton' 'Linen' 'Wool' 'Silk']


### Join Density Reports and Product Attributes Data

In [18]:
# Columns taken from each side of the join; ProductReference is the shared key.
report_columns = [
    "ProductReference", "SupplierName", "GarmentType", "Material",
    "Weight", "ProposedUnitsPerCarton", "ProposedFoldingMethod",
    "ProposedLayout", "PackagingQuality",
]
attribute_columns = ['ProductReference', 'Size', 'Collection']

# Inner join: a report row is kept only when its ProductReference also appears
# in product_attributes.
report_product_attributes = density_reports[report_columns].merge(
    product_attributes[attribute_columns],
    on="ProductReference",
    how='inner',
)

In [19]:
# Preview the first rows of the joined frame.
report_product_attributes.head()

Unnamed: 0,ProductReference,SupplierName,GarmentType,Material,Weight,ProposedUnitsPerCarton,ProposedFoldingMethod,ProposedLayout,PackagingQuality,Size,Collection
0,PRD07271,A,Pants,Polyester,0.35,29.0,Method2,LayoutC,GOOD,S,Winter
1,PRD00861,C,T-Shirt,Denim,0.21,20.0,Method2,LayoutB,GOOD,M,Winter
2,PRD05391,A,Shirt,Cotton,0.2,31.0,Method1,LayoutA,GOOD,S,Winter
3,PRD05192,A,Coat,Cotton,1.3,5.0,Method1,LayoutD,GOOD,XL,Winter
4,PRD05735,A,Coat,Polyester,1.11,9.0,Method2,LayoutD,GOOD,M,Autumn


In [20]:
# Show the dtype of every column in the joined frame (which columns are
# text and which are numeric).
report_product_attributes.dtypes

ProductReference           object
SupplierName               object
GarmentType                object
Material                   object
Weight                    float64
ProposedUnitsPerCarton    float64
ProposedFoldingMethod      object
ProposedLayout             object
PackagingQuality           object
Size                       object
Collection                 object
dtype: object

In [21]:
# List the distinct Collection values brought in from product_attributes.
unique_collection = report_product_attributes['Collection'].unique()
print(unique_collection)

['Winter' 'Autumn' 'Summer' 'Spring']


In [22]:
# List the distinct Size values brought in from product_attributes.
unique_size = report_product_attributes['Size'].unique()
print(unique_size)

['S' 'M' 'XL' 'L' 'XS']


### Association Analysis

In [23]:
# Take a copy of the joined frame to use as the working frame for the
# association analysis.
df = report_product_attributes.copy()

In [24]:
def cramers_v(x, y):
    """Compute Cramer's V, a 0-to-1 measure of association between two categorical variables.

    Parameters
    ----------
    x, y : array-like (typically pandas Series) of equal length
        Categorical observations. They are cross-tabulated with ``pd.crosstab``.

    Returns
    -------
    float
        ``sqrt(chi2 / (n * (min(n_rows, n_cols) - 1)))`` of the contingency
        table, where ``n`` is the total count. Returns ``nan`` when the
        statistic is undefined: the table is empty, or either variable has
        fewer than two observed categories.
    """
    contingency_table = pd.crosstab(x, y)
    n = contingency_table.to_numpy().sum()
    min_dim = min(contingency_table.shape) - 1

    # Guard the degenerate cases up front: chi2_contingency raises on an empty
    # table, and min_dim == 0 would otherwise produce a 0/0 division.
    if n == 0 or min_dim < 1:
        return np.nan

    # correction=False: Cramer's V is defined on the uncorrected chi-square.
    # SciPy's default applies Yates' continuity correction to 2x2 tables,
    # which would make a perfectly associated 2x2 pair score below 1.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]
    return np.sqrt(chi2 / (n * min_dim))


In [25]:
# Categorical columns to compare pairwise.
categorical_columns = ['SupplierName', 'GarmentType',
                       'Material', 'ProposedFoldingMethod',
                       'ProposedLayout', 'Size', 'Collection']

# One record per unordered pair: each column is paired only with the columns
# that come after it in the list, so no pair is computed twice.
results = [
    {'Var1': first, 'Var2': second, "CramersV": cramers_v(df[first], df[second])}
    for position, first in enumerate(categorical_columns)
    for second in categorical_columns[position + 1:]
]

# Strongest associations first.
cramers_v_df = pd.DataFrame(results).sort_values(by='CramersV', ascending=False)
cramers_v_df

Unnamed: 0,Var1,Var2,CramersV
11,Material,ProposedFoldingMethod,0.564221
8,GarmentType,ProposedLayout,0.461266
12,Material,ProposedLayout,0.085167
9,GarmentType,Size,0.0289
14,Material,Collection,0.028399
6,GarmentType,Material,0.028119
10,GarmentType,Collection,0.027595
15,ProposedFoldingMethod,ProposedLayout,0.024207
13,Material,Size,0.022407
20,Size,Collection,0.021282


### Mutual Information Analysis

In [26]:
# Start again from a fresh copy of the joined frame.
df = report_product_attributes.copy()

# PackagingQuality is the target; ProductReference is excluded from the
# feature set along with it.
non_feature_columns = ['PackagingQuality', 'ProductReference']
y = df['PackagingQuality']
X = df.drop(columns=non_feature_columns)

In [27]:
# Label encode all categorical variables (both features and target)
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

X_encoded = X.copy()
discrete_features = []

for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X[col])
        discrete_features.append(True)
    else:
        discrete_features.append(False)  # float columns

# Compute mutual information
mi_scores = mutual_info_classif(X_encoded, y_encoded, discrete_features=discrete_features)

# Present results
mi_df = pd.DataFrame({
    'Feature': X.columns,
    'MutualInformation': mi_scores
}).sort_values(by='MutualInformation', ascending=False)

print(mi_df)

                  Feature  MutualInformation
0            SupplierName           0.032370
2                Material           0.004636
5   ProposedFoldingMethod           0.003300
3                  Weight           0.003268
4  ProposedUnitsPerCarton           0.002094
1             GarmentType           0.001059
6          ProposedLayout           0.001041
7                    Size           0.000010
8              Collection           0.000004
