In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif

In [3]:
PATH_DENSITY_REPORT       = '../csv/density_report.csv'
PATH_HISTORICAL_INCIDENTS = '../csv/historical_incidents.csv'
PATH_PRODUCT_ATTRIBUTES   = '../csv/product_attributes.csv'
PATH_SUPPLIER_SCORECARD   = '../csv/supplier_scorecard.csv'

In [4]:
densityReportDF = pd.read_csv(PATH_DENSITY_REPORT, delimiter=";")
productAttributesDF = pd.read_csv(PATH_PRODUCT_ATTRIBUTES, delimiter=";")

In [15]:
reportProductAttributesDF = pd.merge(
    densityReportDF[["ProductReference", "GarmentType", "Material", "Weight", "PackagingQuality"]], 
    productAttributesDF[['ProductReference', 'Size', 'Collection']], 
    on="ProductReference", 
    how='inner'
    )

reportProductAttributesDF

Unnamed: 0,ProductReference,GarmentType,Material,Weight,PackagingQuality,Size,Collection
0,PRD07271,Pants,Polyester,0.35,Good,S,Winter
1,PRD00861,T-Shirt,Denim,0.21,Good,M,Winter
2,PRD05391,Shirt,Cotton,0.20,Good,S,Winter
3,PRD05192,Coat,Cotton,1.30,Good,XL,Winter
4,PRD05735,Coat,Polyester,1.11,Good,M,Autumn
...,...,...,...,...,...,...,...
484634,PRD06239,T-Shirt,Polyester,0.13,Bad,S,Autumn
484635,PRD02248,T-Shirt,Cotton,0.14,Good,L,Winter
484636,PRD07434,Pants,Cotton,0.42,Good,L,Summer
484637,PRD04320,Dress,Cotton,0.51,Good,L,Winter


In [None]:
for col in reportProductAttributesDF.columns:
    if col in ['ProductReference', "PackagingQuality"]:
        continue
    plt.figure()
    if pd.api.types.is_numeric_dtype(reportProductAttributesDF[col]):
        desc = reportProductAttributesDF[col].describe()
        print(f"\nUnivariate summary for {col}:\n{desc}\n")
        sns.histplot(reportProductAttributesDF[col].dropna(), kde=True)
        plt.title(f'Histogram of {col}')
        plt.savefig(f"../EDA/univariate_plots/{col}_hist.png")
    else:
        counts = reportProductAttributesDF[col].value_counts(dropna=False)
        print(f"\nUnivariate counts for {col}:\n{counts}\n")
        counts.plot(kind='bar')
        plt.title(f'Bar chart of {col}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f"../EDA/univariate_plots/{col}_bar.png")
    plt.close()



Univariate counts for GarmentType:
GarmentType
Shirt      98754
Pants      73207
Jacket     49847
T-Shirt    46783
Dress      39591
Skirt      39462
Suit       29098
Shorts     25490
Coat       23871
Sweater    22879
Blouse     22786
Hoodie     12871
Name: count, dtype: int64




Univariate counts for Material:
Material
Cotton       191096
Polyester    121339
Linen         48742
Wool          47842
Silk          40578
Denim         35042
Name: count, dtype: int64


Univariate summary for Weight:
count    484639.000000
mean          0.461040
std           0.349811
min           0.080000
25%           0.210000
50%           0.330000
75%           0.620000
max           2.320000
Name: Weight, dtype: float64


Univariate counts for Size:
Size
M     169798
L     147305
S      95942
XL     47981
XS     23613
Name: count, dtype: int64


Univariate counts for Collection:
Collection
Summer    150931
Winter    140278
Spring    108036
Autumn     85394
Name: count, dtype: int64



In [None]:
for col in reportProductAttributesDF.columns:
    if col in ['ProductReference', "PackagingQuality"]:
        continue
    plt.figure()
    if pd.api.types.is_numeric_dtype(reportProductAttributesDF[col]):
        sns.boxplot(x=reportProductAttributesDF["PackagingQuality"], y=reportProductAttributesDF[col])
        plt.title(f'Boxplot of {col} by {"PackagingQuality"}')
        plt.tight_layout()
        plt.savefig(f"../EDA/bivariate_plots/{col}_box.png")
    else:
        ct = pd.crosstab(reportProductAttributesDF[col], reportProductAttributesDF["PackagingQuality"], normalize='index')
        ct.plot(kind='bar', stacked=True)
        plt.title(f'Stacked bar of {col} by {"PackagingQuality"}')
        plt.legend(title="PackagingQuality")
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(f"../EDA/bivariate_plots/{col}_stacked.png")
    plt.close()

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

In [None]:
"""
Compute relevance of attributes to the binary target.
For numeric features: Pearson correlation.
For all features: mutual information.
Handles non-numeric binary targets by encoding them as a pandas Series.
"""
results = []
# Encode target if not numeric, ensure pandas Series
y_raw = reportProductAttributesDF["PackagingQuality"]
if not pd.api.types.is_numeric_dtype(y_raw):
    codes = pd.Categorical(y_raw).codes
    y = pd.Series(codes, index=reportProductAttributesDF.index)
else:
    y = y_raw

# Numeric attributes: Pearson correlation
for col in reportProductAttributesDF.columns:
    if col in ['ProductReference', "PackagingQuality"]:
        continue
    if pd.api.types.is_numeric_dtype(reportProductAttributesDF[col]):
        corr = reportProductAttributesDF[col].corr(y)
        results.append((col, 'correlation', corr))

# Prepare features for mutual information: one-hot encode categorical
X = reportProductAttributesDF.drop(columns=['ProductReference', "PackagingQuality"])
X_enc = pd.get_dummies(X, drop_first=True).fillna(0)

# Mutual information for all attributes
mi_scores = mutual_info_classif(X_enc, y)
mi_series = pd.Series(mi_scores, index=X_enc.columns)

# Aggregate MI per original feature
mi_orig = {}
for feat in X.columns:
    if pd.api.types.is_numeric_dtype(X[feat]):
        # numeric features appear as original names in one-hot encoding
        mi_orig[feat] = mi_series.get(feat, 0)
    else:
        # sum over all dummy columns for categorical feat
        dummy_cols = [c for c in mi_series.index if c.startswith(f"{feat}_")]  
        mi_orig[feat] = mi_series[dummy_cols].sum()

for feat, score in mi_orig.items():
    results.append((feat, 'mutual_info', score))

# Compile results
rel_df = pd.DataFrame(results, columns=['feature', 'method', 'score'])
rel_df = rel_df.sort_values(['method', 'score'], ascending=[True, False])



In [None]:
rel_df.to_csv('../EDA/relevance_scores.csv', index=False)

: 