In [1]:
import pandas as pd
from pandasql import PandaSQL
import warnings 
pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

In [2]:
productCatalogDf = pd.read_csv('../data/product_catalog_cleaned.csv')

In [3]:
productCatalogDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308 entries, 0 to 307
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Manufacturernumber          308 non-null    object 
 1   Articlenumber               308 non-null    object 
 2   EAN                         282 non-null    float64
 3   Technical details           304 non-null    object 
 4   Picture normal reduced      304 non-null    object 
 5   Depth m                     264 non-null    float64
 6   Width m                     264 non-null    float64
 7   Length m                    264 non-null    float64
 8   Weight kg                   304 non-null    float64
 9   Delivery time days          4 non-null      float64
 10  Type of product             241 non-null    object 
 11  Price quantity              308 non-null    int64  
 12  ETIM Features               44 non-null     object 
 13  ETIM                        44 non-

In [4]:
productCatalogDf.head(2)

Unnamed: 0,Manufacturernumber,Articlenumber,EAN,Technical details,Picture normal reduced,Depth m,Width m,Length m,Weight kg,Delivery time days,...,Short description 2,Long description,Language,Manufacturername,total_good,total_bad,Manufacturer_product_count,Product_length_category,Volume_m3,Description_length
0,0 601 6B4 000,06016B4000,,§Titel§Akku-Tauchsäge BITURBO GKT 18V-52 GC Pr...,'https://www.nexmart.com/media/catalog/ampshar...,0.254,0.36,0.444,4.032,,...,GKT 18V-52 GC (L) solo CLC,"Akku-Tauchsäge BITURBO GKT 18V-52 GC, Die Akku...",de,BOSCH,13,4,140,Medium,0.040599,1640
1,0 601 6B4 000,06016B4000,,§Titel§Akku-Tauchsäge BITURBO GKT 18V-52 GC Pr...,'https://www.nexmart.com/media/catalog/ampshar...,0.254,0.36,0.444,4.032,,...,,,en,BOSCH,11,6,140,Medium,0.040599,0


In [5]:
productCatalogDf.tail(2)

Unnamed: 0,Manufacturernumber,Articlenumber,EAN,Technical details,Picture normal reduced,Depth m,Width m,Length m,Weight kg,Delivery time days,...,Short description 2,Long description,Language,Manufacturername,total_good,total_bad,Manufacturer_product_count,Product_length_category,Volume_m3,Description_length
306,RALB2EU,RALB2EU,4012079000000.0,§Nennspannung§18§V|§Kapazität§5§Ah|§Ausführung...,'https://www.nexmart.com/media/catalog/ampshar...,,,,0.64,,...,,"Bosch Li-Ion Akku 18V/5Ah Akku, geeignet für d...",de,BOSCH,11,6,140,,1.0,110
307,RALB2US,RALB2US,4012079000000.0,§Nennspannung§18§V|§Kapazität§5§Ah|§Ausführung...,'https://www.nexmart.com/media/catalog/ampshar...,,,,0.64,,...,,"Bosch Akku 18V /5,0 Ah Li-Ion 1607A3502E",de,BOSCH,11,6,140,,1.0,40


Which manufacturers have the biggest improvement potential in their data quality in absolute and relative numbers?

In [6]:
query = """
SELECT
    Manufacturername,
    COUNT(*) AS total_products,

    -- Count of bad values in selected critical fields
    SUM(CASE WHEN [Short description] IS NULL THEN 1 ELSE 0 END) AS bad_short_desc,
    SUM(CASE WHEN [Short description 2] IS NULL THEN 1 ELSE 0 END) AS bad_short_desc_2,
    SUM(CASE WHEN [Long description] IS NULL THEN 1 ELSE 0 END) AS missing_long_desc,
    SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) = '' THEN 1 ELSE 0 END) AS missing_ean,
    SUM(CASE WHEN [Picture normal reduced] IS NULL OR TRIM([Picture normal reduced]) = '' THEN 1 ELSE 0 END) AS missing_picture_url,
    SUM(CASE WHEN Volume_m3 IS NULL OR Volume_m3 = 0 THEN 1 ELSE 0 END) AS missing_weight,
    SUM(CASE WHEN [Depth m] IS NULL OR [Depth m] = 0 THEN 1 ELSE 0 END) AS missing_depth,
    SUM(CASE WHEN [Width m] IS NULL OR [Width m] = 0 THEN 1 ELSE 0 END) AS missing_width,
    SUM(CASE WHEN [Length m] IS NULL OR [Length m] = 0 THEN 1 ELSE 0 END) AS missing_length,

    -- Total bad fields
    (
        SUM(CASE WHEN [Short description] IS NULL THEN 1 ELSE 0 END)+
        SUM(CASE WHEN [Short description 2] IS NULL THEN 1 ELSE 0 END)+
        SUM(CASE WHEN [Long description] IS NULL THEN 1 ELSE 0 END)+
        SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) = '' THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Picture normal reduced] IS NULL OR TRIM([Picture normal reduced]) = '' THEN 1 ELSE 0 END) +
        SUM(CASE WHEN Volume_m3 IS NULL OR Volume_m3 = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Depth m] IS NULL OR [Depth m] = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Width m] IS NULL OR [Width m] = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Length m] IS NULL OR [Length m] = 0 THEN 1 ELSE 0 END)
    ) AS total_bad_fields

FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY total_bad_fields DESC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,total_products,bad_short_desc,bad_short_desc_2,missing_long_desc,missing_ean,missing_picture_url,missing_weight,missing_depth,missing_width,missing_length,total_bad_fields
0,GUSTAV KLAUKE GMBH,40,0,40,0,0,4,0,40,40,40,164
1,FEIN,100,0,100,0,0,0,3,3,3,3,112
2,BOSCH,140,4,8,28,26,0,0,4,4,4,78
3,ROTHENBERGER,23,0,23,0,0,0,0,0,0,0,23
4,FISCHER,5,0,0,0,0,0,0,0,0,0,0


In [7]:
query = """
SELECT
    Manufacturername,
    COUNT(*) AS total_products,

    -- Count of bad values in selected critical fields
    SUM(CASE WHEN [Short description] IS NULL THEN 1 ELSE 0 END) AS bad_short_desc,
    SUM(CASE WHEN [Short description 2] IS NULL THEN 1 ELSE 0 END) AS bad_short_desc_2,
    SUM(CASE WHEN [Long description] IS NULL THEN 1 ELSE 0 END) AS missing_long_desc,
    SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) = '' THEN 1 ELSE 0 END) AS missing_ean,
    SUM(CASE WHEN [Picture normal reduced] IS NULL OR TRIM([Picture normal reduced]) = '' THEN 1 ELSE 0 END) AS missing_picture_url,
    SUM(CASE WHEN Volume_m3 IS NULL OR Volume_m3 = 0 THEN 1 ELSE 0 END) AS missing_weight,
    SUM(CASE WHEN [Depth m] IS NULL OR [Depth m] = 0 THEN 1 ELSE 0 END) AS missing_depth,
    SUM(CASE WHEN [Width m] IS NULL OR [Width m] = 0 THEN 1 ELSE 0 END) AS missing_width,
    SUM(CASE WHEN [Length m] IS NULL OR [Length m] = 0 THEN 1 ELSE 0 END) AS missing_length,
   

    -- Total bad fields
    (
        SUM(CASE WHEN [Short description] IS NULL THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Short description 2] IS NULL THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Long description] IS NULL THEN 1 ELSE 0 END) +
        SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) = '' THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Picture normal reduced] IS NULL OR TRIM([Picture normal reduced]) = '' THEN 1 ELSE 0 END) +
        SUM(CASE WHEN Volume_m3 IS NULL OR Volume_m3 = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Depth m] IS NULL OR [Depth m] = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Width m] IS NULL OR [Width m] = 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Length m] IS NULL OR [Length m] = 0 THEN 1 ELSE 0 END)
        
    ) AS total_bad_fields,

    -- Relative bad field percentage 
    ROUND(
        (
            (
                SUM(CASE WHEN [Short description] IS NULL THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Short description 2] IS NULL THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Long description] IS NULL THEN 1 ELSE 0 END) +
                SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) = '' THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Picture normal reduced] IS NULL OR TRIM([Picture normal reduced]) = '' THEN 1 ELSE 0 END) +
                SUM(CASE WHEN Volume_m3 IS NULL OR Volume_m3 = 0 THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Depth m] IS NULL OR [Depth m] = 0 THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Width m] IS NULL OR [Width m] = 0 THEN 1 ELSE 0 END) +
                SUM(CASE WHEN [Length m] IS NULL OR [Length m] = 0 THEN 1 ELSE 0 END)
                
                
            ) * 1.0
            /
            (COUNT(*) * 10)
        ) * 100, 2
    ) AS pct_bad_fields

FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY pct_bad_fields DESC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,total_products,bad_short_desc,bad_short_desc_2,missing_long_desc,missing_ean,missing_picture_url,missing_weight,missing_depth,missing_width,missing_length,total_bad_fields,pct_bad_fields
0,GUSTAV KLAUKE GMBH,40,0,40,0,0,4,0,40,40,40,164,41.0
1,FEIN,100,0,100,0,0,0,3,3,3,3,112,11.2
2,ROTHENBERGER,23,0,23,0,0,0,0,0,0,0,23,10.0
3,BOSCH,140,4,8,28,26,0,0,4,4,4,78,5.57
4,FISCHER,5,0,0,0,0,0,0,0,0,0,0,0.0


What product variable/column (description or property) usually contains data of good quality per manufacturer? And what is the % of good quality records per variable/column and manufacturer?

In [9]:
query = """
SELECT
    Manufacturername,

    SUM(CASE WHEN [Short description] IS NOT NULL AND TRIM([Short description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_short_description,
    SUM(CASE WHEN [Short description 2] IS NOT NULL AND TRIM([Short description 2]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_short_description_2,
    SUM(CASE WHEN [Long description] IS NOT NULL AND TRIM([Long description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_long_description,
    SUM(CASE WHEN [Technical details] IS NOT NULL AND TRIM([Technical details]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_technical_details,
    SUM(CASE WHEN EAN IS NOT NULL AND TRIM(EAN) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_ean,
    SUM(CASE WHEN [Picture normal reduced] IS NOT NULL AND TRIM([Picture normal reduced]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) AS cnt_good_picture_url,
    SUM(CASE WHEN [Weight kg] IS NOT NULL AND [Weight kg] > 0 THEN 1 ELSE 0 END) AS cnt_good_weight,
    SUM(CASE WHEN [Length m] IS NOT NULL AND [Width m] IS NOT NULL AND [Depth m] IS NOT NULL
                  AND [Length m] > 0 AND [Width m] > 0 AND [Depth m] > 0 THEN 1 ELSE 0 END) AS cnt_good_dimensions,

    (
        SUM(CASE WHEN [Short description] IS NOT NULL AND TRIM([Short description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END)  +
        SUM(CASE WHEN [Short description 2] IS NOT NULL AND TRIM([Short description 2]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Long description] IS NOT NULL AND TRIM([Long description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Technical details] IS NOT NULL AND TRIM([Technical details]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        SUM(CASE WHEN EAN IS NOT NULL AND TRIM(EAN) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Picture normal reduced] IS NOT NULL AND TRIM([Picture normal reduced]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Weight kg] IS NOT NULL AND [Weight kg] > 0 THEN 1 ELSE 0 END) +
        SUM(CASE WHEN [Length m] IS NOT NULL AND [Width m] IS NOT NULL AND [Depth m] IS NOT NULL
                      AND [Length m] > 0 AND [Width m] > 0 AND [Depth m] > 0 THEN 1 ELSE 0 END)
    ) AS total

FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY Manufacturername;

"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,cnt_good_short_description,cnt_good_short_description_2,cnt_good_long_description,cnt_good_technical_details,cnt_good_ean,cnt_good_picture_url,cnt_good_weight,cnt_good_dimensions,total
0,BOSCH,136,132,112,136,114,140,140,136,1046
1,FEIN,100,0,100,100,100,100,96,97,693
2,FISCHER,5,5,5,5,5,5,5,5,40
3,GUSTAV KLAUKE GMBH,40,0,40,40,40,36,40,0,236
4,ROTHENBERGER,23,0,23,23,23,23,23,23,161


In [11]:
query = """
SELECT
    Manufacturername,

    -- % of good Short description
    ROUND(SUM(CASE WHEN [Short description] IS NOT NULL AND TRIM([Short description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_short_description,

    -- % of good Short description 2
    ROUND(SUM(CASE WHEN [Short description 2] IS NOT NULL AND TRIM([Short description 2]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_short_description_2,

    -- % of good Long description
    ROUND(SUM(CASE WHEN [Long description] IS NOT NULL AND TRIM([Long description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_long_description,

    -- % of good Technical details
    ROUND(SUM(CASE WHEN [Technical details] IS NOT NULL AND TRIM([Technical details]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_technical_details,

    -- % of good EAN
    ROUND(SUM(CASE WHEN EAN IS NOT NULL AND TRIM(EAN) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_ean,

    -- % with valid image URL
    ROUND(SUM(CASE WHEN [Picture normal reduced] IS NOT NULL AND TRIM([Picture normal reduced]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_picture_url,

    -- % of good weight
    ROUND(SUM(CASE WHEN [Weight kg] IS NOT NULL AND [Weight kg] > 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_weight,

    -- % with full non-zero dimensions
    ROUND(SUM(CASE WHEN [Length m] IS NOT NULL AND [Width m] IS NOT NULL AND [Depth m] IS NOT NULL
                  AND [Length m] > 0 AND [Width m] > 0 AND [Depth m] > 0 THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_good_dimensions

FROM productCatalogDf
GROUP BY  Manufacturername
ORDER BY Manufacturername;

"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,pct_good_short_description,pct_good_short_description_2,pct_good_long_description,pct_good_technical_details,pct_good_ean,pct_good_picture_url,pct_good_weight,pct_good_dimensions
0,BOSCH,97.1,94.3,80.0,97.1,81.4,100.0,100.0,97.1
1,FEIN,100.0,0.0,100.0,100.0,100.0,100.0,96.0,97.0
2,FISCHER,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,GUSTAV KLAUKE GMBH,100.0,0.0,100.0,100.0,100.0,90.0,100.0,0.0
4,ROTHENBERGER,100.0,0.0,100.0,100.0,100.0,100.0,100.0,100.0


Which manufacturers have the most missing or low-quality data per column?
This version gives a row per manufacturer and column-wise missing value counts and percentages.

In [14]:
query = """
SELECT
    Manufacturername,
    COUNT(*) AS total_products,

    -- Missing Short Description
    SUM(CASE WHEN [Short description] IS NULL OR TRIM([Short description]) IN ('', 'N/A') THEN 1 ELSE 0 END) AS missing_short_description,
    ROUND(SUM(CASE WHEN [Short description] IS NULL OR TRIM([Short description]) IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_missing_short_description,

    -- Missing Long Description
    SUM(CASE WHEN [Long description] IS NULL OR TRIM([Long description]) IN ('', 'N/A') THEN 1 ELSE 0 END) AS missing_long_description,
    ROUND(SUM(CASE WHEN [Long description] IS NULL OR TRIM([Long description]) IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_missing_long_description,

    -- Missing Short Description 2
    SUM(CASE WHEN [Short description 2] IS NULL OR TRIM([Short description 2]) IN ('', 'N/A') THEN 1 ELSE 0 END) AS missing_short_description_2,
    ROUND(SUM(CASE WHEN [Short description 2] IS NULL OR TRIM([Short description 2]) IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_missing_short_description_2,

    -- Missing EAN
    SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) IN ('', 'N/A') THEN 1 ELSE 0 END) AS missing_ean,
    ROUND(SUM(CASE WHEN EAN IS NULL OR TRIM(EAN) IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_missing_ean,

    -- Missing Technical details
    SUM(CASE WHEN [Technical details] IS NULL OR TRIM([Technical details]) IN ('', 'N/A') THEN 1 ELSE 0 END) AS missing_technical_details,
    ROUND(SUM(CASE WHEN [Technical details] IS NULL OR TRIM([Technical details]) IN ('', 'N/A') THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS pct_missing_technical_details

FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY pct_missing_short_description DESC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,total_products,missing_short_description,pct_missing_short_description,missing_long_description,pct_missing_long_description,missing_short_description_2,pct_missing_short_description_2,missing_ean,pct_missing_ean,missing_technical_details,pct_missing_technical_details
0,BOSCH,140,4,2.9,28,20.0,8,5.7,26,18.6,4,2.9
1,ROTHENBERGER,23,0,0.0,0,0.0,23,100.0,0,0.0,0,0.0
2,GUSTAV KLAUKE GMBH,40,0,0.0,0,0.0,40,100.0,0,0.0,0,0.0
3,FISCHER,5,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,FEIN,100,0,0.0,0,0.0,100,100.0,0,0.0,0,0.0


Completeness of Data

In [16]:
query = """
SELECT
   Manufacturername,
  (
    -- Count of filled fields
    (
      CASE WHEN EAN IS NOT NULL AND EAN <> '' THEN 1 ELSE 0 END +
      CASE WHEN [Technical details] IS NOT NULL AND [Technical details] <> '' THEN 1 ELSE 0 END +
      CASE WHEN [Depth m] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [Width m] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [Length m] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [Weight kg] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [Delivery time days] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [Type of product] IS NOT NULL AND [Type of product] <> '' THEN 1 ELSE 0 END +
      CASE WHEN [Price quantity] IS NOT NULL THEN 1 ELSE 0 END +
      CASE WHEN [ETIM Features] IS NOT NULL AND [ETIM Features] <> '' THEN 1 ELSE 0 END +
      CASE WHEN [ETIM] IS NOT NULL AND [ETIM] <> '' THEN 1 ELSE 0 END +
      CASE WHEN [Language] IS NOT NULL AND [Language] <> '' THEN 1 ELSE 0 END
    ) * 1.0 / 23  -- Divide by total fields, ensure float division
  ) AS CompletenessScore
  
FROM productCatalogDf
GROUP BY Manufacturername

"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Manufacturername,CompletenessScore
0,BOSCH,0.347826
1,FEIN,0.391304
2,FISCHER,0.391304
3,GUSTAV KLAUKE GMBH,0.304348
4,ROTHENBERGER,0.391304


In [18]:
query = """
SELECT
  AVG(CompletenessScore) AS AvgCompletenessScore
FROM (
  SELECT
    (
      (CASE WHEN EAN IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Technical details] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Depth m] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Width m] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Length m] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Weight kg] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Delivery time days] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Type of product] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Price quantity] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [ETIM Features] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [ETIM] IS NOT NULL THEN 1 ELSE 0 END) +
      (CASE WHEN [Language] IS NOT NULL THEN 1 ELSE 0 END) 
    ) * 1.0 / 16 AS CompletenessScore
  FROM productCatalogDf
) AS sub;

"""

pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,AvgCompletenessScore
0,0.533888


Assign a simple "data quality score" per product (out of 100)

In [19]:
query = """
SELECT
    Articlenumber,
    Manufacturername,

    -- Count of valid fields
    (
        (CASE WHEN [Short description] IS NOT NULL AND TRIM([Short description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        (CASE WHEN [Short description 2] IS NOT NULL AND TRIM([Short description 2]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        (CASE WHEN [Long description] IS NOT NULL AND TRIM([Long description]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        (CASE WHEN [Technical details] IS NOT NULL AND TRIM([Technical details]) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        (CASE WHEN EAN IS NOT NULL AND TRIM(EAN) NOT IN ('', 'N/A') THEN 1 ELSE 0 END) +
        (CASE WHEN [Picture normal reduced] IS NOT NULL AND TRIM([Picture normal reduced]) <> '' THEN 1 ELSE 0 END) +
        (CASE WHEN [Weight kg] IS NOT NULL AND [Weight kg] > 0 THEN 1 ELSE 0 END) +
        (CASE WHEN [Length m] IS NOT NULL AND [Length m] > 0 THEN 1 ELSE 0 END) +
        (CASE WHEN [Width m] IS NOT NULL AND [Width m] > 0 THEN 1 ELSE 0 END) +
        (CASE WHEN [Depth m] IS NOT NULL AND [Depth m] > 0 THEN 1 ELSE 0 END) +
        (CASE WHEN [ETIM] IS NOT NULL THEN 1 ELSE 0 END) +
        (CASE WHEN [ETIM Features] IS NOT NULL THEN 1 ELSE 0 END) +
        (CASE WHEN [Delivery time days] IS NOT NULL THEN 1 ELSE 0 END) +
        (CASE WHEN [Description_length] IS NOT NULL AND [Description_length] > 0 THEN 1 ELSE 0 END) +
        (CASE WHEN [Type of product] IS NOT NULL AND TRIM([Type of product]) <> '' THEN 1 ELSE 0 END) +
        (CASE WHEN [Price quantity] IS NOT NULL THEN 1 ELSE 0 END) +
        (CASE WHEN [Volume_m3] IS NOT NULL AND [Volume_m3] > 0 THEN 1 ELSE 0 END)
    ) * (100.0 / 19) AS data_quality_score

FROM productCatalogDf
ORDER BY data_quality_score DESC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

Unnamed: 0,Articlenumber,Manufacturername,data_quality_score
0,06016C0000,BOSCH,73.684211
1,06012B4001,BOSCH,73.684211
2,06014A6200,BOSCH,73.684211
3,06014A6000,BOSCH,73.684211
4,06019H6L01,BOSCH,73.684211
...,...,...,...
303,LBOXXEKM60IDCFB,GUSTAV KLAUKE GMBH,52.631579
304,LS100FLEXCFB,GUSTAV KLAUKE GMBH,52.631579
305,71293869000,FEIN,47.368421
306,71293870000,FEIN,47.368421


Descriptive Field Interdependency

In [None]:
query = """
SELECT
  CASE
    WHEN Short_description_quality = 'bad'
         AND Short_description_2_quality = 'bad'
         AND Has_long_description = 0 THEN 'Missing All Descriptions'

    WHEN Short_description_quality = 'good'
         AND Short_description_2_quality = 'good'
         AND Has_long_description = 1 THEN 'Complete Descriptions'

    WHEN Short_description_quality = 'bad'
         AND Short_description_2_quality = 'bad'
         AND Has_long_description = 1 THEN 'Only Long Description Present'

    WHEN Short_description_quality = 'bad'
         AND Short_description_2_quality = 'good'
         AND Has_long_description = 0 THEN 'Only Short Description 2 Present'

    WHEN Short_description_quality = 'good'
         AND Short_description_2_quality = 'bad'
         AND Has_long_description = 0 THEN 'Only Short Description 1 Present'

    WHEN Short_description_quality = 'good'
         AND Short_description_2_quality = 'bad'
         AND Has_long_description = 1 THEN 'Short Description 1 + Long Present'

    WHEN Short_description_quality = 'bad'
         AND Short_description_2_quality = 'good'
         AND Has_long_description = 1 THEN 'Short Description 2 + Long Present'

    ELSE 'Other / Mixed'
  END AS description_completeness_combo,
  COUNT(*) AS product_count
FROM productCatalogDf
GROUP BY description_completeness_combo
ORDER BY product_count DESC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

In [None]:
query = """
SELECT
    Manufacturername,
    COUNT(*) AS total_products,
    ROUND(AVG(CASE WHEN Short_description_quality = 'good' THEN 1.0 ELSE 0 END) * 100, 2) AS pct_good_short_desc,
    ROUND(AVG(CASE WHEN Short_description_2_quality = 'good' THEN 1.0 ELSE 0 END) * 100, 2) AS pct_good_short_desc_2,
    ROUND(AVG(Has_image) * 100, 2) AS pct_has_image,
    ROUND(AVG(Has_long_description) * 100, 2) AS pct_has_long_desc
FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY pct_good_short_desc + pct_good_short_desc_2 + pct_has_image + pct_has_long_desc ASC;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result

In [None]:
query = """
SELECT
    Manufacturername,
    ROUND(AVG(CASE WHEN Short_description_quality = 'good' THEN 1.0 ELSE 0 END) * 100, 2) AS pct_good_short_desc,
    ROUND(AVG(CASE WHEN Short_description_2_quality = 'good' THEN 1.0 ELSE 0 END) * 100, 2) AS pct_good_short_desc_2,
    ROUND(AVG(Has_image) * 100, 2) AS pct_has_image,
    ROUND(AVG(Has_long_description) * 100, 2) AS pct_has_long_desc
FROM productCatalogDf
GROUP BY Manufacturername
ORDER BY Manufacturername;
"""
pandasql_instance = PandaSQL()
result = pandasql_instance(query, locals())
result