In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Upload Product Data From CSV

In [95]:
#Load Productions Data 
Productions= pd.read_csv ('PRODUCTS_TAKEHOME (1).csv')
Productions['BARCODE'] = Productions['BARCODE'].replace(['', ' '], np.nan)
Productions.head()

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
0,Health & Wellness,Sexual Health,Conductivity Gels & Lotions,,,,796494400000.0
1,Snacks,Puffed Snacks,Cheese Curls & Puffs,,,,23278010000.0
2,Health & Wellness,Hair Care,Hair Care Accessories,,PLACEHOLDER MANUFACTURER,ELECSOP,461817800000.0
3,Health & Wellness,Oral Care,Toothpaste,,COLGATE-PALMOLIVE,COLGATE,35000470000.0
4,Health & Wellness,Medicines & Treatments,Essential Oils,,MAPLE HOLISTICS AND HONEYDEW PRODUCTS INTERCHA...,MAPLE HOLISTICS,806810900000.0


### Data Exploration and Cleaning

In [124]:
Productions.shape

# Get overview
Productions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845552 entries, 0 to 845551
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   CATEGORY_1    845441 non-null  object 
 1   CATEGORY_2    844128 non-null  object 
 2   CATEGORY_3    784986 non-null  object 
 3   CATEGORY_4    67459 non-null   object 
 4   MANUFACTURER  619078 non-null  object 
 5   BRAND         619080 non-null  object 
 6   BARCODE       841527 non-null  float64
dtypes: float64(1), object(6)
memory usage: 45.2+ MB


**Calculate missing value**

In [129]:
# Check missing values
print(Productions.isnull().sum())

# Calculate percentage of missing values
total_rows = len(Productions)  
missing_count = Productions.isnull().sum() 
missing_percentage = round((missing_count / total_rows) * 100 ,2) 

# Output the result
print(result)

CATEGORY_1         111
CATEGORY_2        1424
CATEGORY_3       60566
CATEGORY_4      778093
MANUFACTURER    226474
BRAND           226472
BARCODE           4025
dtype: int64
CATEGORY_1       0.01
CATEGORY_2       0.17
CATEGORY_3       7.16
CATEGORY_4      92.02
MANUFACTURER    26.78
BRAND           26.78
BARCODE          0.48
dtype: float64%


In [106]:
Productions.nunique()

CATEGORY_1          27
CATEGORY_2         121
CATEGORY_3         344
CATEGORY_4         127
MANUFACTURER      4354
BRAND             8122
BARCODE         841342
dtype: int64

In [27]:
# Display all duplicate rows
duplicated_rows = Productions[Productions.duplicated()].sort_values(by = ['CATEGORY_1', 'CATEGORY_2','CATEGORY_3','CATEGORY_4','BARCODE'])
print(f"Number of duplicate rows: {len(duplicated_rows)}")
print(duplicated_rows)

Number of duplicate rows: 215
               CATEGORY_1              CATEGORY_2  \
359328            Alcohol                    Beer   
817261          Beverages  Carbonated Soft Drinks   
410161  Health & Wellness  Medicines & Treatments   
764322  Health & Wellness  Medicines & Treatments   
443703  Health & Wellness  Medicines & Treatments   
...                   ...                     ...   
474902             Snacks             Snack Cakes   
483929             Snacks             Snack Cakes   
747596             Snacks             Snack Cakes   
768066             Snacks             Snack Cakes   
553091             Snacks             Snack Cakes   

                                    CATEGORY_3      CATEGORY_4  \
359328                                   Lager  American Lager   
817261                                    Cola    Regular Cola   
410161  Allergy & Sinus Medicines & Treatments             NaN   
764322  Allergy & Sinus Medicines & Treatments             NaN   
443

### Data Quality & Issues

**1. Check for Categorical Data**

In [58]:
# Categories
print(Productions['CATEGORY_1'].unique())
print(Productions['CATEGORY_2'].unique())
#print(Productions['CATEGORY_3'].unique())
#print(Productions['CATEGORY_4'].unique())

['Health & Wellness' 'Snacks' 'Beverages' 'Pantry' 'Alcohol'
 'Apparel & Accessories' 'Restaurant' 'Needs Review' 'Dairy'
 'Home & Garden' nan 'Household Supplies' 'Meat & Seafood' 'Deli & Bakery'
 'Sporting Goods' 'Produce' 'Office & School' 'Frozen'
 'Arts & Entertainment' 'Animals & Pet Supplies' 'Electronics' 'Beauty'
 'Toys & Games' 'Mature' 'Vehicles & Parts' 'Baby & Toddler'
 'Luggage & Bags' 'Media']
['Sexual Health' 'Puffed Snacks' 'Hair Care' 'Oral Care'
 'Medicines & Treatments' 'Deodorant & Antiperspirant' 'Snack Bars' nan
 'Bath & Body' 'Nuts & Seeds' 'Candy' 'Cookies' 'Variety Snack Packs'
 'Hair Removal' 'Medical Supplies & Equipment' 'Chips' 'Snack Cakes'
 'Skin Care' 'Dessert Toppings' 'Eye Care' 'Fruit & Vegetable Snacks'
 'Snack Mixes' 'Crackers' 'Jerky & Dried Meat'
 'Topical Muscle & Joint Relief Treatments' 'Foot Care' 'First Aid'
 'Ear Care' 'Menstrual Care' 'Pretzels' 'Trail Mix' 'Dips & Salsa'
 'Adult Incontinence' 'Water' 'Cereal, Granola, & Toaster Pastries' 

In [47]:
# Manufacturer
Productions['MANUFACTURER'].unique()
#placeholder_manufacturer = Productions[Productions['MANUFACTURER'] == 'PLACEHOLDER MANUFACTURER']
#print(f"Rows with placeholder manufacturer:\n{placeholder_manufacturer}")

array([nan, 'PLACEHOLDER MANUFACTURER', 'COLGATE-PALMOLIVE', ...,
       'VIDETTE INC', 'SCRUB-IT', 'OUTDOOR PRODUCT INNOVATIONS, INC.'],
      dtype=object)

It is noted that "PLACEHOLDER MANUFACTURER" appears to be a placeholder value.   
Confirm that values starting with a number (e.g., "5.11") correspond to valid manufacturers, as "5.11" is associated with the brand "5.11 TACTICAL."

**2.Convert scientific notion to string for Barcode**

In [53]:
Productions['BARCODE'] = Productions['BARCODE'].astype(str)
Productions.head()

Unnamed: 0,CATEGORY_1,CATEGORY_2,CATEGORY_3,CATEGORY_4,MANUFACTURER,BRAND,BARCODE
0,Health & Wellness,Sexual Health,Conductivity Gels & Lotions,,,,796494407820.0
1,Snacks,Puffed Snacks,Cheese Curls & Puffs,,,,23278011028.0
2,Health & Wellness,Hair Care,Hair Care Accessories,,PLACEHOLDER MANUFACTURER,ELECSOP,461817824225.0
3,Health & Wellness,Oral Care,Toothpaste,,COLGATE-PALMOLIVE,COLGATE,35000466815.0
4,Health & Wellness,Medicines & Treatments,Essential Oils,,MAPLE HOLISTICS AND HONEYDEW PRODUCTS INTERCHA...,MAPLE HOLISTICS,806810850459.0
