# Anonymize data

Steps for anonymization:
- replace the values of "brand", "material_number" and "b_code" with dummy values
- rename the columns

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import nbconvert
from IPython.display import display, HTML, display_html

# import custom functions
# Extends the module search path so that the project-local `utils` module can be imported.
# NOTE(review): this is an absolute, user-specific path, so the import only resolves on a
# machine with exactly this directory layout — consider a path relative to the repository.
import sys
sys.path.append('/Users/dat/Library/CloudStorage/OneDrive-foryouandyourcustomers/GitHub/AutomatedPackagingCategories_Showcase/ml_packaging_classification/src')
import utils

# set formatting
# Display up to 100 columns and 100 rows before pandas truncates a rendered DataFrame.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# ignore warnings
# NOTE(review): this suppresses every warning for the rest of the kernel session,
# including deprecation warnings from pandas/numpy.
import warnings
warnings.filterwarnings('ignore')


# Seed constant — presumably intended for reproducible random number generation.
SEED = 42

# Clean "BCodes and Packaging Categories.xlsx"

In [2]:
# Load the raw (not yet anonymized) workbook into a DataFrame.
# The path is relative to the notebook's working directory.
df_item_bom_bcode = pd.read_excel('../../data/BCodes and Packaging Categories.xlsx')

In [3]:
# Column overview of the raw data: names, non-null counts and dtypes.
df_item_bom_bcode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90035 entries, 0 to 90034
Data columns (total 15 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   2_Product Area                                          90035 non-null  object 
 1   5_Core Segment                                          90035 non-null  object 
 2   6_Brand                                                 90035 non-null  object 
 3   1_Material Number                                       90035 non-null  object 
 4   3_Material No Text                                      90035 non-null  object 
 5   component                                               90035 non-null  object 
 6   Material Description                                    90035 non-null  object 
 7   Bcode/TermCode                                          90035 non-null  int64  
 8   Characteristic Value                

In [4]:
# Summary statistics of the numeric columns.
df_item_bom_bcode.describe()

Unnamed: 0,Bcode/TermCode,Material Weight,Field31,Field14
count,90035.0,82545.0,90035.0,85671.0
mean,203562.886455,458.1777,2.67408,49411.66
std,52940.76356,29951.14,15.922722,201663.7
min,106437.0,0.0,0.0,-30145.49
25%,153664.0,0.81,0.0,1722.01
50%,203270.0,5.0,0.0,7460.691
75%,204102.0,25.7,0.0,29978.32
max,355151.0,2300000.0,328.0,10758130.0


In [5]:
# Preview the first rows of the raw data (still contains the original brand names and IDs).
df_item_bom_bcode.head()

Unnamed: 0,2_Product Area,5_Core Segment,6_Brand,1_Material Number,3_Material No Text,component,Material Description,Bcode/TermCode,Characteristic Value,Material Weight,Field31,Weight measure,12_Packaging Category Manual (Manual and Bubble chart),8_ManLoc,Field14
0,PA5,Metal Grinding,Bosch,06159975BT,Counter Display,6035765C21,Corrugated carton,153664,CORRUGATED,85.0,0.0,G,D1 - Countertop display,Distribution Center,38104.966
1,PA5,Metal Grinding,Bosch,06159975BT,Counter Display,6035940565,Label SB,204102,WOOD FREE,0.54,0.0,G,D1 - Countertop display,Distribution Center,38104.966
2,PA5,Metal Grinding,Bosch,06159975BT,Counter Display,6035822768,Tight -Pack label RB - 1ER,303917,MCB/GT2,22.9,0.0,G,D1 - Countertop display,Distribution Center,38104.966
3,PA5,Metal Grinding,Bosch,06159975BT,Counter Display,6035822768,Tight -Pack label RB - 1ER,303917,MCB/GT2,22.9,0.0,G,D1 - Countertop display,Distribution Center,38104.966
4,PA5,Metal Grinding,Bosch,06159975BT,Counter Display,6035765P54,Corrugated carton,153664,CORRUGATED,85.0,0.0,G,D1 - Countertop display,Distribution Center,38104.966


In [6]:
# anonymize "6_Brand"

# Define the mapping for categories (original brand name -> anonymized three-letter code)
category_mapping = {
    'Bosch': 'BOT',
    'Vermont': 'VMT',
    'Private Label': 'PRL',
    'Private Brand': 'PRB',
    'Skil': 'SKL',
    'Neutral': 'NTA',
    'Diablo': 'DIA',
    'SIA': 'SIA',
    'Avanti': 'AVT',
    'Hawera': 'WER',
    'OEM': 'OEM',
    'FREUD': 'FED',
    '-': '-'
}

# Fail loudly on brands that have no anonymized code. `Series.map(dict)` would silently
# turn them into NaN, and leaving them untouched would leak the real brand name.
# Already-anonymized codes are accepted so that re-running this cell is harmless.
brand_values = df_item_bom_bcode['6_Brand']
accepted_values = set(category_mapping) | set(category_mapping.values())
is_unmapped = brand_values.notna() & ~brand_values.isin(accepted_values)
if is_unmapped.any():
    unmapped_brands = sorted(brand_values[is_unmapped].astype(str).unique())
    raise ValueError(f"Brands without an anonymized code in category_mapping: {unmapped_brands}")

# Replace categories with predefined values.
# `replace` leaves values that are not mapping keys (i.e. already-anonymized codes)
# unchanged, which keeps the cell idempotent.
df_item_bom_bcode['6_Brand'] = brand_values.replace(category_mapping)

In [7]:
class IDAnonymizer:
    '''Replace original IDs with random numeric pseudonyms.

    The ``id_mapping`` dictionary records which pseudonym was assigned to which
    original ID, so the same original ID always maps to the same pseudonym.
    Pseudonyms are also guaranteed to be unique per anonymizer instance: two
    different original IDs never share a pseudonym, so distinct entities are
    not merged by the anonymization.
    '''
    def __init__(self):
        # original ID -> pseudonym
        self.id_mapping = {}
        # Pseudonyms already handed out; used to reject duplicate random draws.
        self._used_ids = set()
        # (prefix, num_digits) -> number of pseudonyms issued from that ID space;
        # used to detect an exhausted space instead of looping forever.
        self._issued_per_space = {}

    def get_random_id(self, original_id, num_digits=4, prefix=''):
        '''Return the pseudonym for ``original_id``, creating one on first use.

        Parameters
        ----------
        original_id : hashable
            The ID to anonymize.
        num_digits : int, default 4
            Number of digits of the random part (no leading zero).
        prefix : str, default ''
            Text prepended to the random number.

        Returns
        -------
        str
            ``prefix`` followed by a random ``num_digits``-digit number. Repeated
            calls with the same ``original_id`` return the first assigned value,
            regardless of the ``num_digits``/``prefix`` passed later.

        Raises
        ------
        ValueError
            If every number with ``num_digits`` digits has already been issued
            for this ``prefix``.
        '''
        if original_id not in self.id_mapping:
            # Ensure that the random number has the specified number of digits
            lower_bound = 10 ** (num_digits - 1)
            upper_bound = 10 ** num_digits - 1

            id_space = (prefix, num_digits)
            issued_count = self._issued_per_space.get(id_space, 0)
            if issued_count >= upper_bound - lower_bound + 1:
                raise ValueError(
                    f"All {num_digits}-digit IDs with prefix {prefix!r} are already in use; "
                    "increase num_digits."
                )

            # Redraw until the pseudonym is unused. When no collision occurs this
            # consumes exactly one random draw, like the previous implementation.
            while True:
                random_number = np.random.randint(lower_bound, upper_bound + 1)
                # Concatenate prefix with random number
                candidate = f"{prefix}{random_number}"
                if candidate not in self._used_ids:
                    break

            self._used_ids.add(candidate)
            self._issued_per_space[id_space] = issued_count + 1
            self.id_mapping[original_id] = candidate
        return self.id_mapping[original_id]

# Seed NumPy's global generator (used by IDAnonymizer via np.random.randint) so that
# re-running the notebook from a fresh kernel reproduces the same pseudonyms.
np.random.seed(SEED)

# anonymize "1_Material Number"
# Replace IDs with random 8-digit values
id_anonymizer = IDAnonymizer()
df_item_bom_bcode['1_Material Number'] = df_item_bom_bcode['1_Material Number'].apply(lambda x: id_anonymizer.get_random_id(x, num_digits=8))

# anonymize "Bcode/TermCode"
# Replace IDs with random 6-digit values prefixed with 'PCode_'.
# A fresh anonymizer is used so the mapping is independent of the material numbers.
id_anonymizer = IDAnonymizer()
df_item_bom_bcode['Bcode/TermCode'] = df_item_bom_bcode['Bcode/TermCode'].apply(lambda x: id_anonymizer.get_random_id(x, num_digits=6, prefix='PCode_'))

In [8]:
# Raw export header -> column name used in the showcase data set.
# Columns that keep their name are listed as well, so the mapping covers every column.
showcase_column_names = dict([
    ('2_Product Area', 'Product Area'),
    ('5_Core Segment', 'Core Segment'),
    ('6_Brand', 'Brand'),
    ('1_Material Number', 'Material Number'),
    ('3_Material No Text', 'Material No Text'),
    ('component', 'Component'),
    ('Material Description', 'Material Description'),
    ('Bcode/TermCode', 'Packaging Code'),
    ('Characteristic Value', 'Material Characteristic'),
    ('Material Weight', 'Material Weight'),
    ('Field31', 'Column 21'),
    ('Weight measure', 'Weight measure'),
    ('12_Packaging Category Manual (Manual and Bubble chart)', 'Packaging Category'),
    ('8_ManLoc', 'Manufactoring Location'),
    ('Field14', 'Column 43'),
])

# rename() returns a new frame; headers missing from the frame are ignored.
df_item_bom_bcode = df_item_bom_bcode.rename(columns=showcase_column_names)

In [9]:
# Preview the first rows after anonymization and column renaming.
df_item_bom_bcode.head()

Unnamed: 0,Product Area,Core Segment,Brand,Material Number,Material No Text,Component,Material Description,Packaging Code,Material Characteristic,Material Weight,Column 21,Weight measure,Packaging Category,Manufactoring Location,Column 43
0,PA5,Metal Grinding,BOT,45733343,Counter Display,6035765C21,Corrugated carton,PCode_664273,CORRUGATED,85.0,0.0,G,D1 - Countertop display,Distribution Center,38104.966
1,PA5,Metal Grinding,BOT,45733343,Counter Display,6035940565,Label SB,PCode_560426,WOOD FREE,0.54,0.0,G,D1 - Countertop display,Distribution Center,38104.966
2,PA5,Metal Grinding,BOT,45733343,Counter Display,6035822768,Tight -Pack label RB - 1ER,PCode_481002,MCB/GT2,22.9,0.0,G,D1 - Countertop display,Distribution Center,38104.966
3,PA5,Metal Grinding,BOT,45733343,Counter Display,6035822768,Tight -Pack label RB - 1ER,PCode_481002,MCB/GT2,22.9,0.0,G,D1 - Countertop display,Distribution Center,38104.966
4,PA5,Metal Grinding,BOT,45733343,Counter Display,6035765P54,Corrugated carton,PCode_664273,CORRUGATED,85.0,0.0,G,D1 - Countertop display,Distribution Center,38104.966


In [10]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs('../../data/output', exist_ok=True)
# Export the anonymized data. Note: the file is tab-separated despite its .csv extension,
# so it must be read back with sep='\t'.
df_item_bom_bcode.to_csv('../../data/output/data_showcase.csv', sep='\t', index=False, header=True)