In [1]:
import pandas as pd

# Read excel containing the original data to be masked

original = pd.read_excel("consolidated2018to2020.xlsx")  

# Masking Staff ID to 'X', where X = index+1 of list containing unique original Staff IDs

In [2]:
original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59170 entries, 0 to 59169
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Staff No.    59170 non-null  int64         
 1   Staff        59170 non-null  object        
 2   Designation  59170 non-null  object        
 3   Branch       59170 non-null  object        
 4   Division     59170 non-null  object        
 5   Department   59170 non-null  object        
 6   Category     59170 non-null  object        
 7   Type         59170 non-null  object        
 8   Course Name  59170 non-null  object        
 9   Credit Days  59170 non-null  float64       
 10  End Date     59170 non-null  datetime64[ns]
 11  Picked       59170 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(9)
memory usage: 5.4+ MB


In [3]:
#Creating list of unique original staff IDs

unique_id = original['Staff No.'].unique().tolist()

len(unique_id) #check number of unique staff

996

In [4]:
# Note: original staff sample = 1000, but unique no of staff from 3 year consolidated training data = 996.
# This is due to 4 staff had never attended any training in past 3 years

In [6]:
def mask_id(x):
    return unique_id.index(x)+1

original['masked_id'] = original['Staff No.'].apply(mask_id)

# Masking Staff Name to 'nameX', where X = masked_id

In [8]:
def mask_name(x):
    return 'name'+str(x)

original['masked_name'] = original['masked_id'].apply(mask_name)

# Masking actual designations with general category of designation

In [11]:
# define a dictionary of actual designations and the corresponding general category = Mgr, Exc, or Ctc

job_category = {
    'BKNG SERVICES MGR' : 'Mgr',
    'CLERK/TYPIST/CASHIER' : 'Ctc',
    'BRANCH MANAGER' : 'Mgr',
    'SENIOR EXECUTIVE' : 'Exc',
    'EXECUTIVE' : 'Exc',
    'RELIEF SENIOR EXECUTIVE' : 'Exc',
    'BUSINESS MGR' : 'Mgr',
    'SR FINANCIAL EXECUTIVE (UT)' : 'Exc',
    'SENIOR EXECUTIVE (COMPLIANCE)' : 'Exc',
    'ASSISTANT MANAGER' : 'Exc',
    'SR S&M EXE' : 'Exc',
    'EXECUTIVE (CSU)' : 'Exc',
    'RELIEF ASSISTANT MANAGER' : 'Exc',
    'ASST SALES & MKTG MGR' : 'Exc',
    'DBM, BSM, DEPUTY BSM' : 'Mgr',
    'BM' : 'Mgr',
    'BIZM' : 'Mgr',
    'AM' : 'Exc'
}

In [12]:
def mask_designation(x):
    return job_category[x]

original['masked_designation'] = original['Designation'].apply(mask_designation)

# Masking actual base with BR = Branch or HO = Head Office

In [15]:
def branch_or_ho(x):
    if x == "HEAD OFFICE":
        return "HO"
    else:
        return "BR"
    
original['BR_or_HO'] = original['Branch'].apply(branch_or_ho)

# Converting all course names to course codes X, where X = index+1 of list containing unique course names

In [18]:
#Creating list of unique course names

unique_course = original['Course Name'].unique().tolist()

len(unique_course)

966

In [19]:
def assign_course_code(x):
    return unique_course.index(x)+1

original['course_code'] = original['Course Name'].apply(assign_course_code)

# Create new dataframe with masked info only

In [22]:
df_masked = original[['masked_id','masked_name','masked_designation','BR_or_HO','course_code','Category', 'Type', 'Credit Days', 'End Date' ]]

In [23]:
original = None
df_masked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59170 entries, 0 to 59169
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   masked_id           59170 non-null  int64         
 1   masked_name         59170 non-null  object        
 2   masked_designation  59170 non-null  object        
 3   BR_or_HO            59170 non-null  object        
 4   course_code         59170 non-null  int64         
 5   Category            59170 non-null  object        
 6   Type                59170 non-null  object        
 7   Credit Days         59170 non-null  float64       
 8   End Date            59170 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 4.1+ MB


In [29]:
df_masked['masked_designation'] = df_masked['masked_designation'].astype('category')
df_masked['BR_or_HO'] = df_masked['BR_or_HO'].astype('category')
df_masked['Category'] = df_masked['Category'].astype('category')
df_masked['Type'] = df_masked['Type'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['masked_designation'] = df_masked['masked_designation'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['BR_or_HO'] = df_masked['BR_or_HO'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_masked['Category'] = df_masked['Category'].astype('c

In [30]:
df_masked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59170 entries, 0 to 59169
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   masked_id           59170 non-null  int64         
 1   masked_name         59170 non-null  object        
 2   masked_designation  59170 non-null  category      
 3   BR_or_HO            59170 non-null  category      
 4   course_code         59170 non-null  int64         
 5   Category            59170 non-null  category      
 6   Type                59170 non-null  category      
 7   Credit Days         59170 non-null  float64       
 8   End Date            59170 non-null  datetime64[ns]
dtypes: category(4), datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 2.5+ MB


# Add Columns Year & Month as extracted from End Date

In [33]:
df_masked['Year'] = df_masked['End Date'].dt.year

In [34]:
df_masked['Month'] = df_masked['End Date'].dt.month

In [35]:
df_masked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59170 entries, 0 to 59169
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   masked_id           59170 non-null  int64         
 1   masked_name         59170 non-null  object        
 2   masked_designation  59170 non-null  category      
 3   BR_or_HO            59170 non-null  category      
 4   course_code         59170 non-null  int64         
 5   Category            59170 non-null  category      
 6   Type                59170 non-null  category      
 7   Credit Days         59170 non-null  float64       
 8   End Date            59170 non-null  datetime64[ns]
 9   Year                59170 non-null  int64         
 10  Month               59170 non-null  int64         
dtypes: category(4), datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 3.4+ MB


# Export masked dataframe to new CSV file

In [39]:
df_masked.to_csv("maskedconso.csv", index=False)

# The file maskedconso.csv will be used in Part 2 of the project