# AI Tuning: Sanitization Template

### Imports

#### Install Presidio

In [None]:
!pip install -U presidio_anonymizer
!pip install -U presidio_analyzer
!pip install -U spacy
!python -m spacy download en_core_web_lg

#### Imports proper

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer


from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer.entities.engine import OperatorConfig

# Raw Data Manipulation

In [None]:
# change variables per customer
FILENAME = "kroger-post-prod-utterance.csv"
IVA = 'kroger'


def _to_month_day_year(timestamp):
    """Reformat a 'YYYY-MM-DD[ time]' string as 'MM.DD.YYYY'.

    Only the text before the first whitespace is used. It is split on '-'
    and the first three pieces are re-joined with '.', month first.
    """
    parts = timestamp.split()[0].split('-')
    return '.'.join([parts[1], parts[2], parts[0]])


# import data into dataframe
data = pd.read_csv(FILENAME)

# find date range
# NOTE(review): min()/max() compare the raw column values; this assumes
# 'timestamp' holds 'YYYY-MM-DD ...' strings, which sort chronologically —
# TODO confirm against the export format
min_date, max_date = data['timestamp'].min(), data['timestamp'].max()
print("Min date: ", min_date)
print("Max date: ", max_date)

# date range used in the export filenames: "MM.DD.YYYY-MM.DD.YYYY"
DATE = _to_month_day_year(min_date) + '-' + _to_month_day_year(max_date)

print(DATE)


# view shape
print("Shape:", data.shape)

# save shape (row count of the untouched input, reported in the metrics cell)
original_data_rows = data.shape[0]

# view df
data.head()

## Standardization

In [None]:
# columns of interest, mapped from the names in the export to the names used
# in the rest of the notebook
# alternative schema:
#   {'timestamp': 'Timestamp', 'Correct_Intent': 'Intent', 'Utterance_Trascription': 'Raw Utterance'}
column_names = {
    'timestamp': 'Timestamp',
    'Intent': 'Intent',
    'Utterance': 'Raw Utterance',
    'IntentConfidence': 'Confidence',
}

# keep only those columns (in the order listed above) and rename them
data = data[list(column_names)].rename(columns=column_names)

# view shape
print("Shape:", data.shape)

# view dataframe
data.head()

## Timestamp Manipulation

In [None]:
# split each timestamp (as text) at the first space; column 0 of the result
# holds everything before that space
timestamp_parts = data["Timestamp"].astype(str).str.split(" ", n=1, expand=True)

# keep that leading part as the new "Date" column
data["Date"] = timestamp_parts[0]

# the original timestamp column is no longer needed
del data["Timestamp"]

# view shape
print("Shape:", data.shape)

# view dataframe
data.head()

## NAs Dropping

In [None]:
# row count before
before = data.shape[0]

# pandas doesn't recognise empty strings as null, so convert them to NaN first.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Raw Utterance']: the in-place form acts
# on a temporary Series, which raises a chained-assignment warning in recent
# pandas and leaves `data` unchanged when Copy-on-Write is enabled.
data['Raw Utterance'] = data['Raw Utterance'].replace('', np.nan)

# drop NA's (rows whose utterance is missing or was an empty string)
data.dropna(subset=['Raw Utterance'], inplace=True)

# row count after
after = data.shape[0]

# difference
diff = before - after

# print results
print("# of rows before dropping NA: ", before)
print("# of NA values: ", diff)
print("# of rows after dropping NA: ", after)

# save shape (reported in the metrics cell)
non_empty_data_rows = after

# view shape
print("Shape:", data.shape)

# view dataframe
data.head()

## Text PreProcessing

In [None]:
# lowercase the text
# (every value is converted to a string first, then lowercased, and stored in
# a new 'Lowercase' column; 'Raw Utterance' itself is left as it was)
data['Lowercase'] = data['Raw Utterance'].astype(str).str.lower()

# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so columns added through usable_data also appear on data
usable_data = data

# view shape
print("Shape:", usable_data.shape)

# view dataframe
usable_data.head()

## Text Sanitization

In [None]:
# set up the NER tagging tool
# (constructed with its default settings; presumably it picks up the spaCy
# model downloaded in the install cell — TODO confirm)
analyzer = AnalyzerEngine()

# set up anonymization tool
# (both engines are created once here and read as globals by
# analyze_and_anonymize_text on every call)
anonymizer = AnonymizerEngine()

# define function to apply to dataframe for anonymization
def analyze_and_anonymize_text(text):
    """Return *text* with detected entities replaced by fixed placeholders.

    The module-level `analyzer` tags dates/times, locations, persons, phone
    numbers, credit cards and e-mail addresses in English text; the
    module-level `anonymizer` then swaps each tagged span for the
    placeholder listed below.
    """
    # placeholder written in place of each entity label
    placeholders = {
        "PERSON": "<PERSON_NAME>",
        "LOCATION": "<LOCATION_NAME>",
        "DATE_TIME": "<DATE_TIME>",
        "CREDIT_CARD": "<CREDIT_CARD>",
        "PHONE_NUMBER": "<PHONE_NUMBER>",
        "EMAIL_ADDRESS": "<EMAIL_ADDRESS>",
    }

    # one "replace" operator per entity label
    operators = {
        label: OperatorConfig(operator_name="replace", params={"new_value": tag})
        for label, tag in placeholders.items()
    }

    # tag the entities of interest
    findings = analyzer.analyze(
        text=text,
        entities=['DATE_TIME', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'CREDIT_CARD', 'EMAIL_ADDRESS'],
        language='en',
    )

    # apply the replacements and hand back the resulting string
    return anonymizer.anonymize(text=text, analyzer_results=findings, operators=operators).text

# anonymize every lowercased utterance and store the result in 'Clean'
# NOTE(review): the analyzer is given the lowercased text; if entity
# detection depends on capitalization, names and places may be missed —
# confirm this ordering is intended
usable_data['Clean'] = usable_data['Lowercase'].apply(analyze_and_anonymize_text)

# view shape
print("Shape:", usable_data.shape)

# view dataframe
usable_data.head()

## Digits Tagging

In [None]:
# matches either the literal "401k"/"401K" or any run of digits; "401k" is
# listed first so that it is recognised as a whole before "\d+" can take "401"
_DIGITS_OR_401K = re.compile(r'401[kK]|\d+')


def tag_digits(text):
    """Replace every run of digits in *text* with '<DIGITS>', except in "401k".

    "401k" and "401K" are left exactly as written. Every other run of digits
    is replaced, including those in a text that also mentions "401k", so a
    sentence such as "401k account 123456" no longer keeps its account
    number just because it names the product.

    Args:
        text: the string to sanitize.

    Returns:
        The string with digit runs replaced by '<DIGITS>'.
    """
    def _replace(match):
        token = match.group(0)
        # only the "401k"/"401K" alternative can end in a letter
        return token if token[-1] in 'kK' else '<DIGITS>'

    return _DIGITS_OR_401K.sub(_replace, text)

# run tag_digits over every anonymized utterance, overwriting 'Clean'
usable_data['Clean'] = usable_data['Clean'].map(tag_digits)

# view shape
print("Shape:", usable_data.shape)

# view dataframe
usable_data.head()

## Redacted Rows Dropping

In [None]:
# row count before
before = usable_data.shape[0]

# An utterance counts as "redacted only" when nothing is left of it but
# <...> placeholders, with at most punctuation/whitespace between them.
# The pattern must cover the whole string, so "<PERSON_NAME> wants a refund
# <DIGITS>" is kept: it starts and ends with a placeholder but still has
# real words in between.
redacted_only = r'[^\w<>]*(?:<[^<>]+>[^\w<>]*)+'

# only the sanitized utterance column is inspected
is_redacted = usable_data['Clean'].astype(str).str.fullmatch(redacted_only)

# drop redacted utterances
usable_data = usable_data[~is_redacted]

# row count after
after = usable_data.shape[0]

# difference
diff = before - after

# print results
print("# of rows before dropping redacted rows: ", before)
print("# of rows containing only redacted info: ", diff)
print("# of rows after dropping redacted rows: ", after)

# view shape
print("Shape:", usable_data.shape)

# view dataframe
usable_data.head()

## Undefined Detection

In [None]:
# continue on an independent copy of the cleaned rows
data_for_analysis = usable_data.copy()

# rows without an intent get the label 'Undefined'
data_for_analysis['Intent'] = data_for_analysis['Intent'].fillna('Undefined')

# view shape
print("Shape:", data_for_analysis.shape)

# view dataframe
data_for_analysis.head()

In [None]:
# @title Check Before Export

# Deliberate stop: this cell always fails, so running the notebook top to
# bottom halts here and the sanitized data can be checked before any export
# cell writes a file. Run the export cells by hand afterwards.
raise RuntimeError("Check the sanitized data above before running the export cells.")

# Exports

## Export File to Sample for HF
### This file contains stopwords and is not lemmatized

In [None]:
# output name: <IVA>_HF_sanitised_data_to_sample_<DATE>.csv
filename = ''.join([IVA, "_HF_sanitised_data_to_sample_", DATE, ".csv"])

# sanitized utterances with their intents, ordered by intent
sanitized_data_hf = data_for_analysis[['Clean', 'Intent']].sort_values(by='Intent')

# write the file without the index column
sanitized_data_hf.to_csv(filename, index=False)

# view shape
print("Shape:", sanitized_data_hf.shape)

# save shape
clean_data_rows = sanitized_data_hf.shape[0]

# view dataframe
sanitized_data_hf.head()

## Export General Sanitized Data
### This file does not contain stopwords, and has undergone lemmatization

In [None]:
# NLTK's English stop words are the base list
stop_words = stopwords.words('english')

# project-specific additions: the placeholder names written by the
# sanitization cells, plus filler words and spelled-out numbers
# NOTE(review): the tokenizer cell splits on non-word characters, so
# "<DIGITS>th" arrives as "DIGITS" + "th"; the 'DIGITSth'/'DIGITSst'/'DIGITSrd'
# entries look unreachable — confirm before relying on them
custom_stop_words = [
    'PERSON_NAME', 'LOCATION_NAME', 'DATE_TIME', 'CREDIT_CARD', 'PHONE_NUMBER',
    'EMAIL_ADDRESS', 'DIGITS', 'DIGITSth', 'DIGITSst', 'DIGITSrd',
    'right', 'take', 'look', 'wait', 'let', 'see', 'dollars', 'zip', 'code',
    'alright', 'call', 'know', 'stay', 'line', 'give', 'moment', 'west',
    'thirty', 'street', 'going', 'really', 'appreciate', 'back', 'tone',
    'get', 'much', 'could', 'check', 'sure', 'left', 'hang', 'press',
    'please', 'leave', 'message', 'gon', 'na', 'can', 'not', 'gmail', 'com',
    'finished', 'recording', 'okay', 'yeah', 'may', 'yes', 'dot', 'email',
    'said', 'go', 'got', 'okai', 'like', 'number', 'record', 'message',
    'twenty', 'hundred', 'seventy', 'forty', 'sixteen', 'sir', 'miss',
    'voice', 'good', 'day', 'help', 'today', 'custom', 'care', 'thank',
    'calling', 'hello', 'hi', 'welcome', 'thank', 'bye', 'goodbye', 'name',
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'ten',
]

# append the custom words to the base list
stop_words.extend(custom_stop_words)

In [None]:
# tokenize sentences into runs of word characters
# (raw string, so the backslash in \w reaches the regex engine untouched
# instead of being treated as a Python string escape)
tokenizer = RegexpTokenizer(r'\w+')

# a set gives constant-time membership tests; the list would be scanned once
# per token
stop_word_set = set(stop_words)

lemmatizer = WordNetLemmatizer()


def _content_tokens(text):
    """Tokens of *text* that are not stop words and are longer than 2 chars."""
    return [
        token for token in tokenizer.tokenize(text)
        if token not in stop_word_set and len(token) > 2
    ]


def _lemmatize_tokens(tokens):
    """Lemmatize each token separately and join the results with spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# filtered tokens per utterance (kept as lists until both columns are built)
content_tokens = data_for_analysis['Clean'].apply(_content_tokens)

# 'Tokens' keeps its original form: the filtered tokens joined into a string
data_for_analysis['Tokens'] = content_tokens.apply(' '.join)

# lemmatize tokens one by one; passing the joined string to the lemmatizer
# would treat the whole utterance as a single word
data_for_analysis['Lemmatized'] = content_tokens.apply(_lemmatize_tokens)
data_for_analysis.head()

In [None]:
# assemble filenames
filename = (IVA + "_sanitized_data_for_EDA_" + DATE + ".csv")

# select columns of interest
sanitized_data = data_for_analysis[['Lemmatized', 'Intent', 'Date', 'Confidence']]
#sanitized_data = data_for_analysis[['Clean', 'Intent', 'Date', 'Lemmatized']]

# create a deep copy to avoid warnings
sanitized_data = sanitized_data.copy()

# drop empty rows created as a result of stop-word removal / lemmatization
# pandas doesn't recognise empty strings as null, so convert them to NaN first.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on sanitized_data['Lemmatized']: the in-place
# form acts on a temporary Series, which raises a chained-assignment warning
# in recent pandas and leaves the frame unchanged when Copy-on-Write is
# enabled.
sanitized_data['Lemmatized'] = sanitized_data['Lemmatized'].replace('', np.nan)

# drop NA's
sanitized_data.dropna(subset=['Lemmatized'], inplace=True)

# row count after (reported in the metrics cell)
after_lemmatization = sanitized_data.shape[0]
print("# of rows after dropping NA: ", after_lemmatization)

# rename columns
sanitized_data = sanitized_data.rename(columns = {'Lemmatized':'Utterance'})

# sort by intent
sanitized_data = sanitized_data.sort_values(by = 'Intent')

# save sanitized data
sanitized_data.to_csv(filename, index = False)

# view shape
print("Shape:", sanitized_data.shape)

# save shape
clean_data_rows = sanitized_data.shape[0]

# view dataframe
sanitized_data.head()

# Metrics

In [None]:
# summary of how many rows survived each stage, as (label, value) pairs
metrics = [
    ('Number of rows in original file', original_data_rows),
    ('Number of rows containing an utterance', non_empty_data_rows),
    ('Number of empty rows dropped', original_data_rows - non_empty_data_rows),
    ('Number of rows in pre-processed file', after_lemmatization),
    ('Number of rows dropped as a result of the cleaning process', non_empty_data_rows - after_lemmatization),
]

# one line per metric: "<label>: <value>."
for label, value in metrics:
    print(f'{label}: {value}.')