In [1]:
# Add all imports related to data engineering
import json
import numpy as np
import pandas as pd
import string

In [2]:
# Import the training json file, then close the file
file = open("raw_data/train.json")
raw_data = json.load(file)
file.close()

In [4]:
pii_number_encoding = {
    'B-NAME_STUDENT': 1, 
    'I-NAME_STUDENT': 2, 
    'B-URL_PERSONAL': 3, 
    'B-EMAIL': 4, 
    'B-ID_NUM': 5, 
    'I-URL_PERSONAL': 6, 
    'B-USERNAME': 7, 
    'I-PHONE_NUM': 8,  
    'B-STREET_ADDRESS': 9, 
    'I-STREET_ADDRESS': 10,  
    'B-PHONE_NUM': 11, 
    'I-ID_NUM': 12,
    'O': 13
}

In [5]:
# Got the txt file from here: https://gist.github.com/deekayen/4148741

# Common words List to hold common words
with open("ml-data-input/most-common-words.txt", "r") as common_words_file:
    common_tokens = [word[:-1] for word in list(common_words_file)]
    common_tokens.append("\n\n")
    common_tokens.append("\n")
    common_tokens.append(" ")

# Add punctuation to the list of commonalities
for char in list(string.punctuation):
    common_tokens.append(char)

print(common_tokens)

['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you', 'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I', 'his', 'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or', 'had', 'by', 'not', 'word', 'but', 'what', 'some', 'we', 'can', 'out', 'other', 'were', 'all', 'there', 'when', 'up', 'use', 'your', 'how', 'said', 'an', 'each', 'she', 'which', 'do', 'their', 'time', 'if', 'will', 'way', 'about', 'many', 'then', 'them', 'write', 'would', 'like', 'so', 'these', 'her', 'long', 'make', 'thing', 'see', 'him', 'two', 'has', 'look', 'more', 'day', 'could', 'go', 'come', 'did', 'number', 'sound', 'no', 'most', 'people', 'my', 'over', 'know', 'water', 'than', 'call', 'first', 'who', 'may', 'down', 'side', 'been', 'now', 'find', 'any', 'new', 'work', 'part', 'take', 'get', 'place', 'made', 'live', 'where', 'after', 'back', 'little', 'only', 'round', 'man', 'year', 'came', 'show', 'every', 'good', 'me', 'give', 'our', 'under', 'name', 'very', 'through', 'just', 'form', 'sentence', 'g

In [6]:
# A function to get rid of the 1000 most common used words
def common_word_drop(token_list, whitespace_list, label_list, rows):
    for word in common_tokens:
        if word in token_list:
            indices = [i for i, x in enumerate(token_list) if x.lower() == word]
            token_list = [token_list[i] for i in range(len(token_list)) if i not in indices]
            whitespace_list = [whitespace_list[i] for i in range(len(whitespace_list)) if i not in indices]
            label_list = [label_list[i] for i in range(len(label_list)) if i not in indices]
            rows = [rows[i] for i in range(len(rows)) if i not in indices]

    return token_list, whitespace_list, label_list, rows

In [7]:
def get_rows(full_tokens):
    rows = []
    row_num = 1
    for token in full_tokens:
        rows.append(row_num)
        if token == "\n\n" or token == "\n":
            row_num = row_num + 1
    return rows

In [8]:
def pii_data_exists(labels):
    # Loop over and see if a PII data is found, if it is, return True, else False.
    for label in labels:
        if label != 'O':
            return True
    return False

In [9]:
def get_closest_label(labels):

    # Initialize the indexer and for all labels, find the indexes that have PII data
    label_indexes = []
    for i in range(len(labels)):
        if labels[i] != 'O':
            label_indexes.append(i)
    
    # If there is no PII data, if only one PII data, else multiple data
    if len(label_indexes) == 0:
        return [-1 for label in labels]
    elif len(label_indexes) == 1:
        label_range = [-1 for label in labels]
        label_range[label_indexes[0]] = 0
        return label_range
    else:
        label_range = [-1 for label in labels]

        # Get distance for first PII
        first_index = label_indexes[0]
        first_pii_distance = label_indexes[1] - label_indexes[0]
        label_range[first_index] = first_pii_distance

        # Get distance for last PII
        last_index = label_indexes[-1]
        last_pii_distance = label_indexes[-1] - label_indexes[-2]
        label_range[last_index] = last_pii_distance
        
        # Loop over the second to second last PII and get the distances.
        for idx in range(1, len(label_indexes)-1):

            # For the middle PII data points. Set the previous and next PII
            current_pii = label_indexes[idx]
            previous_pii = label_indexes[idx-1]
            next_pii = label_indexes[idx+1]

            # distances
            prev_dist = current_pii - previous_pii
            next_dist = next_pii - current_pii

            # Append the shortest distance to the current pii data
            label_range[current_pii] = min(prev_dist, next_dist)
        
        # Return the label range.
        return label_range

In [10]:
def engineer_data_for_model(data):

    # Get the first value from the data
    first_doc = data[0]

    # Get rid of common words
    all_rows = get_rows(first_doc['tokens'])
    tokens, white_spaces, labels, rows = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'], all_rows)
    closest_labels = get_closest_label(labels)

    # Create the initial dataframe from the above data
    first_doc_data = {
        "tokens": tokens,
        "trailing_whitespaces": white_spaces,
        "capitalized first char": [True if label[0].isupper() else False for label in tokens],
        "token length": [len(token) for token in tokens],
        "is_numeric": [True if token.isnumeric() else False for token in tokens],
        "PII label": [pii_number_encoding[label] for label in labels],
        "Row": rows,
        "Closest PII data": closest_labels
    }
    raw_df = pd.DataFrame(first_doc_data)

    # Loop till the end of the data
    for document in data[1: len(data) - 1]:

        # Check to see if there exists PII data
        if not pii_data_exists(document['labels']):
            continue
            
        # Get rid of common words
        all_rows = get_rows(document['tokens'])
        tokens, white_spaces, labels, rows = common_word_drop(document['tokens'], document['trailing_whitespace'], document['labels'], all_rows)
        closest_labels = get_closest_label(labels)

        # Collect the data in the same way
        doc_data = {
            "tokens": tokens,
            "trailing_whitespaces": white_spaces,
            "capitalized first char": [True if label[0].isupper() else False for label in tokens],
            "token length": [len(token) for token in tokens],
            "is_numeric": [True if token.isnumeric() else False for token in tokens],
            "PII label": [pii_number_encoding[label] for label in labels],
            "Row": rows,
            "Closest PII data": closest_labels
        }
        df = pd.DataFrame(doc_data)

        # Concatenate all the data into one single dataframe
        raw_df = pd.concat([raw_df, df], ignore_index=True, sort=False)

    # Return the concatenated dataframe
    return raw_df


In [11]:
# Get the training data and get rid of some unneeded number values
# data = engineer_data_for_model(raw_data[0:6000])
data = engineer_data_for_model(raw_data)
data

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label,Row,Closest PII data
0,Thinking,True,True,8,False,O,1,-1
1,innovation,True,False,10,False,O,1,-1
2,reflexion,False,False,9,False,O,1,-1
3,Avril,True,True,5,False,O,1,-1
4,2021,False,False,4,True,O,1,-1
...,...,...,...,...,...,...,...,...
276038,However,False,True,7,False,O,32,-1
276039,hindrance,True,False,9,False,O,32,-1
276040,stimulate,True,False,9,False,O,32,-1
276041,innovative,False,False,10,False,O,32,-1
