In [1]:
# Add all imports related to data engineering
import json
import numpy as np
import pandas as pd
import string

In [2]:
# Import the training json file, then close the file
file = open("raw_data/train.json")
raw_data = json.load(file)
file.close()

In [3]:
print(raw_data[0].keys())
print(raw_data[0]["tokens"])

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie', 'Sylla', '\n\n', 'Challenge', '&', 'selection', '\n\n', 'The', 'tool', 'I', 'use', 'to', 'help', 'all', 'stakeholders', 'finding', 'their', 'way', 'through', 'the', 'complexity', 'of', 'a', 'project', 'is', 'the', ' ', 'mind', 'map', '.', '\n\n', 'What', 'exactly', 'is', 'a', 'mind', 'map', '?', 'According', 'to', 'the', 'definition', 'of', 'Buzan', 'T.', 'and', 'Buzan', 'B.', '(', '1999', ',', 'Dessine', '-', 'moi', ' ', "l'intelligence", '.', 'Paris', ':', 'Les', 'Éditions', "d'Organisation", '.', ')', ',', 'the', 'mind', 'map', '(', 'or', 'heuristic', 'diagram', ')', 'is', 'a', 'graphic', ' ', 'representation', 'technique', 'that', 'follows', 'the', 'natural', 'functioning', 'of', 'the', 'mind', 'and', 'allows', 'the', 'brain', "'s", ' ', 'potential', 'to', 'be', 'released', '.', 'Cf', 'Annex1', '\n\n', 'This'

In [4]:
pii_number_encoding = {
    'B-EMAIL': 0, 
    'B-ID_NUM': 1, 
    'B-NAME_STUDENT': 2, 
    'B-PHONE_NUM': 3, 
    'B-STREET_ADDRESS': 4, 
    'B-URL_PERSONAL': 5, 
    'B-USERNAME': 6, 
    'I-NAME_STUDENT': 7,  
    'I-PHONE_NUM': 8, 
    'I-STREET_ADDRESS': 9,  
    'I-URL_PERSONAL': 10, 
    'O': 11
}

In [5]:
# Got the txt file from here: https://gist.github.com/deekayen/4148741

# Common words List to hold common words
with open("ml-data-input/most-common-words.txt", "r") as common_words_file:
    common_tokens = [word[:-1] for word in list(common_words_file)]
    common_tokens.append("\n\n")
    common_tokens.append("\n")
    common_tokens.append(" ")

# Add punctuation to the list of commonalities
for char in list(string.punctuation):
    common_tokens.append(char)

print(common_tokens)

['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you', 'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I', 'his', 'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or', 'had', 'by', 'not', 'word', 'but', 'what', 'some', 'we', 'can', 'out', 'other', 'were', 'all', 'there', 'when', 'up', 'use', 'your', 'how', 'said', 'an', 'each', 'she', 'which', 'do', 'their', 'time', 'if', 'will', 'way', 'about', 'many', 'then', 'them', 'write', 'would', 'like', 'so', 'these', 'her', 'long', 'make', 'thing', 'see', 'him', 'two', 'has', 'look', 'more', 'day', 'could', 'go', 'come', 'did', 'number', 'sound', 'no', 'most', 'people', 'my', 'over', 'know', 'water', 'than', 'call', 'first', 'who', 'may', 'down', 'side', 'been', 'now', 'find', 'any', 'new', 'work', 'part', 'take', 'get', 'place', 'made', 'live', 'where', 'after', 'back', 'little', 'only', 'round', 'man', 'year', 'came', 'show', 'every', 'good', 'me', 'give', 'our', 'under', 'name', 'very', 'through', 'just', 'form', 'sentence', 'g

In [6]:
# A function to get rid of the 1000 most common used words
def common_word_drop(token_list, whitespace_list, label_list, rows):
    for word in common_tokens:
        if word in token_list:
            indices = [i for i, x in enumerate(token_list) if x.lower() == word]
            token_list = [token_list[i] for i in range(len(token_list)) if i not in indices]
            whitespace_list = [whitespace_list[i] for i in range(len(whitespace_list)) if i not in indices]
            label_list = [label_list[i] for i in range(len(label_list)) if i not in indices]
            rows = [rows[i] for i in range(len(rows)) if i not in indices]

    return token_list, whitespace_list, label_list, rows

In [7]:
def get_rows(full_tokens):
    rows = []
    row_num = 1
    for token in full_tokens:
        rows.append(row_num)
        if token == "\n\n" or token == "\n":
            row_num = row_num + 1
    return rows

In [8]:
def engineer_data_for_model(data):

    # Get the first value from the data
    first_doc = data[0]

    # Get rid of common words
    all_rows = get_rows(first_doc['tokens'])
    tokens, white_spaces, labels, rows = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'], all_rows)

    # Create the initial dataframe from the above data
    first_doc_data = {
        "tokens": tokens,
        "trailing_whitespaces": white_spaces,
        "capitalized first char": [True if label[0].isupper() else False for label in tokens],
        "token length": [len(token) for token in tokens],
        "is_numeric": [True if token.isnumeric() else False for token in tokens],
        "PII label": [pii_number_encoding[label] for label in labels],
        "Row": rows
    }
    raw_df = pd.DataFrame(first_doc_data)

    # Loop till the end of the data
    for document in data[1: len(data) - 1]:

        # Get rid of common words
        all_rows = get_rows(document['tokens'])
        tokens, white_spaces, labels, rows = common_word_drop(document['tokens'], document['trailing_whitespace'], document['labels'], all_rows)

        # Collect the data in the same way
        doc_data = {
            "tokens": tokens,
            "trailing_whitespaces": white_spaces,
            "capitalized first char": [True if label[0].isupper() else False for label in tokens],
            "token length": [len(token) for token in tokens],
            "is_numeric": [True if token.isnumeric() else False for token in tokens],
            "PII label": [pii_number_encoding[label] for label in labels],
            "Row": rows
        }
        df = pd.DataFrame(doc_data)

        # Concatenate all the data into one single dataframe
        raw_df = pd.concat([raw_df, df], ignore_index=True, sort=False)

    # Return the concatenated dataframe
    return raw_df


In [9]:
# Get the training data and get rid of some unneeded number values
data = engineer_data_for_model(raw_data[0:100])

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label,Row
0,Thinking,True,True,8,False,11,1
1,innovation,True,False,10,False,11,1
2,reflexion,False,False,9,False,11,1
3,Avril,True,True,5,False,11,1
4,2021,False,False,4,True,11,1
...,...,...,...,...,...,...,...
28295,reality,False,False,7,False,11,70
28296,because,True,False,7,False,11,70
28297,already,True,False,7,False,11,71
28298,creating,True,False,8,False,11,71


In [10]:
# Show the data
data.head(20)

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label,Row
0,Thinking,True,True,8,False,11,1
1,innovation,True,False,10,False,11,1
2,reflexion,False,False,9,False,11,1
3,Avril,True,True,5,False,11,1
4,2021,False,False,4,True,11,1
5,Nathalie,True,True,8,False,2,1
6,Sylla,False,True,5,False,7,1
7,Challenge,True,True,9,False,11,2
8,selection,False,False,9,False,11,2
9,I,True,True,1,False,11,3
