#### Data Preprocessing

In [1]:
# Add all imports related to data engineering
import json
import numpy as np
import pandas as pd
import string

In [2]:
# Import the training json file, then close the file
file = open("train.json")
raw_data = json.load(file)
file.close()

In [3]:
pii_number_encoding = {
    'B-NAME_STUDENT': 0, 
    'I-NAME_STUDENT': 1, 
    'B-URL_PERSONAL': 2, 
    'B-EMAIL': 3, 
    'B-ID_NUM': 4, 
    'I-URL_PERSONAL': 5, 
    'B-USERNAME': 6, 
    'I-PHONE_NUM': 7,  
    'B-STREET_ADDRESS': 8, 
    'I-STREET_ADDRESS': 9,  
    'B-PHONE_NUM': 10, 
    'I-ID_NUM': 11,
    'O': 12
}

In [4]:
# Got the txt file from here: https://gist.github.com/deekayen/4148741

# Common words List to hold common words
with open("most-common-words.txt", "r") as common_words_file:
    common_tokens = [word[:-1] for word in list(common_words_file)]
    common_tokens.append("\n\n")
    common_tokens.append("\n")
    common_tokens.append(" ")

# Add punctuation to the list of commonalities
for char in list(string.punctuation):
    common_tokens.append(char)

print(common_tokens)

['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you', 'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I', 'his', 'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or', 'had', 'by', 'not', 'word', 'but', 'what', 'some', 'we', 'can', 'out', 'other', 'were', 'all', 'there', 'when', 'up', 'use', 'your', 'how', 'said', 'an', 'each', 'she', 'which', 'do', 'their', 'time', 'if', 'will', 'way', 'about', 'many', 'then', 'them', 'write', 'would', 'like', 'so', 'these', 'her', 'long', 'make', 'thing', 'see', 'him', 'two', 'has', 'look', 'more', 'day', 'could', 'go', 'come', 'did', 'number', 'sound', 'no', 'most', 'people', 'my', 'over', 'know', 'water', 'than', 'call', 'first', 'who', 'may', 'down', 'side', 'been', 'now', 'find', 'any', 'new', 'work', 'part', 'take', 'get', 'place', 'made', 'live', 'where', 'after', 'back', 'little', 'only', 'round', 'man', 'year', 'came', 'show', 'every', 'good', 'me', 'give', 'our', 'under', 'name', 'very', 'through', 'just', 'form', 'sentence', 'g

In [5]:
# A function to get rid of the 1000 most common used words
def common_word_drop(token_list, whitespace_list, label_list, rows):
    for word in common_tokens:
        if word in token_list:
            indices = [i for i, x in enumerate(token_list) if x.lower() == word]
            token_list = [token_list[i] for i in range(len(token_list)) if i not in indices]
            whitespace_list = [whitespace_list[i] for i in range(len(whitespace_list)) if i not in indices]
            label_list = [label_list[i] for i in range(len(label_list)) if i not in indices]
            rows = [rows[i] for i in range(len(rows)) if i not in indices]

    return token_list, whitespace_list, label_list, rows

In [6]:
def get_rows(full_tokens):
    rows = []
    row_num = 1
    for token in full_tokens:
        rows.append(row_num)
        if token == "\n\n" or token == "\n":
            row_num = row_num + 1
    return rows

In [7]:
def pii_data_exists(labels):
    # Loop over and see if a PII data is found, if it is, return True, else False.
    for label in labels:
        if label != 'O':
            return True
    return False

In [8]:
def get_closest_label(labels):

    # Initialize the indexer and for all labels, find the indexes that have PII data
    label_indexes = []
    for i in range(len(labels)):
        if labels[i] != 'O':
            label_indexes.append(i)
    
    # If there is no PII data, if only one PII data, else multiple data
    if len(label_indexes) == 0:
        return [-1 for label in labels]
    elif len(label_indexes) == 1:
        label_range = [-1 for label in labels]
        label_range[label_indexes[0]] = 0
        return label_range
    else:
        label_range = [-1 for label in labels]

        # Get distance for first PII
        first_index = label_indexes[0]
        first_pii_distance = label_indexes[1] - label_indexes[0]
        label_range[first_index] = first_pii_distance

        # Get distance for last PII
        last_index = label_indexes[-1]
        last_pii_distance = label_indexes[-1] - label_indexes[-2]
        label_range[last_index] = last_pii_distance
        
        # Loop over the second to second last PII and get the distances.
        for idx in range(1, len(label_indexes)-1):

            # For the middle PII data points. Set the previous and next PII
            current_pii = label_indexes[idx]
            previous_pii = label_indexes[idx-1]
            next_pii = label_indexes[idx+1]

            # distances
            prev_dist = current_pii - previous_pii
            next_dist = next_pii - current_pii

            # Append the shortest distance to the current pii data
            label_range[current_pii] = min(prev_dist, next_dist)
        
        # Return the label range.
        return label_range

In [9]:
def engineer_data_for_model(data):

    # Get the first value from the data
    first_doc = data[0]

    # Get rid of common words
    all_rows = get_rows(first_doc['tokens'])
    tokens, white_spaces, labels, rows = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'], all_rows)
    closest_labels = get_closest_label(labels)

    # Create the initial dataframe from the above data
    first_doc_data = {
        "tokens": tokens,
        "trailing_whitespaces": white_spaces,
        "capitalized first char": [True if label[0].isupper() else False for label in tokens],
        "token length": [len(token) for token in tokens],
        "is_numeric": [True if token.isnumeric() else False for token in tokens],
        "PII label": labels,
        "Row": rows,
        "Closest PII data": closest_labels
    }
    raw_df = pd.DataFrame(first_doc_data)

    # Loop till the end of the data
    for document in data[1: len(data) - 1]:

        # Check to see if there exists PII data
        if not pii_data_exists(document['labels']):
            continue
            
        # Get rid of common words
        all_rows = get_rows(document['tokens'])
        tokens, white_spaces, labels, rows = common_word_drop(document['tokens'], document['trailing_whitespace'], document['labels'], all_rows)
        closest_labels = get_closest_label(labels)

        # Collect the data in the same way
        doc_data = {
            "tokens": tokens,
            "trailing_whitespaces": white_spaces,
            "capitalized first char": [True if label[0].isupper() else False for label in tokens],
            "token length": [len(token) for token in tokens],
            "is_numeric": [True if token.isnumeric() else False for token in tokens],
            "PII label": labels,
            "Row": rows,
            "Closest PII data": closest_labels
        }
        df = pd.DataFrame(doc_data)

        # Concatenate all the data into one single dataframe
        raw_df = pd.concat([raw_df, df], ignore_index=True, sort=False)

    # Return the concatenated dataframe
    return raw_df


In [10]:
# Get the training data and get rid of some unneeded number values
# data = engineer_data_for_model(raw_data[0:100])
data = engineer_data_for_model(raw_data)
data

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label,Row,Closest PII data
0,Thinking,True,True,8,False,O,1,-1
1,innovation,True,False,10,False,O,1,-1
2,reflexion,False,False,9,False,O,1,-1
3,Avril,True,True,5,False,O,1,-1
4,2021,False,False,4,True,O,1,-1
...,...,...,...,...,...,...,...,...
276038,However,False,True,7,False,O,32,-1
276039,hindrance,True,False,9,False,O,32,-1
276040,stimulate,True,False,9,False,O,32,-1
276041,innovative,False,False,10,False,O,32,-1


In [11]:
# from sklearn.feature_extraction.text import HashingVectorizer

# hashing_vectorizer = HashingVectorizer(n_features=8, norm=None, alternate_sign=False)

# encoded_tokens = hashing_vectorizer.transform(data["tokens"]).toarray()

# encoded_tokens

In [12]:
# import hashlib

# hashed_values = [hashlib.sha1(row.tobytes()).hexdigest() for row in encoded_tokens]

# hashed_integers = [int(hash_val, 16) for hash_val in hashed_values]
# hashed_integers[0]

# data["hashed_tokens"] = hashed_integers
# data.astype({'hashed_tokens': 'int64'}).dtypes

# data

In [13]:
from sklearn.preprocessing import LabelEncoder

# Multinomial encoding 
le = LabelEncoder()
data['Y'] = le.fit_transform(data['PII label'])
data

Unnamed: 0,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label,Row,Closest PII data,Y
0,Thinking,True,True,8,False,O,1,-1,12
1,innovation,True,False,10,False,O,1,-1,12
2,reflexion,False,False,9,False,O,1,-1,12
3,Avril,True,True,5,False,O,1,-1,12
4,2021,False,False,4,True,O,1,-1,12
...,...,...,...,...,...,...,...,...,...
276038,However,False,True,7,False,O,32,-1,12
276039,hindrance,True,False,9,False,O,32,-1,12
276040,stimulate,True,False,9,False,O,32,-1,12
276041,innovative,False,False,10,False,O,32,-1,12


In [14]:
# Get the unique values and print them.
unique_list = data["Y"].unique().tolist()
print(unique_list)

# Display the actual values of the encoded labels
inverse_encoded_classes = le.inverse_transform(unique_list)
print("Actual values of encoded labels:")
print(inverse_encoded_classes)

[12, 2, 8, 5, 0, 1, 11, 6, 9, 4, 10, 3, 7]
Actual values of encoded labels:
['O' 'B-NAME_STUDENT' 'I-NAME_STUDENT' 'B-URL_PERSONAL' 'B-EMAIL'
 'B-ID_NUM' 'I-URL_PERSONAL' 'B-USERNAME' 'I-PHONE_NUM' 'B-STREET_ADDRESS'
 'I-STREET_ADDRESS' 'B-PHONE_NUM' 'I-ID_NUM']


#### Test/Train Splitting

In [15]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=100)

In [16]:
# Train data
y_train = train_data["Y"].to_numpy()
X_train = train_data.drop(columns=["tokens", "PII label"])
print(f"Length train_x = {len(X_train)} \n Length y_train = {len(y_train)}")


# Test data
y_test = test_data["Y"].to_numpy()
X_test = test_data.drop(columns=["tokens", "PII label"])
print(f"Length test_x = {len(X_test)} \n Length test_y = {len(y_test)}")

Length train_x = 193230 
 Length y_train = 193230
Length test_x = 82813 
 Length test_y = 82813


In [17]:
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

#### Model Implementation

In [20]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn import metrics

# Create and train the XGBoost Classifier
clf = XGBClassifier(n_estimators=100, random_state=42, enable_categorical=True)
clf.fit(scaled_X_train, y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11], got [ 0  1  2  3  4  5  6  7  8  9 10 12]

In [None]:
# Print the results
print('Training accuracy:', clf.score(scaled_X_train, y_train))
print('Test accuracy:', clf.score(scaled_X_test, y_test))

# Cross-validation
cv_scores = cross_val_score(clf, scaled_X_train, y_train, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
print('Cross-Validation Scores:', cv_scores)
print('Mean KFold Cross-Validation Accuracy:', cv_scores.mean())

# Classification Report
y_predictions = clf.predict(scaled_X_test)
print('\nClassification Report:')
print(classification_report(y_test, y_predictions))

#### Results Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

cnf_matrix = metrics.confusion_matrix(y_test, y_predictions)
cnf_matrix

In [None]:
class_names = [0,1,2,3,4,5,6,7,8,9,10,11]

fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')