# Baseline Logistic Regression Model

**Gathering Data**

In [1]:
# Add all imports related to data engineering
import json
import numpy as np
import pandas as pd
import string

In [2]:
# Import the training json file, then close the file
file = open("raw_data/train.json")
raw_data = json.load(file)
file.close()

In [3]:
pii_number_encoding = {
    'B-EMAIL': 0, 
    'B-ID_NUM': 1, 
    'B-NAME_STUDENT': 2, 
    'B-PHONE_NUM': 3, 
    'B-STREET_ADDRESS': 4, 
    'B-URL_PERSONAL': 5, 
    'B-USERNAME': 6, 
    'I-NAME_STUDENT': 7,  
    'I-PHONE_NUM': 8, 
    'I-STREET_ADDRESS': 9,  
    'I-URL_PERSONAL': 10, 
    'O': 11
}

In [4]:
# Got the txt file from here: https://gist.github.com/deekayen/4148741

# Common words List to hold common words
with open("ml-data-input/most-common-words.txt", "r") as common_words_file:
    common_tokens = [word[:-1] for word in list(common_words_file)]
    common_tokens.append("\n\n")
    common_tokens.append("\n")
    common_tokens.append(" ")

# Add punctuation to the list of commonalities
for char in list(string.punctuation):
    common_tokens.append(char)

print(common_tokens)

['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it', 'you', 'that', 'he', 'was', 'for', 'on', 'are', 'with', 'as', 'I', 'his', 'they', 'be', 'at', 'one', 'have', 'this', 'from', 'or', 'had', 'by', 'not', 'word', 'but', 'what', 'some', 'we', 'can', 'out', 'other', 'were', 'all', 'there', 'when', 'up', 'use', 'your', 'how', 'said', 'an', 'each', 'she', 'which', 'do', 'their', 'time', 'if', 'will', 'way', 'about', 'many', 'then', 'them', 'write', 'would', 'like', 'so', 'these', 'her', 'long', 'make', 'thing', 'see', 'him', 'two', 'has', 'look', 'more', 'day', 'could', 'go', 'come', 'did', 'number', 'sound', 'no', 'most', 'people', 'my', 'over', 'know', 'water', 'than', 'call', 'first', 'who', 'may', 'down', 'side', 'been', 'now', 'find', 'any', 'new', 'work', 'part', 'take', 'get', 'place', 'made', 'live', 'where', 'after', 'back', 'little', 'only', 'round', 'man', 'year', 'came', 'show', 'every', 'good', 'me', 'give', 'our', 'under', 'name', 'very', 'through', 'just', 'form', 'sentence', 'g

In [5]:
# A function to get rid of the 1000 most common used words
def common_word_drop(token_list, whitespace_list, label_list):
    for word in common_tokens:
        if word in token_list:
            indices = [i for i, x in enumerate(token_list) if x.lower() == word]
            token_list = [token_list[i] for i in range(len(token_list)) if i not in indices]
            whitespace_list = [whitespace_list[i] for i in range(len(whitespace_list)) if i not in indices]
            label_list = [label_list[i] for i in range(len(label_list)) if i not in indices]

    return token_list, whitespace_list, label_list

In [6]:
def engineer_data_for_model(data):

    # Get the first value from the data
    first_doc = data[0]

    # Get rid of common words
    # tokens, white_spaces, labels = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'])
    tokens, white_spaces, labels = common_word_drop(first_doc['tokens'], first_doc['trailing_whitespace'], first_doc['labels'])

    # Create the initial dataframe from the above data
    first_doc_data = {
        "tokens": tokens,
        "trailing_whitespaces": white_spaces,
        "capitalized first char": [True if label[0].isupper() else False for label in tokens],
        "token length": [len(token) for token in tokens],
        "is_numeric": [True if token.isnumeric() else False for token in tokens],
        "PII label": [pii_number_encoding[label] for label in labels]
    }
    raw_df = pd.DataFrame(first_doc_data)

    # Loop till the end of the data
    for document in data[1: len(data) - 1]:

        # Get rid of common words
        # tokens, white_spaces, labels = common_word_drop(document['tokens'], document['trailing_whitespace'], document['labels'])
        tokens, white_spaces, labels = common_word_drop(document['tokens'], document['trailing_whitespace'], document['labels'])

        # Collect the data in the same way
        doc_data = {
            "tokens": tokens,
            "trailing_whitespaces": white_spaces,
            "capitalized first char": [True if label[0].isupper() else False for label in tokens],
            "token length": [len(token) for token in tokens],
            "is_numeric": [True if token.isnumeric() else False for token in tokens],
            "PII label": [pii_number_encoding[label] for label in labels]
        }
        df = pd.DataFrame(doc_data)

        # Concatenate all the data into one single dataframe
        raw_df = pd.concat([raw_df, df], ignore_index=True, sort=False)

    # Return the concatenated dataframe
    return raw_df


In [7]:
# Get the training data and get rid of some unneeded number values
data = engineer_data_for_model(raw_data[0:20]).reset_index()

# Show the data
data

Unnamed: 0,index,tokens,trailing_whitespaces,capitalized first char,token length,is_numeric,PII label
0,0,Thinking,True,True,8,False,11
1,1,innovation,True,False,10,False,11
2,2,reflexion,False,False,9,False,11
3,3,Avril,True,True,5,False,11
4,4,2021,False,False,4,True,11
...,...,...,...,...,...,...,...
28295,28295,reality,False,False,7,False,11
28296,28296,because,True,False,7,False,11
28297,28297,already,True,False,7,False,11
28298,28298,creating,True,False,8,False,11


In [10]:
# from sklearn.feature_extraction.text import HashingVectorizer
import pickle
from gensim.models import KeyedVectors

model_path = "GoogleNews-vectors-negative300.bin.gz"
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')

def get_token_vector(token):
  try:
    return word2vec_model[token]
  except KeyError:
    return np.zeros(word2vec_model.vector_size)


In [11]:
vectorized_data = pd.DataFrame({
  'vectors': data['tokens'].apply(lambda tokens: [get_token_vector(token) for token in tokens])
})

vectorized_data

Unnamed: 0,vectors
0,"[[-0.2421875, 0.14550781, 0.026855469, 0.00759..."
1,"[[-0.22558594, -0.01953125, 0.09082031, 0.2373..."
2,"[[-0.30078125, 0.18945312, -0.03491211, 0.125,..."
3,"[[-0.10595703, 0.21386719, 0.118652344, -0.031..."
4,"[[0.106933594, -0.10546875, 0.053222656, 0.069..."
...,...
28295,"[[-0.30078125, 0.18945312, -0.03491211, 0.125,..."
28296,"[[-0.203125, 0.053222656, 0.109375, 0.21777344..."
28297,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
28298,"[[-0.20800781, 0.034179688, 0.025756836, 0.179..."


In [12]:
def flatten_matrix(matrix):
    return np.ravel(matrix)

flattened_data = vectorized_data["vectors"].apply(flatten_matrix)

flattened_data

0        [-0.2421875, 0.14550781, 0.026855469, 0.007598...
1        [-0.2255859375, -0.01953125, 0.0908203125, 0.2...
2        [-0.30078125, 0.18945312, -0.03491211, 0.125, ...
3        [-0.10595703, 0.21386719, 0.118652344, -0.0314...
4        [0.106933594, -0.10546875, 0.053222656, 0.0698...
                               ...                        
28295    [-0.30078125, 0.189453125, -0.034912109375, 0....
28296    [-0.203125, 0.05322265625, 0.109375, 0.2177734...
28297    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
28298    [-0.2080078125, 0.0341796875, 0.0257568359375,...
28299    [-0.2255859375, -0.01953125, 0.0908203125, 0.2...
Name: vectors, Length: 28300, dtype: object

In [None]:
from itertools import zip_longest

max_len = max(flattened_data.apply(len))

padded_sequences = flattened_data.apply(lambda vectors: list(zip_longest(vectors, fillvalue=np.zeros_like(vectors[0]))))

vectorized_data_padded = pd.DataFrame(padded_sequences.tolist(), columns=[f'feature_{i}' for i in range(max_len)])

print(vectorized_data_padded)

In [None]:
# Concatenate the flattened features with the labels
final_data = pd.concat([flattened_data, data['PII label']], axis=1)

In [None]:
#import hashlib

#hashed_values = [hashlib.sha256(row.tobytes()).hexdigest() for row in encoded_tokens]

#hashed_integers = [int(hash_val, 16) for hash_val in hashed_values]

# data["hashed_tokens"] = hashed_integers

# data

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

**One-hot Encoding**

In [None]:
# Train data
y_train = train_data["PII label"].to_numpy()
x_train = train_data.drop(columns=["tokens", "index", "PII label"])


# Test data
y_test = test_data["PII label"].to_numpy()
x_test = test_data.drop(columns=["tokens", "index", "PII label"])

In [None]:
print(f"Length train_x = {len(x_train)} \n Length train_y = {len(y_train)}")

In [None]:
print(f"Length test_x = {len(x_test)} \n Length test_y = {len(y_test)}")

In [None]:
x_train

In [None]:
x_test

**Model**

In [None]:
# import needed classes
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# instantiate the model (using the default parameters)
logreg = LogisticRegression(multi_class='multinomial', solver='sag', class_weight='balanced')

# fit the model with data
logreg.fit(x_train, y_train)

y_pred = logreg.predict(x_test)

In [None]:

cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
class_names = [0,1,2,3,4,5,6,7,8,9,10,11]

fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

# Text(0.5,257.44,'Predicted label')

In [None]:
{
    'B-EMAIL': 0, 
    'B-ID_NUM': 1, 
    'B-NAME_STUDENT': 2, 
    'B-PHONE_NUM': 3, 
    'B-STREET_ADDRESS': 4, 
    'B-URL_PERSONAL': 5, 
    'B-USERNAME': 6, 
    'I-NAME_STUDENT': 7,  
    'I-PHONE_NUM': 8, 
    'I-STREET_ADDRESS': 9,  
    'I-URL_PERSONAL': 10, 
    'O': 11
}

In [None]:
target_names = list(pii_number_encoding.keys())

print(classification_report(y_test, y_pred))