In [1]:
import json
import warnings

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Importing cleaned data

In [2]:
# Load the cleaned LATAM data (path is relative to the notebook's working
# directory) and preview the first two rows.
cleaned_csv_path = 'data/LATAM-Data-cleaned.csv'
dataset = pd.read_csv(cleaned_csv_path)
dataset.head(2)

Unnamed: 0,Normalized Supplier Name,Country Name,Strategic Region,Requestor Name,Preparer Name,Level 1,Level 2,Level 3,Business Unit,Legal Entity,Cost Center (Base Level),Cost Center (Level 4),Cost Center (Level 5),GL Desc (Level 4),Invoice Source,GL Description,Project
0,20 TABELIAO DE NOTAS DA CAPITAL,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,Uncategorized,Uncategorized,Facilities,Facebook Servicos Online Do Brasil Ltda,Non-Bay Area Facilities & Operations Group,FCS G&A,General & Administrative,Postage and courier,LETTERBOX,Postage and courier,31505 - Sao Paulo Birmann 32
1,20 TABELIAO DE NOTAS DA CAPITAL,Brazil,LATAM,Daniela Fechio,Cindy Eurie,Uncategorized,Uncategorized,Uncategorized,Facilities,Facebook Servicos Online Do Brasil Ltda,Non-Bay Area Facilities & Operations Group,FCS G&A,General & Administrative,Postage and courier,LETTERBOX,Postage and courier,31505 - Sao Paulo Birmann 32


# 1. Creating the processed data for each level

The `Level 1` target has 11 distinct categories, `Level 2` has 56, and `Level 3` has 158. Granularity increases from one level to the next, which lets us use the labels of the coarser levels as input features when predicting a finer level. Note that at prediction time those coarser labels must either be known in advance or come from the preceding level's model.

In [3]:
# Report how many distinct values each of the three target columns holds.
for level_col in ("Level 1", "Level 2", "Level 3"):
    n_distinct = len(dataset[level_col].unique())
    print(f"{level_col}: {n_distinct} different levels")

Level 1: 11 different levels
Level 2: 56 different levels
Level 3: 158 different levels


## 1.1. First level

In [4]:
# Inputs for the Level 1 model: every column except the three target levels,
# so none of the targets is part of the features.
X_lvl_1 = dataset.drop(columns=["Level 1", "Level 2", "Level 3"])
# Target for the Level 1 model.
y_lvl_1 = dataset["Level 1"]

(X_lvl_1.shape, y_lvl_1.shape)

((11265, 14), (11265,))

In [5]:
# Turn each row into a single string following the pattern
# "COLUMN_NAME, COLUMN_VALUE, COLUMN_NAME, COLUMN_VALUE, ...".
#
# X_lvl_1 changes from a DataFrame to a Series of strings in this cell. The text
# is therefore built straight from `dataset` (dropping the three target columns)
# rather than by reassigning X_lvl_1 from itself, so the cell yields the same
# result when it is re-run instead of failing on the already-converted Series.
X_lvl_1 = (
    dataset
    .drop(columns=["Level 1", "Level 2", "Level 3"])
    .apply(lambda row: ", ".join(f"{col}, {row[col]}" for col in row.index), axis=1)
)
X_lvl_1[0], X_lvl_1.shape

('Normalized Supplier Name, 20 TABELIAO DE NOTAS DA CAPITAL, Country Name, Brazil, Strategic Region, LATAM, Requestor Name, Daniela Fechio, Preparer Name, Cindy Eurie, Business Unit, Facilities, Legal Entity, Facebook Servicos Online Do Brasil Ltda, Cost Center (Base Level), Non-Bay Area Facilities & Operations Group, Cost Center (Level 4), FCS G&A, Cost Center (Level 5), General & Administrative, GL Desc (Level 4), Postage and courier, Invoice Source, LETTERBOX, GL Description, Postage and courier, Project, 31505 - Sao Paulo Birmann 32',
 (11265,))

In [6]:
# Encode the Level 1 target as integers: convert the column to a pandas
# categorical and take the integer code of each row's category.
# NOTE(review): pandas encodes missing values as -1 -- confirm the cleaned data
# has no missing Level 1 entries.
lvl_1_categorical = y_lvl_1.astype("category")
y_lvl_1_labels = lvl_1_categorical.cat.codes
y_lvl_1_labels.shape

(11265,)

In [7]:
# Assemble the two-column table for Level 1: the serialized row text and the
# integer-encoded target.
lvl_1_columns = {"text": X_lvl_1, "label": y_lvl_1_labels}
lvl_1_df = pd.DataFrame(lvl_1_columns)
lvl_1_df.head(2)

Unnamed: 0,text,label
0,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",10
1,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",10


In [8]:
# Persist the Level 1 text/label table as CSV; the row index is not written.
lvl_1_df.to_csv(path_or_buf="data/lvl_1_df.csv", index=False)

In [9]:
# Build the lookup from integer code to Level 1 category name. Enumerating the
# categories gives each name its position, which is the code `.cat.codes`
# assigns to it.
lvl_1_categories = y_lvl_1.astype('category').cat.categories
lvl_1_labels = dict(enumerate(lvl_1_categories))

# Save the lookup as JSON (the `json` module is imported in the notebook's
# import cell). JSON object keys are always strings, so the integer codes are
# written as "0", "1", ... and need int() when the file is loaded back.
with open('data/lvl_1_labels.json', 'w') as fp:
    json.dump(lvl_1_labels, fp)

## 1.2. Second level

In [10]:
# Inputs for the Level 2 model: everything except the Level 2 and Level 3
# targets. "Level 1" is kept, so the coarser label acts as a feature.
X_lvl_2 = dataset.drop(columns=["Level 2", "Level 3"])
# Target for the Level 2 model.
y_lvl_2 = dataset["Level 2"]

(X_lvl_2.shape, y_lvl_2.shape)

((11265, 15), (11265,))

In [11]:
# Turn each row into a single string following the pattern
# "COLUMN_NAME, COLUMN_VALUE, COLUMN_NAME, COLUMN_VALUE, ...".
#
# X_lvl_2 changes from a DataFrame to a Series of strings in this cell. The text
# is therefore built straight from `dataset` (dropping the Level 2 and Level 3
# targets) rather than by reassigning X_lvl_2 from itself, so the cell yields
# the same result when it is re-run instead of failing on the already-converted
# Series.
X_lvl_2 = (
    dataset
    .drop(columns=["Level 2", "Level 3"])
    .apply(lambda row: ", ".join(f"{col}, {row[col]}" for col in row.index), axis=1)
)
X_lvl_2[0], X_lvl_2.shape

('Normalized Supplier Name, 20 TABELIAO DE NOTAS DA CAPITAL, Country Name, Brazil, Strategic Region, LATAM, Requestor Name, Daniela Fechio, Preparer Name, Cindy Eurie, Level 1, Uncategorized, Business Unit, Facilities, Legal Entity, Facebook Servicos Online Do Brasil Ltda, Cost Center (Base Level), Non-Bay Area Facilities & Operations Group, Cost Center (Level 4), FCS G&A, Cost Center (Level 5), General & Administrative, GL Desc (Level 4), Postage and courier, Invoice Source, LETTERBOX, GL Description, Postage and courier, Project, 31505 - Sao Paulo Birmann 32',
 (11265,))

In [12]:
# Encode the Level 2 target as integers: convert the column to a pandas
# categorical and take the integer code of each row's category.
# NOTE(review): pandas encodes missing values as -1 -- confirm the cleaned data
# has no missing Level 2 entries.
lvl_2_categorical = y_lvl_2.astype("category")
y_lvl_2_labels = lvl_2_categorical.cat.codes
y_lvl_2_labels.shape

(11265,)

In [13]:
# Assemble the two-column table for Level 2: the serialized row text and the
# integer-encoded target.
lvl_2_columns = {"text": X_lvl_2, "label": y_lvl_2_labels}
lvl_2_df = pd.DataFrame(lvl_2_columns)
lvl_2_df.head(2)

Unnamed: 0,text,label
0,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",54
1,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",54


In [14]:
# Persist the Level 2 text/label table as CSV; the row index is not written.
lvl_2_df.to_csv(path_or_buf="data/lvl_2_df.csv", index=False)

In [15]:
# Build the lookup from integer code to Level 2 category name. Enumerating the
# categories gives each name its position, which is the code `.cat.codes`
# assigns to it.
lvl_2_categories = y_lvl_2.astype('category').cat.categories
lvl_2_labels = dict(enumerate(lvl_2_categories))

# Save the lookup as JSON (the `json` module is imported in the notebook's
# import cell). JSON object keys are always strings, so the integer codes are
# written as "0", "1", ... and need int() when the file is loaded back.
with open('data/lvl_2_labels.json', 'w') as fp:
    json.dump(lvl_2_labels, fp)

## 1.3. Third level

In [16]:
# Inputs for the Level 3 model: everything except the Level 3 target.
# "Level 1" and "Level 2" are kept, so both coarser labels act as features.
X_lvl_3 = dataset.drop(columns=["Level 3"])
# Target for the Level 3 model.
y_lvl_3 = dataset["Level 3"]

(X_lvl_3.shape, y_lvl_3.shape)

((11265, 16), (11265,))

In [17]:
# Turn each row into a single string following the pattern
# "COLUMN_NAME, COLUMN_VALUE, COLUMN_NAME, COLUMN_VALUE, ...".
#
# X_lvl_3 changes from a DataFrame to a Series of strings in this cell. The text
# is therefore built straight from `dataset` (dropping the Level 3 target)
# rather than by reassigning X_lvl_3 from itself, so the cell yields the same
# result when it is re-run instead of failing on the already-converted Series.
X_lvl_3 = (
    dataset
    .drop(columns=["Level 3"])
    .apply(lambda row: ", ".join(f"{col}, {row[col]}" for col in row.index), axis=1)
)
X_lvl_3[0], X_lvl_3.shape

('Normalized Supplier Name, 20 TABELIAO DE NOTAS DA CAPITAL, Country Name, Brazil, Strategic Region, LATAM, Requestor Name, Daniela Fechio, Preparer Name, Cindy Eurie, Level 1, Uncategorized, Level 2, Uncategorized, Business Unit, Facilities, Legal Entity, Facebook Servicos Online Do Brasil Ltda, Cost Center (Base Level), Non-Bay Area Facilities & Operations Group, Cost Center (Level 4), FCS G&A, Cost Center (Level 5), General & Administrative, GL Desc (Level 4), Postage and courier, Invoice Source, LETTERBOX, GL Description, Postage and courier, Project, 31505 - Sao Paulo Birmann 32',
 (11265,))

In [18]:
# Encode the Level 3 target as integers: convert the column to a pandas
# categorical and take the integer code of each row's category.
# NOTE(review): pandas encodes missing values as -1 -- confirm the cleaned data
# has no missing Level 3 entries.
lvl_3_categorical = y_lvl_3.astype("category")
y_lvl_3_labels = lvl_3_categorical.cat.codes
y_lvl_3_labels.shape

(11265,)

In [19]:
# Assemble the two-column table for Level 3: the serialized row text and the
# integer-encoded target.
lvl_3_columns = {"text": X_lvl_3, "label": y_lvl_3_labels}
lvl_3_df = pd.DataFrame(lvl_3_columns)
lvl_3_df.head(2)

Unnamed: 0,text,label
0,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",154
1,"Normalized Supplier Name, 20 TABELIAO DE NOTAS...",154


In [20]:
# Persist the Level 3 text/label table as CSV; the row index is not written.
lvl_3_df.to_csv(path_or_buf="data/lvl_3_df.csv", index=False)

In [21]:
# Build the lookup from integer code to Level 3 category name. Enumerating the
# categories gives each name its position, which is the code `.cat.codes`
# assigns to it.
lvl_3_categories = y_lvl_3.astype('category').cat.categories
lvl_3_labels = dict(enumerate(lvl_3_categories))

# Save the lookup as JSON (the `json` module is imported in the notebook's
# import cell). JSON object keys are always strings, so the integer codes are
# written as "0", "1", ... and need int() when the file is loaded back.
with open('data/lvl_3_labels.json', 'w') as fp:
    json.dump(lvl_3_labels, fp)