In [1]:
import pandas as pd
import torch
from torch import cuda
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
df = pd.read_csv('final-1.csv')
df.rename(columns={'Tags': 'category'}, inplace=True)
categories = ['Funding', 'Operations', 'Misc', 'Food', 'Equipment', 'Programming', 'Travel']
encode_dict = {category: i for i, category in enumerate(categories)}
def encode_cat(x):
    return encode_dict.get(x, -1)
df['ENCODED_CAT'] = df['category'].apply(lambda x: encode_cat(x))
print(df.head())

                                   Memo    category  ENCODED_CAT
0      TRANSFER TO STATE HIGH HACK CLUB     Funding            0
1                NAME-CHEAP.COM* 8SG11P  Operations            1
2            TRANSFER FROM HACK CLUB HQ     Funding            0
3                        NAME-CHEAP.COM  Operations            1
4  HACK CLUB BANK FEE (MISTAKE BY BANK)        Misc            2


In [3]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [4]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title = str(self.data.Memo[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODED_CAT[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [5]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (1627, 3)
TRAIN Dataset: (1302, 3)
TEST Dataset: (325, 3)


In [6]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 7)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [7]:
model = DistillBERTClass()
model.to(device)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
def calculate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%TRAIN_BATCH_SIZE==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [9]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 1.951589584350586
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 1.9311297178268432
Training Accuracy per 5000 steps: 0.0
Training Loss per 5000 steps: 1.9212447934680514
Training Accuracy per 5000 steps: 8.333333333333334
Training Loss per 5000 steps: 1.914559034200815
Training Accuracy per 5000 steps: 11.538461538461538
Training Loss per 5000 steps: 1.9137854716357063
Training Accuracy per 5000 steps: 17.647058823529413
Training Loss per 5000 steps: 1.9165904692241125
Training Accuracy per 5000 steps: 17.857142857142858
Training Loss per 5000 steps: 1.9043079423904419
Training Accuracy per 5000 steps: 23.0
Training Loss per 5000 steps: 1.891768952895855
Training Accuracy per 5000 steps: 27.586206896551722
Training Loss per 5000 steps: 1.8884988047859885
Training Accuracy per 5000 steps: 27.272727272727273
Training Loss per 5000 steps: 1.8620516061782837
Training Accuracy per 5000 steps: 31.756756756756758
Training Loss per 5000 steps

In [10]:
def valid(model, testing_loader, tokenizer, categories):
    model.eval()
    n_correct = 0
    total_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            total_loss += loss.item()
            big_val, big_idx = torch.max(outputs, dim=1)

            n_correct += (big_idx == targets).sum().item()
            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            decoded_inputs = tokenizer.decode(ids[0], skip_special_tokens=True)
            predicted_class = categories[big_idx[0].item()]
            true_class = categories[targets[0].item()]

            print(f"\nInput Text: {decoded_inputs}")
            print(f"Predicted Class: {predicted_class}, True Class: {true_class}")

    avg_loss = total_loss / nb_tr_steps
    accuracy = (n_correct * 100) / nb_tr_examples
    print(f"\nValidation Loss: {avg_loss}")
    print(f"Validation Accuracy: {accuracy}%")

    return accuracy

In [11]:
categories = ['Funding', 'Operations', 'Misc', 'Food', 'Equipment', 'Programming', 'Travel']
acc = valid(model, testing_loader, tokenizer, categories)
print("Accuracy on test data = %0.2f%%" % acc)


Input Text: AMAZON. COM * 7O9S89HT3 AMZN
Predicted Class: Misc, True Class: Misc

Input Text: Care Package
Predicted Class: Misc, True Class: Misc

Input Text: FISCAL SPONSORSHIP
Predicted Class: Funding, True Class: Funding

Input Text: Hackathon grant from Hack Club
Predicted Class: Funding, True Class: Funding

Input Text: TRANSFER FROM ACCOUNT TO CARD BALANCE
Predicted Class: Funding, True Class: Funding

Input Text: Porkbun : venturedglobal. org
Predicted Class: Operations, True Class: Operations

Input Text: Reimbursement : Caleb's Late Night Feast
Predicted Class: Funding, True Class: Funding

Input Text: Porkbun : nabadminton. org
Predicted Class: Operations, True Class: Operations

Input Text: INVOICE TO HUDSON RIVER TRADING LLC
Predicted Class: Funding, True Class: Funding

Input Text: Disbursement to zero out SoM Sticker Shipments
Predicted Class: Misc, True Class: Misc

Input Text: STAPLES 0200
Predicted Class: Equipment, True Class: Misc

Input Text: Porkbun : hackforhome

In [12]:
# Saving the files for re-use

# output_model_file = '/'
# output_vocab_file = '/'
model_to_save = model
torch.save(model.state_dict(), '/content/best.pkl')
# tokenizer.save_vocabulary(output_vocab_file)

print('Model saved successfully.')

Model saved successfully.
