In [1]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
df = pd.read_csv('/kaggle/input/hackforimpact2024/final_expanded_dataset.csv')

# Define the categories and their encoding
tags = ['Funding', 'Operations', 'Misc', 'Food', 'Equipment', 'Programming', 'Travel']
map = {tag: i for i, tag in enumerate(tags)}

# Encode the categories with a safety check
def encode_tags(x):
    # Return the encoded value if x exists in encode_dict, otherwise return a placeholder or error code
    return map.get(x, -1)  # -1 or any other value you choose as a placeholder for unknown categories

df['Tags_Encoded'] = df['Tags'].apply(lambda x: encode_tags(x))

df

Unnamed: 0,Memo,Tags,Tags_Encoded
0,TRANSFER TO STATE HIGH HACK CLUB,Funding,0
1,NAME-CHEAP.COM* 8SG11P,Operations,1
2,TRANSFER FROM HACK CLUB HQ,Funding,0
3,NAME-CHEAP.COM,Operations,1
4,HACK CLUB BANK FEE (MISTAKE BY BANK),Misc,2
...,...,...,...
24995,Office furniture Chair,Equipment,4
24996,Laptop purchase - Lenovo ThinkPad,Equipment,4
24997,Tech supplies order,Equipment,4
24998,Tech supplies order,Equipment,4


In [4]:
df['Tags'].value_counts()

Tags
Programming    3820
Equipment      3808
Travel         3776
Operations     3711
Food           3651
Misc           3128
Funding        3106
Name: count, dtype: int64

In [16]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [6]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        memo = str(self.data.Memo[index])
        memo = " ".join(memo.split())
        inputs = self.tokenizer.encode_plus(
            memo,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.Tags_Encoded[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [7]:
train_size = 0.7
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (25000, 3)
TRAIN Dataset: (17500, 3)
TEST Dataset: (7500, 3)


In [8]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [9]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 7)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [10]:
model = DistillBERTClass()
model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [11]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [12]:
def calculate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [13]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%TRAIN_BATCH_SIZE==0:
          loss_step = tr_loss/nb_tr_steps
          accu_step = (n_correct*100)/nb_tr_examples
          print(f"Batch Training Loss: {loss_step}")
          print(f"Batch Training Accuracy: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [14]:
for epoch in range(EPOCHS):
    print(f'EPOCH {epoch}:\n')
    print('-'*128)
    train(epoch)

EPOCH 0:

--------------------------------------------------------------------------------------------------------------------------------




Batch Training Loss: 1.9759502410888672
Batch Training Accuracy: 10.9375
Batch Training Loss: 1.5817774864343497
Batch Training Accuracy: 61.39423076923077
Batch Training Loss: 1.0476430186698602
Batch Training Accuracy: 78.35513565891473
Batch Training Loss: 0.7659604588012003
Batch Training Accuracy: 84.577396373057
Batch Training Loss: 0.6084414187833261
Batch Training Accuracy: 87.77358949416342
The Total Accuracy for Epoch 0: 88.35428571428571
Training Loss Epoch: 0.5779485600952902
Training Accuracy Epoch: 88.35428571428571
EPOCH 1:

--------------------------------------------------------------------------------------------------------------------------------
Batch Training Loss: 0.16220742464065552
Batch Training Accuracy: 96.875
Batch Training Loss: 0.10188724582011884
Batch Training Accuracy: 97.88461538461539
Batch Training Loss: 0.09517648651502854
Batch Training Accuracy: 97.90455426356588
Batch Training Loss: 0.08916577992121173
Batch Training Accuracy: 97.96794041450777


In [17]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss = 0; nb_tr_steps = 0; nb_tr_examples = 0;
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calculate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%VALID_BATCH_SIZE==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Batch Validation Loss: {loss_step}")
                print(f"Batch Validation Accuracy: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [18]:
# Fine-Tuned Model
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Batch Validation Loss: 0.5136920809745789
Batch Validation Accuracy: 50.0
Batch Validation Loss: 0.028118200543703453
Batch Validation Accuracy: 98.48484848484848
Batch Validation Loss: 0.08958293956776078
Batch Validation Accuracy: 96.92307692307692
Batch Validation Loss: 0.07939069077478166
Batch Validation Accuracy: 97.42268041237114
Batch Validation Loss: 0.09454923810603372
Batch Validation Accuracy: 96.89922480620154
Batch Validation Loss: 0.1001064675897707
Batch Validation Accuracy: 96.8944099378882
Batch Validation Loss: 0.10475588827804116
Batch Validation Accuracy: 96.89119170984456
Batch Validation Loss: 0.0948363207715253
Batch Validation Accuracy: 97.11111111111111
Batch Validation Loss: 0.08371389490472522
Batch Validation Accuracy: 97.47081712062257
Batch Validation Loss: 0.085025

In [19]:
# Saving the files for re-use

# output_model_file = '/'
# output_vocab_file = '/'

model_to_save = model
torch.save(model.state_dict(), '/kaggle/working/best_synthetic.pkl')
# tokenizer.save_vocabulary(output_vocab_file)

print('Model saved successfully.')

Model saved successfully.


In [27]:
base_model = DistillBERTClass()
base_model.to(device)
# base_model = torch.nn.DataParallel(model, device_ids = [0,1]).to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [28]:
# Base Model
acc = valid(base_model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Batch Validation Loss: 1.9485548734664917
Batch Validation Accuracy: 50.0
Batch Validation Loss: 1.9528504104325266
Batch Validation Accuracy: 12.121212121212121
Batch Validation Loss: 1.946119477198674
Batch Validation Accuracy: 16.923076923076923
Batch Validation Loss: 1.944229782242136
Batch Validation Accuracy: 15.97938144329897
Batch Validation Loss: 1.9435420378234036
Batch Validation Accuracy: 15.891472868217054
Batch Validation Loss: 1.9437079822054562
Batch Validation Accuracy: 17.080745341614907
Batch Validation Loss: 1.9465815551540395
Batch Validation Accuracy: 16.83937823834197
Batch Validation Loss: 1.9465919219122993
Batch Validation Accuracy: 16.444444444444443
Batch Validation Loss: 1.9492624274487624
Batch Validation Accuracy: 15.56420233463035
Batch Validation Loss: 1.947484680113083
Batch Validation Accuracy: 15.22491349480969
Batch Validation Loss: 1.9466168795047891
Batch Validation Accuracy: 15.88785046728972
Batch Validation Loss: 1.9473867004383725
Batch Valida