### Read the data

In [1]:
import pandas as pd
from pprint import pprint
from datasets import Dataset
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, DistilBertForSequenceClassification, AdamW
)
from tqdm import tqdm_notebook

In [2]:
df = pd.read_csv("data/training_data.csv")
# test_df = pd.read_csv("data/testing_data.csv")

In [3]:
df.sample()

Unnamed: 0,id,premise,hypothesis,label
3540,45858034b1,It doesn't seem expensive--they use it in Bang...,That's one of the most expensive options.,2


In [4]:
X_train, X_test, y_train, y_test = train_test_split(df[["premise","hypothesis"]], df["label"], test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [5]:
print(f"len of train, val and test are {len(X_train), len(X_val), len(X_test)}")

len of train, val and test are (4396, 1100, 1374)


In [6]:
X_train.sample()

Unnamed: 0,premise,hypothesis
1440,GAO's Web site (www.gao.gov) contains abstract...,The GAO has received many complaints due to la...


In [7]:
y_train.sample()

5301    2
Name: label, dtype: int64

___

### Tokenizer and model init

In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

___

### Checking

In [10]:
example = tokenizer.encode("this is it","for now atleast")
print(f"after encoding: {example}")
example = tokenizer.decode(example)
print(f"after decoding: {example}")

after encoding: [101, 2023, 2003, 2009, 102, 2005, 2085, 2012, 19738, 3367, 102]
after decoding: [CLS] this is it [SEP] for now atleast [SEP]


In [11]:
example_tokenizer = tokenizer("this is it","for now atleast")
example_tokenizer

{'input_ids': [101, 2023, 2003, 2009, 102, 2005, 2085, 2012, 19738, 3367, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

___

### Data Prep

In [12]:
# def tokenize_function(example):
#     return tokenizer(example["premise"], example["hypothesis"], truncation=True, padding=True, return_tensors="pt")

In [13]:
def tokenize_function(text):
    text = text.values.tolist()
    return tokenizer(text=text, padding=True, return_tensors="pt")

In [14]:
train_encoded = tokenize_function(X_train)
val_encoded = tokenize_function(X_val)
test_encoded = tokenize_function(X_test)

In [16]:
# Convert labels to tensors
y_train = torch.tensor(y_train.values)
y_val = torch.tensor(y_val.values)
y_test = torch.tensor(y_test.values)

In [17]:
print(type(train_encoded))
train_encoded.keys()

<class 'transformers.tokenization_utils_base.BatchEncoding'>


dict_keys(['input_ids', 'attention_mask'])

In [18]:
# label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

___

### Training

In [19]:
# Training loop
num_epochs = 3
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [21]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [23]:
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for i in tqdm_notebook(range(0, len(train_encoded['input_ids']), batch_size), desc=f"Epoch {epoch + 1}", dynamic_ncols=True):
        input_ids = train_encoded['input_ids'][i:i+batch_size].to(device)
        attention_mask = train_encoded['attention_mask'][i:i+batch_size].to(device)
        labels = y_train[i:i+batch_size].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_encoded['input_ids']) / batch_size
    print(f"Epoch {epoch + 1} - Train Loss: {train_loss:.4f}")

    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    for i in tqdm_notebook(range(0, len(val_encoded['input_ids']), batch_size), desc=f"Validation - Epoch {epoch + 1}", dynamic_ncols=True):
        input_ids = val_encoded['input_ids'][i:i+batch_size].to(device)
        attention_mask = val_encoded['attention_mask'][i:i+batch_size].to(device)
        labels = y_val[i:i+batch_size].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_encoded['input_ids']) / batch_size
    accuracy = correct / total * 100.0
    print(f"Validation - Epoch {epoch + 1} - Loss: {val_loss:.4f} - Accuracy: {accuracy:.2f}%")


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(0, len(train_encoded['input_ids']), batch_size), desc=f"Epoch {epoch + 1}", dynamic_ncols=True):


Epoch 1:   0%|          | 0/138 [00:00<?, ?it/s]

Epoch 1 - Train Loss: 0.8990


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(0, len(val_encoded['input_ids']), batch_size), desc=f"Validation - Epoch {epoch + 1}", dynamic_ncols=True):


Validation - Epoch 1:   0%|          | 0/35 [00:00<?, ?it/s]

Validation - Epoch 1 - Loss: 0.8797 - Accuracy: 61.36%


Epoch 2:   0%|          | 0/138 [00:00<?, ?it/s]

Epoch 2 - Train Loss: 0.7121


Validation - Epoch 2:   0%|          | 0/35 [00:00<?, ?it/s]

Validation - Epoch 2 - Loss: 0.8867 - Accuracy: 62.64%


Epoch 3:   0%|          | 0/138 [00:00<?, ?it/s]

Epoch 3 - Train Loss: 0.5539


Validation - Epoch 3:   0%|          | 0/35 [00:00<?, ?it/s]

Validation - Epoch 3 - Loss: 0.9062 - Accuracy: 65.45%


In [24]:
# Save the trained model
model.save_pretrained("model_checkpoint/distilbert_classifier")

___

### Inference

In [27]:
# import torch
# # Load the trained model
# model = DistilBertForSequenceClassification.from_pretrained("distilbert_classifier")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# # Tokenize the test data
# test_encoded = tokenize_text(X_test["Premise"] + " " + X_test["Hypothesis"])

# # Convert labels to tensors
# y_test = torch.tensor(y_test)

from tqdm.notebook import tqdm

# Evaluate the model on the test data
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for i in tqdm(range(0, len(test_encoded['input_ids']), batch_size), desc="Testing", dynamic_ncols=True):
        input_ids = test_encoded['input_ids'][i:i+batch_size].to(device)
        attention_mask = test_encoded['attention_mask'][i:i+batch_size].to(device)
        labels = y_test[i:i+batch_size].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        test_loss += loss.item()

        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss /= len(test_encoded['input_ids']) / batch_size
accuracy = correct / total * 100.0

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.2f}%")


Testing:   0%|          | 0/43 [00:00<?, ?it/s]

Test Loss: 0.8877
Test Accuracy: 63.76%


In [42]:
X_test

Unnamed: 0,index,premise,hypothesis
0,6574,Benchmarked by U.S.,Canada benchmarked it.
1,2934,Because the paper did not say that.,The paper did not state as much.
2,5931,"The sunlight, piercing through the branches, t...",The auburn of her hair became golden then the ...
3,3251,"All of them slept in one cave on animal skins,...",They used multiple clay pots to cook their food.
4,2134,looking at that and you know and if it's if it...,I watched Thirty Something a few times because...
...,...,...,...
1369,684,Tommy felt his ascendancy less sure than a mom...,A moment ago his ascendancy was certain.
1370,3951,Abortive countrywide revolts,The country is in unrest.
1371,389,i mean that's a real attractive option if you ...,The phone modem was easy to set up and use.
1372,4543,Took forever.,Was quick


In [43]:
# # Tokenize the test data
# test_encoded = tokenize_text(X_test["Premise"] + " " + X_test["Hypothesis"])

# # Convert labels to tensors
# y_test = torch.tensor(y_test)

# Evaluate the model on the test data and print results
model.eval()
test_loss = 0.0
correct = 0
total = 0

print("\nTest Results:")

with torch.no_grad():
    for i in tqdm(range(0, len(test_encoded['input_ids']), batch_size), desc="Testing", dynamic_ncols=True):
        input_ids = test_encoded['input_ids'][i:i+batch_size].to(device)
        attention_mask = test_encoded['attention_mask'][i:i+batch_size].to(device)
        labels = y_test[i:i+batch_size].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        test_loss += loss.item()

        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Get the corresponding input sentences, predicted labels, and actual labels
        input_sentences = X_test.iloc[i:i+batch_size]
        predicted_labels = predicted.tolist()
        actual_labels = labels.tolist()
        print(type(input_sentences))
        print(input_sentences.columns)
        # Print the results for this batch
        for j in range(len(input_sentences)):
            print(f"Input Sentences: {input_sentences['premise'].iloc[j]} | {input_sentences.iloc[j][['hypothesis']]}")
            print(f"Predicted Label: {predicted_labels[j]}, Actual Label: {actual_labels[j]}")
            print()

test_loss /= len(test_encoded['input_ids']) / batch_size
accuracy = correct / total * 100.0

print(f"\nTest Loss: {test_loss:.4f}")
print(f"Test Accuracy: {accuracy:.2f}%")



Test Results:


Testing:   0%|          | 0/43 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
Index(['index', 'premise', 'hypothesis'], dtype='object')
Input Sentences: Benchmarked by U.S. | hypothesis    Canada benchmarked it.
Name: 0, dtype: object
Predicted Label: 0, Actual Label: 2

Input Sentences: Because the paper did not say that. | hypothesis    The paper did not state as much.
Name: 1, dtype: object
Predicted Label: 0, Actual Label: 0

Input Sentences: The sunlight, piercing through the branches, turned the auburn of her hair to quivering gold.  | hypothesis    The auburn of her hair became golden then the ...
Name: 2, dtype: object
Predicted Label: 0, Actual Label: 0

Input Sentences: All of them slept in one cave on animal skins, a single large clay pot cooked all of their food. | hypothesis    They used multiple clay pots to cook their food.
Name: 3, dtype: object
Predicted Label: 0, Actual Label: 2

Input Sentences: looking at that and you know and if it's if it's funny or if it keeps my interest if it's exciting i'll watch it