In [1]:
import pandas as pd
from google.colab import drive

# Make the Google Drive folder that holds the dataset visible to this runtime.
drive.mount('/content/drive')

# Read the line-delimited JSON file, discard rows with missing values, remove
# the third column, and convert what is left to a NumPy array. As the printed
# output shows, column 0 then holds the 0/1 value and column 1 the headline.
df = (
    pd.read_json('/content/drive/MyDrive/Sarcasm_Headlines_Dataset_v2.json', lines=True)
    .dropna()
    .pipe(lambda frame: frame.drop(frame.columns[2], axis=1))
    .to_numpy()
)
print(df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[[1 'thirtysomething scientists unveil doomsday clock of hair loss']
 [0
  'dem rep. totally nails why congress is falling short on gender, racial equality']
 [0 'eat your veggies: 9 deliciously different recipes']
 ...
 [0
  'the most beautiful acceptance speech this week came from a queer korean']
 [1 'mars probe destroyed by orbiting spielberg-gates space palace']
 [1 'dad clarifies this not a food stop']]


In [2]:
pip install -U sentence-transformers



In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import numpy as np

In [4]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `.to(device)` already places the model on the selected device. The former
# unconditional `model.cuda()` call was redundant on GPU runtimes and raised on
# CPU-only runtimes, which defeated the fallback above.
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-mpnet-base-v2").to(device)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

# Loss applied to the classifier logits and integer class labels in train().
loss = torch.nn.CrossEntropyLoss()

# NOTE(review): betas=(0, 0.9) sets Adam's first-moment coefficient to 0, i.e.
# no momentum on the gradient average. Kept as-is; confirm this is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0, 0.9))

# gamma=1 multiplies the learning rate by 1 at every step, so the rate stays
# constant; the scheduler is effectively a placeholder.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Work with the first 2000 rows only: column 1 becomes the inputs (x) and
# column 0 the targets (y).
x, y = df[:2000, 1], df[:2000, 0]
print(x)
print(y)

# Sum of the target column; with the 0/1 values printed above this is the
# number of rows whose target is 1.
count = sum(y)
print(count)

['thirtysomething scientists unveil doomsday clock of hair loss'
 'dem rep. totally nails why congress is falling short on gender, racial equality'
 'eat your veggies: 9 deliciously different recipes' ...
 'area mom raving about phoenix airport'
 'seaworld to discontinue great white shark ride'
 '3-day waiting period leads to far more feasible murder plot']
[1 0 0 ... 1 1 1]
975


In [6]:
def balanced_subsample(x, y, subsample_size=1.0, random_state=None):
    """Downsample every class to the same number of examples.

    Each distinct value in ``y`` is treated as a class. All classes are cut
    down to the size of the smallest class (optionally scaled by
    ``subsample_size``), and the per-class slices are concatenated in the
    sorted order of the class labels.

    Args:
        x: Array-like of samples; indexed along its first axis.
        y: Array-like of class labels, one per sample in ``x``.
        subsample_size: Fraction of the smallest class size to keep per class.
            Values below 1 shrink every class to
            ``int(min_class_size * subsample_size)``; values of 1 or more keep
            the full smallest-class size.
        random_state: Optional seed. When given, a dedicated
            ``np.random.RandomState`` drives the shuffling so the result is
            reproducible; when ``None`` the global NumPy RNG is used.

    Returns:
        Tuple ``(xs, ys)`` of NumPy arrays with the selected samples and their
        labels. ``ys`` takes its dtype from the class labels (e.g. integer
        labels stay integers) instead of always being float.

    Raises:
        ValueError: If ``y`` contains no samples.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    classes = np.unique(y)
    if classes.size == 0:
        raise ValueError("balanced_subsample() requires at least one sample")

    # Boolean-mask indexing copies, so shuffling below never touches the input.
    per_class = [(label, x[y == label]) for label in classes]
    min_elems = min(len(members) for _, members in per_class)

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems * subsample_size)

    # Both the np.random module and RandomState expose shuffle().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    xs = []
    ys = []
    for label, members in per_class:
        # Only classes that must lose samples are shuffled before slicing.
        if len(members) > use_elems:
            rng.shuffle(members)
        xs.append(members[:use_elems])
        # np.full infers the dtype from the label itself, so integer labels
        # are not silently turned into floats and string labels work too.
        ys.append(np.full(use_elems, label))

    return np.concatenate(xs), np.concatenate(ys)

# Replace x and y with a class-balanced subset (full smallest-class size).
x, y = balanced_subsample(x, y, subsample_size=1.0)

In [7]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import TensorDataset
batch_size = 8
# The datasets below are fully tokenized tensors held in memory, so fetching a
# batch is just indexing; worker processes would only add start-up overhead
# (the previous value of 32 spawned 32 processes per loader).
num_workers = 0
# Pinned host memory only helps when batches are copied to a CUDA device.
pin_memory = torch.cuda.is_available()

# Hold out 33% of the balanced data for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
x_train = x_train.tolist()
x_test = x_test.tolist()
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

print(y_train.shape, y_test.shape)

# Tokenize each split in one call; padding=True pads to the longest sequence
# of that split and truncation caps sequences at max_length tokens.
train_encode = tokenizer(text=x_train, return_tensors='pt', add_special_tokens=True, padding=True, truncation = True, max_length = 512)
test_encode = tokenizer(text=x_test, return_tensors='pt', add_special_tokens=True, padding=True, truncation = True, max_length = 512)

train_data = TensorDataset(train_encode['input_ids'], train_encode['attention_mask'], y_train)
test_data = TensorDataset(test_encode['input_ids'], test_encode['attention_mask'], y_test)

# Training batches are reshuffled every epoch (seeded for reproducibility);
# a sequential sampler would feed the identical batch order in every epoch.
train_dataloader = DataLoader(train_data, shuffle=True,
                        generator=torch.Generator().manual_seed(42),
                        batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers)
# Evaluation keeps a fixed order.
val_dataloader = DataLoader(test_data, sampler= SequentialSampler(test_data),
                        batch_size=batch_size, pin_memory=pin_memory, num_workers=num_workers)

torch.Size([1306]) torch.Size([644])




In [8]:
def train(train_dataloader, val_dataloader, num_epochs):
    """Fine-tune the notebook-level ``model`` on ``train_dataloader``.

    Relies on the notebook globals ``model``, ``optimizer``, ``scheduler``,
    ``loss`` and ``device``.

    Args:
        train_dataloader: Iterable of batches indexed as
            ``(input_ids, attention_mask, labels)``.
        val_dataloader: Currently unused; kept so existing calls keep working.
        num_epochs: Number of full passes over ``train_dataloader``.

    Returns:
        None. The model parameters are updated in place.
    """
    for epoch in range(num_epochs):
        # Re-enter training mode each epoch so an evaluation run between
        # epochs (which switches to eval mode) cannot leak into training.
        model.train()
        for batch in train_dataloader:
            input_ids = batch[0].to(device)
            input_attn = batch[1].to(device)
            labels = batch[2].to(device)

            # Clear gradients once per step (the duplicate call was a no-op).
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=input_attn).logits
            l = loss(logits, labels)
            l.backward()
            optimizer.step()

        # StepLR counts epochs, so advance it once per epoch, not per batch.
        scheduler.step()

from sklearn.metrics import classification_report

def val(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print a classification report.

    Batches are indexed as ``(input_ids, attention_mask, labels)`` and moved to
    the notebook-level ``device``. The predicted class for each example is the
    argmax over the model's logits. Nothing is returned; the report is printed.
    """
    model.eval()  # evaluation mode for the whole pass
    y_true, y_pred = [], []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (batch[i].to(device) for i in range(3))
            outputs = model(ids, attention_mask=mask)

            # Highest-scoring class per example.
            guesses = outputs.logits.argmax(dim=-1)

            # Accumulate on the CPU as NumPy values.
            y_true.extend(targets.cpu().numpy())
            y_pred.extend(guesses.cpu().numpy())

    # Build the report first, then emit the header followed by the report.
    report = classification_report(y_true, y_pred)
    print("Classification Report:", report, sep="\n")


In [9]:
# Fine-tune for 5 epochs, then print the classification report for the
# held-out split.
train(train_dataloader, val_dataloader, num_epochs=5)
val(model, dataloader=val_dataloader)

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       320
           1       0.94      0.79      0.86       324

    accuracy                           0.87       644
   macro avg       0.88      0.87      0.87       644
weighted avg       0.88      0.87      0.87       644

