In [3]:
import pandas as pd

In [4]:
data = pd.read_csv('/kaggle/input/llm-dataset/filtered_data_clean.csv')

In [5]:
data.head()

Unnamed: 0,text,label,clean_text
0,We should keep the Electoral College for a num...,0,keep electoral college number reasons usually ...
1,Limiting car usage can actually be effective b...,0,limiting car usage actually effective belief l...
2,"Dear Principal,\n\nAfter school or during scho...",0,dear principal school school activities like b...
3,"Many people think that aliens are real, but th...",0,many people think aliens real theyre many peop...
4,I think if a student has a C average he or she...,0,think student c average may able participate s...


In [6]:
texts = data['clean_text'].tolist()
labels = data['label'].tolist()

In [7]:
train_texts = texts[:25000] + texts[120001:120001+25000]
train_labels = labels[:25000] + labels[120001:120001+25000]

In [8]:
from transformers import XLNetForSequenceClassification, XLNetTokenizer
model_name = "xlnet-base-cased"
model = XLNetForSequenceClassification.from_pretrained(model_name)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
tokenizer = XLNetTokenizer.from_pretrained("/kaggle/input/tokenizer-xlnet")

In [10]:
train_texts = [str(text) for text in train_texts]

In [11]:
tokenized_texts = tokenizer(train_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

In [12]:
import torch
label_tensors = torch.tensor(train_labels, dtype=torch.long)

In [13]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(tokenized_texts.input_ids, tokenized_texts.attention_mask, label_tensors)


In [14]:
import random 
indices = list(range(len(dataset)))
random.shuffle(indices)

In [15]:
dataset = TensorDataset(*[torch.index_select(t, 0, torch.tensor(indices)) for t in dataset.tensors])


In [16]:
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [17]:
count_0 = 0
count_1 = 0

for batch in dataloader:
    labels = batch[-1]  

    for label in labels:
        if label.item() == 1:
            count_1 += 1
        else:
            count_0 += 1
    break

print("Count of label 0:", count_0)
print("Count of label 1:", count_1)

Count of label 0: 13
Count of label 1: 19


In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [19]:
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [20]:
learning_rate = 5e-5

In [21]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
from tqdm.autonotebook import tqdm
from sklearn.metrics import accuracy_score



In [23]:
lossi = []

In [24]:
for epoch in range(1):
    loss_ = 0
    predictions = []
    true_labels =  []

    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        loss_ += loss.item()

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    loss_ = loss_/len(dataloader)
    print(f"Epoch {epoch + 1} - Training Loss: {loss_:.4f}")

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch + 1} - Training Accuracy: {accuracy:.4f}")

    lossi.append(loss_)

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.0572
Epoch 1 - Training Accuracy: 0.9812


In [25]:
model.to('cpu')

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [26]:
torch.save(model, 'model_XLNet.pt')

In [27]:
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [28]:
for epoch in range(1):
    loss_ = 0
    predictions = []
    true_labels =  []

    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        loss_ += loss.item()

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

    loss_ = loss_/len(dataloader)
    print(f"Epoch {epoch + 1} - Training Loss: {loss_:.4f}")

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Epoch {epoch + 1} - Training Accuracy: {accuracy:.4f}")

    lossi.append(loss_)

  0%|          | 0/1563 [00:00<?, ?it/s]

Epoch 1 - Training Loss: 0.0228
Epoch 1 - Training Accuracy: 0.9940


In [29]:
model.to("cpu")

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [30]:
torch.save(model,'model_XLNet.pt')