In [11]:
# Install required packages
!pip install torch torchvision torchaudio
!pip install transformers
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
!pip install textblob
!pip install gensim
!python -m spacy download en_core_web_sm
!pip install sentencepiece
!pip install datsets transformers[sentencepiece]



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2023-11-15 18:08:44.066785: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-15 18:08:44.066849: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-15 18:08:44.066894: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now l

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from transformers import pipeline
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [25]:
import re
import numpy as np
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords

folder_path = "/content/"


def _read_split(file_name):
    """Read one headerless two-column TSV split into a ``Text``/``Label`` frame.

    The files carry no header row. Reading them with ``header=None`` keeps the
    first record as ordinary data, instead of letting pandas consume it as
    column names (where it can be mangled or given a different dtype from the
    rest of the column) and then re-inserting it by hand.

    Args:
        file_name: Name of the TSV file inside ``folder_path``.

    Returns:
        A DataFrame with columns ``Text`` and ``Label`` and a fresh 0..n-1 index.
    """
    return pd.read_csv(
        folder_path + file_name,
        sep="\t",
        header=None,
        names=["Text", "Label"],
    )


train_data = _read_split("NLP_ass_train.tsv")
test_data = _read_split("NLP_ass_test.tsv")
val_data = _read_split("NLP_ass_valid.tsv")

In [26]:
def func(txt):
    """Build the classification prompt for a single text snippet.

    The instruction text tells the model that the snippet is enclosed within
    three backticks, so the snippet is wrapped accordingly (previously it was
    appended bare, contradicting the instruction).

    Args:
        txt: The snippet to classify. Any object is accepted; it is rendered
            with ``str`` formatting.

    Returns:
        The full prompt string: task description, label definitions and the
        backtick-delimited snippet.
    """
    instructions = """Task: Text Classification
    Definitions:
    1. hatespeech: Any speech or text that specifically targets and attacks a person or group based on attributes such as race, religion, ethnic origin, national origin, gender, disability, sexual orientation, or gender identity.
    2. offensive: Text or speech containing abusive slurs or derogatory terms, which may not necessarily target a specific attribute but is disrespectful or inappropriate.
    3. normal: Text that is neutral, respectful, and adheres to social norms.
    Your task is to classify the following text snippet, enclosed within three backticks, as either normal, offensive, or hatespeech based on the given definitions. Please provide the classification as either normal, offensive, or hatespeech. Be precise in your classification according to the provided definitions.
    Text to classify: """
    return f"{instructions}```{txt}```"

# Build one prompt per row from the text column.
# `DataFrame.apply(func)` would call `func` once per *column* (passing a whole
# Series), and the result would not align with the row index, leaving the
# `Prompt` column filled with NaN. `Series.apply` calls it once per text.
train_data['Prompt'] = train_data['Text'].apply(func)
val_data['Prompt'] = val_data['Text'].apply(func)
test_data['Prompt'] = test_data['Text'].apply(func)

In [27]:
# Create a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes ``(Prompt, Text)`` pairs from a DataFrame.

    Args:
        data: pandas DataFrame with ``Prompt`` and ``Text`` columns (and a
            ``Label`` column when ``label2id`` is given).
        tokenizer: Callable invoked as ``tokenizer(prompt, text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` that
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Fixed sequence length every example is padded/truncated to.
        label2id: Optional mapping from the values in the ``Label`` column to
            integer class ids. When provided, each item also contains a
            ``labels`` tensor; when omitted, items contain only the two input
            tensors (the original behaviour).
    """

    def __init__(self, data, tokenizer, max_length=128, label2id=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at *position* ``idx``.

        Raises:
            IndexError: If ``idx`` is out of range.
            KeyError: If ``label2id`` is set and the row's label is not in it.
        """
        # `.iloc` selects a row by position. Plain `data[idx]` on a DataFrame is
        # a column lookup and raises KeyError for integer indices.
        row = self.data.iloc[idx]
        encoded = self.tokenizer(
            row['Prompt'],
            row['Text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # The tokenizer returns (1, max_length) tensors; flatten to (max_length,)
        # so the DataLoader can stack them into (batch, max_length).
        item = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten()
        }
        if self.label2id is not None:
            item['labels'] = torch.tensor(self.label2id[row['Label']], dtype=torch.long)
        return item

# Instantiate the tokenizer and model
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The task distinguishes three classes (normal, offensive, hatespeech); without
# `num_labels` the freshly initialised classification head defaults to 2 outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Create instances of the custom dataset
train_dataset = CustomDataset(train_data, tokenizer)
val_dataset = CustomDataset(val_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

# DataLoader for batching (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Set up optimizer and training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # batches are moved to this device below
# torch's AdamW replaces the deprecated `transformers.AdamW`; `eps` and
# `weight_decay` are set to that class's defaults so the optimisation is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
num_epochs = 3


def _prepare_batch(batch, device):
    """Move a collated batch to ``device`` and split it into inputs and labels.

    If the batch carries a ``labels`` entry it is used as the target. Otherwise
    a placeholder target of class 1 is generated with one entry per example.
    The batch size is read from ``input_ids``: ``len(batch)`` would be the
    number of dict keys, not the number of examples.
    """
    tensors = {key: value.to(device) for key, value in batch.items()}
    labels = tensors.pop('labels', None)
    if labels is None:
        batch_size = tensors['input_ids'].size(0)
        labels = torch.ones(batch_size, dtype=torch.long, device=device)  # Modify this based on your task
    return tensors, labels


# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        inputs, labels = _prepare_batch(batch, device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Validation Epoch {epoch + 1}/{num_epochs}'):
            inputs, labels = _prepare_batch(batch, device)
            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()

    # Mean of the per-batch validation losses
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {val_loss / len(val_loader)}')

In [None]:
# Save the fine-tuned model together with its tokenizer so the directory can
# be reloaded on its own.
save_dir = "path/to/save/fine-tuned/model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Test the saved model
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
# A fine-tuned sequence classifier is served by the 'text-classification'
# pipeline; 'zero-shot-classification' expects an NLI model plus candidate labels.
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
# The pipeline takes strings or {"text", "text_pair"} dicts, not a DataFrame.
# Each row is fed as the same (Prompt, Text) pair used when tokenizing for training.
test_inputs = [
    {"text": prompt, "text_pair": text}
    for prompt, text in zip(test_data['Prompt'], test_data['Text'])
]
result = classifier(test_inputs, truncation=True, max_length=128)
print(result)
