In [16]:
!pip install transformers -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...


Defaulting to user installation because normal site-packages is not writeable


In [17]:
import torch


In [18]:
torch.cuda.is_available()

True

In [19]:
torch.cuda.device_count()
torch.cuda.current_device()
torch.cuda.device(0)
torch.cuda.get_device_name(0)

'NVIDIA A40'

# The below code is for checking the output(xj) for 50 samples and adjusting temperature,top_k and top_p 

In [3]:
import pandas as pd
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  # Replace this path with actual path of data
data = pd.read_csv(file_path)

texts = data['part1'].head(10).tolist() # considering only 50 samples each of 10 from each model

bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

def bert_completion(text):
    masked_text = text + " [MASK]."
    return bert_model(masked_text)[0]['sequence']

# removing repeated input text and clean tokens)
def clean_output(text, output):
    # Remove exact repetitions of input text
    if output.startswith(text):
        output = output[len(text):].strip()
    
    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))
        cleaned_output = cleaned_output.replace('<token>', '').strip()
    return cleaned_output

def enforce_min_length(output, min_tokens=10):
    tokens = output.split()
    if len(tokens) < min_tokens:
        output += " " + " ".join(["<token>"] * (min_tokens - len(tokens)))
    return output

def distilgpt2_completion(text):
    raw_output = distilgpt2_model(text, max_length=50, temperature=0.7, top_p=0.85)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_length(raw_output)

def gpt_neo_completion(text):
    raw_output = gpt_neo_model(text, max_length=50, temperature=0.5, top_k=30, top_p=0.85)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_length(raw_output)

def opt_completion(text):
    raw_output = opt_model(text, max_length=50, temperature=0.7, top_k=30)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_length(raw_output)

def flan_t5_completion(text):
    raw_output = flan_t5_model(text, max_length=50, temperature=0.7)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_length(raw_output)

completions = []

for text in texts:
    completions.append([text, bert_completion(text), 'BERT'])
    completions.append([text, distilgpt2_completion(text), 'DistilGPT-2'])
    completions.append([text, gpt_neo_completion(text), 'GPT-Neo'])
    completions.append([text, opt_completion(text), 'OPT'])
    completions.append([text, flan_t5_completion(text), 'Flan-T5'])

completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

output_file_path = 'output_check_enhanced.csv'  # Replace with actual path as needed
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more 

Sentence completions generated and saved to: output_check_enhanced.csv


# The below code is another trail for checking output for 50 sample outputs xj and adjusting temperature,top_k and top_p

In [4]:
import pandas as pd
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  # Replace with actual path of data
data = pd.read_csv(file_path)

texts = data['part1'].head(10).tolist()

bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

def bert_completion(text):
    masked_text = text + " [MASK]."
    return bert_model(masked_text)[0]['sequence']

# removing repeated input text and clean tokens
def clean_output(text, output):
    # Removing exact repetitions of input text
    if output.startswith(text):
        output = output[len(text):].strip()
    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))
    
    return cleaned_output

def enforce_min_tokens(output, min_tokens=5):
    tokens = output.split()
    if len(set(tokens)) < min_tokens:
        return "Generated text is too short or repetitive."
    return output

def distilgpt2_completion(text):
    raw_output = distilgpt2_model(text, max_length=30, temperature=0.7, top_p=0.85)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_tokens(raw_output)

def gpt_neo_completion(text):
    raw_output = gpt_neo_model(text, max_length=30, temperature=0.5, top_k=30, top_p=0.85)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_tokens(raw_output)

def opt_completion(text):
    raw_output = opt_model(text, max_length=30, temperature=0.7, top_k=30)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_tokens(raw_output)

def flan_t5_completion(text):
    raw_output = flan_t5_model(text, max_length=30, temperature=0.7)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return enforce_min_tokens(raw_output)

completions = []

for text in texts:
    completions.append([text, bert_completion(text), 'BERT'])
    completions.append([text, distilgpt2_completion(text), 'DistilGPT-2'])
    completions.append([text, gpt_neo_completion(text), 'GPT-Neo'])
    completions.append([text, opt_completion(text), 'OPT'])
    completions.append([text, flan_t5_completion(text), 'Flan-T5'])

completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

output_file_path = 'output_check_enhanced_finally.csv'  # Replace with actual path accprdingly
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more 

Sentence completions generated and saved to: output_check_enhanced_finally.csv


# The below code is for checking output xj by one more trail for 50 samples and adjusting temperature,top_k and top_p along with performing data cleaning

In [5]:
import pandas as pd
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  # Replace with actual path of the data
data = pd.read_csv(file_path)

texts = data['part1'].head(10).tolist()

bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

def bert_completion(text):
    masked_text = text + " [MASK]."
    return bert_model(masked_text)[0]['sequence']

def clean_output(text, output):
    if output.startswith(text):
        output = output[len(text):].strip()

    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))
    
    return cleaned_output

def improve_output(raw_output):
    if len(set(raw_output.split())) < 5:
        return "The generated text was too repetitive. Trying again with more context."
    return raw_output

def distilgpt2_completion(text):
    raw_output = distilgpt2_model(text, max_length=60, temperature=0.6, top_p=0.8)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return improve_output(raw_output)

def gpt_neo_completion(text):
    raw_output = gpt_neo_model(text, max_length=60, temperature=0.5, top_k=40, top_p=0.8)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return improve_output(raw_output)

def opt_completion(text):
    raw_output = opt_model(text, max_length=60, temperature=0.7, top_k=40)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return improve_output(raw_output)

def flan_t5_completion(text):
    raw_output = flan_t5_model(text, max_length=60, temperature=0.6)[0]['generated_text']
    raw_output = clean_output(text, raw_output)
    return improve_output(raw_output)

completions = []

for text in texts:
    completions.append([text, bert_completion(text), 'BERT'])
    completions.append([text, distilgpt2_completion(text), 'DistilGPT-2'])
    completions.append([text, gpt_neo_completion(text), 'GPT-Neo'])
    completions.append([text, opt_completion(text), 'OPT'])
    completions.append([text, flan_t5_completion(text), 'Flan-T5'])

completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

output_file_path = 'output_check_final000.csv'  # Replace with actual path
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more 

Sentence completions generated and saved to: output_check_final000.csv


# The below code is final version check for generating 2500 texts from 5 different Large Language Models

In [4]:
import pandas as pd
from transformers import pipeline
import torch
from concurrent.futures import ThreadPoolExecutor

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  # Replace with actual path of data
data = pd.read_csv(file_path)

texts = data['part1'].head(50).tolist()
=batch_size = 50

bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

def clean_output(text, output):
    if output.startswith(text):
        output = output[len(text):].strip()

    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))
    
    return cleaned_output

def improve_output(raw_output):
    if len(set(raw_output.split())) < 5:
        return "The generated text was too repetitive. Trying again with more context."
    return raw_output

def generate_completions_batch(text_batch):
    batch_completions = []

    for text in text_batch:
        completions = [
            [text, clean_output(text, bert_model(text + " [MASK].")[0]['sequence']), 'BERT'],
            [text, clean_output(text, distilgpt2_model(text, max_length=60, temperature=0.6, top_p=0.8)[0]['generated_text']), 'DistilGPT-2'],
            [text, clean_output(text, gpt_neo_model(text, max_length=60, temperature=0.5, top_k=40, top_p=0.8)[0]['generated_text']), 'GPT-Neo'],
            [text, clean_output(text, opt_model(text, max_length=60, temperature=0.7, top_k=40)[0]['generated_text']), 'OPT'],
            [text, clean_output(text, flan_t5_model(text, max_length=60, temperature=0.6)[0]['generated_text']), 'Flan-T5']
        ]
        batch_completions.extend(completions)
    
    return batch_completions

def process_batches(texts, batch_size):
    completions = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []

        for i in range(0, len(texts), batch_size):
            text_batch = texts[i:i+batch_size]
            futures.append(executor.submit(generate_completions_batch, text_batch))

        for future in futures:
            completions.extend(future.result())

    return completions

completions = process_batches(texts, batch_size)

completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

output_file_path = 'output_check_parallel_50.csv'  # Replace with the actual path to save
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Sentence completions generated and saved to: output_check_parallel_50.csv


# The below code is final version  for generating 17,500 texts from 5 different models

In [6]:
import pandas as pd
from transformers import pipeline
import torch
from concurrent.futures import ThreadPoolExecutor

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  
data = pd.read_csv(file_path)

# Taking all 3500 samples for full run of 3500 head
texts = data['part1'].head(3500).tolist()  

batch_size = 50

bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

def clean_output(text, output):
    if output.startswith(text):
        output = output[len(text):].strip()

    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))

    words = output.split()
    output = ' '.join([words[i] for i in range(len(words)) if i == 0 or words[i] != words[i-1]])
    
    return cleaned_output

def improve_output(raw_output):
    if len(set(raw_output.split())) < 5:  
        return "The generated text was too repetitive. Trying again with more context."
    return raw_output

def generate_completions_batch(text_batch):
    batch_completions = []

    for text in text_batch:
        completions = [
            [text, clean_output(text, bert_model(text + " [MASK].")[0]['sequence']), 'BERT'],
            [text, clean_output(text, distilgpt2_model(text, max_length=60, temperature=0.6, top_p=0.85, do_sample=True)[0]['generated_text']), 'DistilGPT-2'],
            [text, clean_output(text, gpt_neo_model(text, max_length=60, temperature=0.7, top_k=50, top_p=0.9, do_sample=True)[0]['generated_text']), 'GPT-Neo'],
            [text, clean_output(text, opt_model(text, max_length=60, temperature=0.7, top_k=50, do_sample=True)[0]['generated_text']), 'OPT'],
            [text, clean_output(text, flan_t5_model(text, max_length=60, temperature=0.7, do_sample=True)[0]['generated_text']), 'Flan-T5']
        ]
        batch_completions.extend(completions)
    
    return batch_completions

def process_batches(texts, batch_size):
    completions = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []

        for i in range(0, len(texts), batch_size):
            text_batch = texts[i:i+batch_size]
            futures.append(executor.submit(generate_completions_batch, text_batch))

        for future in futures:
            completions.extend(future.result())

    return completions

completions = process_batches(texts, batch_size)

completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

output_file_path = 'output_check_parallel_3500_final.csv' 
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more 

Sentence completions generated and saved to: output_check_parallel_3500_final.csv


# Implementingdeep learning classsifier with bert (below)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# xi and xj concatanated and  fed to the llm as input to classify

In [10]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Modify the path accordingly
data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

# Concatenate 'Original Text' and 'Completion' to create a single feature
data['input_output'] = data['Original Text'] + " " + data['Completion']

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['input_output'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the input text
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TextDataset(train_texts.values, train_labels.values, tokenizer, max_len=128)
val_dataset = TextDataset(val_texts.values, val_labels.values, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

class LLMClassifier(nn.Module):
    def __init__(self, n_classes):
        super(LLMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']
        output = self.drop(pooled_output)
        return self.fc(output)

model = LLMClassifier(n_classes=len(label_encoder.classes_))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}')

torch.save(model.state_dict(), 'llm_classifier_model.pth')


Using device: cuda




Epoch 1/3
Training Accuracy: 0.7336, Training Loss: 0.7018
Validation Accuracy: 0.8510, Validation Loss: 0.4115
Epoch 2/3
Training Accuracy: 0.8861, Training Loss: 0.3330
Validation Accuracy: 0.8746, Validation Loss: 0.3530
Epoch 3/3
Training Accuracy: 0.9313, Training Loss: 0.2051
Validation Accuracy: 0.8793, Validation Loss: 0.3699


# Added xi and xj as different columns instead of concatenating , also added early stoppping for better results(implemented below)

In [11]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Adjust the path of CSV file 
data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_xi, val_xi, train_xj, val_xj, train_labels, val_labels = train_test_split(
    data['Original Text'], data['Completion'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, xi_texts, xj_texts, labels, tokenizer, max_len):
        self.xi_texts = xi_texts
        self.xj_texts = xj_texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.xi_texts)

    def __getitem__(self, idx):
        xi_text = self.xi_texts[idx]
        xj_text = self.xj_texts[idx]
        label = self.labels[idx]

        # Tokenize xi and xj separately
        xi_inputs = self.tokenizer.encode_plus(
            xi_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        xj_inputs = self.tokenizer.encode_plus(
            xj_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        xi_input_ids = xi_inputs['input_ids'].squeeze()
        xi_attention_mask = xi_inputs['attention_mask'].squeeze()
        xj_input_ids = xj_inputs['input_ids'].squeeze()
        xj_attention_mask = xj_inputs['attention_mask'].squeeze()

        return {
            'xi_input_ids': xi_input_ids,
            'xi_attention_mask': xi_attention_mask,
            'xj_input_ids': xj_input_ids,
            'xj_attention_mask': xj_attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TextDataset(train_xi.values, train_xj.values, train_labels.values, tokenizer, max_len=128)
val_dataset = TextDataset(val_xi.values, val_xj.values, val_labels.values, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

class LLMClassifier(nn.Module):
    def __init__(self, n_classes):
        super(LLMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, n_classes)  # Combining two outputs
        self.drop = nn.Dropout(p=0.3)

    def forward(self, xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask):
        xi_outputs = self.bert(input_ids=xi_input_ids, attention_mask=xi_attention_mask)
        xj_outputs = self.bert(input_ids=xj_input_ids, attention_mask=xj_attention_mask)

        combined_output = torch.cat((xi_outputs.pooler_output, xj_outputs.pooler_output), dim=1)

        output = self.drop(combined_output)
        return self.fc(output)

model = LLMClassifier(n_classes=len(label_encoder.classes_))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

early_stopping_patience = 2  
early_stopping_counter = 0
best_val_loss = float('inf')

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        xi_input_ids = data['xi_input_ids'].to(device)
        xi_attention_mask = data['xi_attention_mask'].to(device)
        xj_input_ids = data['xj_input_ids'].to(device)
        xj_attention_mask = data['xj_attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            xi_input_ids = data['xi_input_ids'].to(device)
            xi_attention_mask = data['xi_attention_mask'].to(device)
            xj_input_ids = data['xj_input_ids'].to(device)
            xj_attention_mask = data['xj_attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

epochs = 10
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}')

    # Early stopping logic implemeted below
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_llm_classifier_model_xi_xj_early_stopping.pth')
        print("Model saved with improved validation loss.")
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping triggered. Stopping training after {epoch+1} epochs.")
        break

print(f"Best Validation Loss: {best_val_loss:.4f}")


Using device: cuda




Epoch 1/10
Training Accuracy: 0.7759, Training Loss: 0.6359
Validation Accuracy: 0.8813, Validation Loss: 0.3661
Model saved with improved validation loss.
Epoch 2/10
Training Accuracy: 0.9095, Training Loss: 0.2914
Validation Accuracy: 0.8907, Validation Loss: 0.3345
Model saved with improved validation loss.
Epoch 3/10
Training Accuracy: 0.9432, Training Loss: 0.1898
Validation Accuracy: 0.8921, Validation Loss: 0.3461
Epoch 4/10
Training Accuracy: 0.9641, Training Loss: 0.1273
Validation Accuracy: 0.8950, Validation Loss: 0.3953
Early stopping triggered. Stopping training after 4 epochs.
Best Validation Loss: 0.3345


# Learning rate with 1*10^(-5) (below)

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# adjust the path accordingly
data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_xi, val_xi, train_xj, val_xj, train_labels, val_labels = train_test_split(
    data['Original Text'], data['Completion'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, xi_texts, xj_texts, labels, tokenizer, max_len):
        self.xi_texts = xi_texts
        self.xj_texts = xj_texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.xi_texts)

    def __getitem__(self, idx):
        xi_text = self.xi_texts[idx]
        xj_text = self.xj_texts[idx]
        label = self.labels[idx]

        # Tokenize xi and xj separately
        xi_inputs = self.tokenizer.encode_plus(
            xi_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        xj_inputs = self.tokenizer.encode_plus(
            xj_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        xi_input_ids = xi_inputs['input_ids'].squeeze()
        xi_attention_mask = xi_inputs['attention_mask'].squeeze()
        xj_input_ids = xj_inputs['input_ids'].squeeze()
        xj_attention_mask = xj_inputs['attention_mask'].squeeze()

        return {
            'xi_input_ids': xi_input_ids,
            'xi_attention_mask': xi_attention_mask,
            'xj_input_ids': xj_input_ids,
            'xj_attention_mask': xj_attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TextDataset(train_xi.values, train_xj.values, train_labels.values, tokenizer, max_len=128)
val_dataset = TextDataset(val_xi.values, val_xj.values, val_labels.values, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

class LLMClassifier(nn.Module):
    def __init__(self, n_classes):
        super(LLMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, n_classes)  # Combining two outputs
        self.drop = nn.Dropout(p=0.3)

    def forward(self, xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask):
        xi_outputs = self.bert(input_ids=xi_input_ids, attention_mask=xi_attention_mask)
        xj_outputs = self.bert(input_ids=xj_input_ids, attention_mask=xj_attention_mask)

        combined_output = torch.cat((xi_outputs.pooler_output, xj_outputs.pooler_output), dim=1)

        output = self.drop(combined_output)
        return self.fc(output)

model = LLMClassifier(n_classes=len(label_encoder.classes_))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

early_stopping_patience = 2  
early_stopping_counter = 0
best_val_loss = float('inf')

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        xi_input_ids = data['xi_input_ids'].to(device)
        xi_attention_mask = data['xi_attention_mask'].to(device)
        xj_input_ids = data['xj_input_ids'].to(device)
        xj_attention_mask = data['xj_attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            xi_input_ids = data['xi_input_ids'].to(device)
            xi_attention_mask = data['xi_attention_mask'].to(device)
            xj_input_ids = data['xj_input_ids'].to(device)
            xj_attention_mask = data['xj_attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

epochs = 10
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_llm_classifier_model_xi_xj_early_stopping.pth')
        print("Model saved with improved validation loss.")
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping triggered. Stopping training after {epoch+1} epochs.")
        break

print(f"Best Validation Loss: {best_val_loss:.4f}")


# using optuna for finding the best Hyper-Parameters

In [12]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import optuna

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Adjust the data path here accordingly
data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_xi, val_xi, train_xj, val_xj, train_labels, val_labels = train_test_split(
    data['Original Text'], data['Completion'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, xi_texts, xj_texts, labels, tokenizer, max_len):
        self.xi_texts = xi_texts
        self.xj_texts = xj_texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.xi_texts)

    def __getitem__(self, idx):
        xi_text = self.xi_texts[idx]
        xj_text = self.xj_texts[idx]
        label = self.labels[idx]

        xi_inputs = self.tokenizer.encode_plus(
            xi_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        xj_inputs = self.tokenizer.encode_plus(
            xj_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        xi_input_ids = xi_inputs['input_ids'].squeeze()
        xi_attention_mask = xi_inputs['attention_mask'].squeeze()
        xj_input_ids = xj_inputs['input_ids'].squeeze()
        xj_attention_mask = xj_inputs['attention_mask'].squeeze()

        return {
            'xi_input_ids': xi_input_ids,
            'xi_attention_mask': xi_attention_mask,
            'xj_input_ids': xj_input_ids,
            'xj_attention_mask': xj_attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset = TextDataset(train_xi.values, train_xj.values, train_labels.values, tokenizer, max_len=128)
val_dataset = TextDataset(val_xi.values, val_xj.values, val_labels.values, tokenizer, max_len=128)

class LLMClassifier(nn.Module):
    def __init__(self, n_classes, dropout):
        super(LLMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, n_classes)  # Combining two outputs
        self.drop = nn.Dropout(p=dropout)

    def forward(self, xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask):
        xi_outputs = self.bert(input_ids=xi_input_ids, attention_mask=xi_attention_mask)
        xj_outputs = self.bert(input_ids=xj_input_ids, attention_mask=xj_attention_mask)

        combined_output = torch.cat((xi_outputs.pooler_output, xj_outputs.pooler_output), dim=1)

        output = self.drop(combined_output)
        return self.fc(output)

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        xi_input_ids = data['xi_input_ids'].to(device)
        xi_attention_mask = data['xi_attention_mask'].to(device)
        xj_input_ids = data['xj_input_ids'].to(device)
        xj_attention_mask = data['xj_attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            xi_input_ids = data['xi_input_ids'].to(device)
            xi_attention_mask = data['xi_attention_mask'].to(device)
            xj_input_ids = data['xj_input_ids'].to(device)
            xj_attention_mask = data['xj_attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def objective(trial):
    lr = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
    dropout = trial.suggest_uniform('dropout', 0.1, 0.4)
    batch_size = trial.suggest_categorical('batch_size', [16,32, 64])

    model = LLMClassifier(n_classes=len(label_encoder.classes_), dropout=dropout)
    model = model.to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    best_val_acc = 0
    for epoch in range(3):  
        train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_acc, val_loss = eval_model(model, val_loader, criterion, device)

        best_val_acc = max(best_val_acc, val_acc)
    
    return best_val_acc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 8)

# Printing the best hyperparameters
print(f'Best hyperparameters: {study.best_params}')


[I 2024-10-05 14:07:37,694] A new study created in memory with name: no-name-659ab7b7-5d82-4bb8-975b-873e9ed4226f


Using device: cuda


  lr = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
  dropout = trial.suggest_uniform('dropout', 0.1, 0.4)
[I 2024-10-05 14:15:51,852] Trial 0 finished with value: 0.8967930029154519 and parameters: {'learning_rate': 1.615883626026264e-05, 'dropout': 0.25956591028700327, 'batch_size': 16}. Best is trial 0 with value: 0.8967930029154519.
[I 2024-10-05 14:22:01,126] Trial 1 finished with value: 0.8941690962099126 and parameters: {'learning_rate': 1.0601293430815955e-05, 'dropout': 0.194885404784585, 'batch_size': 64}. Best is trial 0 with value: 0.8967930029154519.
[I 2024-10-05 14:28:10,244] Trial 2 finished with value: 0.8988338192419825 and parameters: {'learning_rate': 1.7688518146371645e-05, 'dropout': 0.36475551072128276, 'batch_size': 64}. Best is trial 2 with value: 0.8988338192419825.
[I 2024-10-05 14:34:19,439] Trial 3 finished with value: 0.8813411078717202 and parameters: {'learning_rate': 1.3291155620702872e-05, 'dropout': 0.11857527070919213, 'batch_size': 64}. Bes

Best hyperparameters: {'learning_rate': 2.7305331653551963e-05, 'dropout': 0.31044018940585527, 'batch_size': 32}


# Finally saving below the deep learning classification model with best hyper-parameters taken from OPTUNA 

In [13]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import GradScaler, autocast
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the dataset appropriately
data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_xi, val_xi, train_xj, val_xj, train_labels, val_labels = train_test_split(
    data['Original Text'], data['Completion'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, xi_texts, xj_texts, labels, tokenizer, max_len):
        self.xi_texts = xi_texts
        self.xj_texts = xj_texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.xi_texts)

    def __getitem__(self, idx):
        xi_text = self.xi_texts[idx]
        xj_text = self.xj_texts[idx]
        label = self.labels[idx]

        xi_inputs = self.tokenizer.encode_plus(xi_text, None, add_special_tokens=True, max_length=self.max_len,
                                               padding='max_length', truncation=True, return_tensors='pt')
        xj_inputs = self.tokenizer.encode_plus(xj_text, None, add_special_tokens=True, max_length=self.max_len,
                                               padding='max_length', truncation=True, return_tensors='pt')

        xi_input_ids = xi_inputs['input_ids'].squeeze()
        xi_attention_mask = xi_inputs['attention_mask'].squeeze()
        xj_input_ids = xj_inputs['input_ids'].squeeze()
        xj_attention_mask = xj_inputs['attention_mask'].squeeze()

        return {
            'xi_input_ids': xi_input_ids,
            'xi_attention_mask': xi_attention_mask,
            'xj_input_ids': xj_input_ids,
            'xj_attention_mask': xj_attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
batch_size = 32

train_dataset = TextDataset(train_xi.values, train_xj.values, train_labels.values, tokenizer, max_len=80)
val_dataset = TextDataset(val_xi.values, val_xj.values, val_labels.values, tokenizer, max_len=80)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

class LLMClassifier(nn.Module):
    def __init__(self, n_classes, dropout):
        super(LLMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc = nn.Linear(self.bert.config.hidden_size * 2, n_classes)
        self.drop = nn.Dropout(p=dropout)

    def forward(self, xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask):
        xi_outputs = self.bert(input_ids=xi_input_ids, attention_mask=xi_attention_mask)
        xj_outputs = self.bert(input_ids=xj_input_ids, attention_mask=xj_attention_mask)

        combined_output = torch.cat((xi_outputs.pooler_output, xj_outputs.pooler_output), dim=1)
        output = self.drop(combined_output)
        return self.fc(output)

model = LLMClassifier(n_classes=len(label_encoder.classes_), dropout=0.31044108940585527)
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
model = model.to(device)

scaler = GradScaler()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2.7305331653551963e-05)

early_stopping_patience = 2  
early_stopping_counter = 0
best_val_loss = float('inf')

model_save_dir = 'saved_models'
os.makedirs(model_save_dir, exist_ok=True)

def train_epoch(model, data_loader, loss_fn, optimizer, device, scaler):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        xi_input_ids = data['xi_input_ids'].to(device)
        xi_attention_mask = data['xi_attention_mask'].to(device)
        xj_input_ids = data['xj_input_ids'].to(device)
        xj_attention_mask = data['xj_attention_mask'].to(device)
        labels = data['label'].to(device)

        with autocast():  
            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        scaler.scale(loss).backward() 
        scaler.step(optimizer)
        scaler.update()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            xi_input_ids = data['xi_input_ids'].to(device)
            xi_attention_mask = data['xi_attention_mask'].to(device)
            xj_input_ids = data['xj_input_ids'].to(device)
            xj_attention_mask = data['xj_attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

epochs = 10
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device, scaler)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), os.path.join(model_save_dir, 'best_llm_classifier_model_xi_xj.pth'))
        print("Model saved with improved validation loss.")
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping triggered. Stopping training after {epoch+1} epochs.")
        break

print(f"Best Validation Loss: {best_val_loss:.4f}")


Using device: cuda
Epoch 1/10
Training Accuracy: 0.8102, Training Loss: 0.5636
Validation Accuracy: 0.8883, Validation Loss: 0.3527
Model saved with improved validation loss.
Epoch 2/10
Training Accuracy: 0.9170, Training Loss: 0.2646
Validation Accuracy: 0.8980, Validation Loss: 0.3176
Model saved with improved validation loss.
Epoch 3/10
Training Accuracy: 0.9496, Training Loss: 0.1697
Validation Accuracy: 0.8959, Validation Loss: 0.3315
Epoch 4/10
Training Accuracy: 0.9672, Training Loss: 0.1159
Validation Accuracy: 0.8878, Validation Loss: 0.4386
Early stopping triggered. Stopping training after 4 epochs.
Best Validation Loss: 0.3176


# Trying with another model Roberta for classification the tasks

In [21]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

data = pd.read_csv('output_check_parallel_3500_final.csv')

data = data.dropna(subset=['Original Text', 'Completion'])

label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Model'])

train_xi, val_xi, train_xj, val_xj, train_labels, val_labels = train_test_split(
    data['Original Text'], data['Completion'], data['label'], test_size=0.2, random_state=42
)

class TextDataset(Dataset):
    def __init__(self, xi_texts, xj_texts, labels, tokenizer, max_len):
        self.xi_texts = xi_texts
        self.xj_texts = xj_texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.xi_texts)

    def __getitem__(self, idx):
        xi_text = self.xi_texts[idx]
        xj_text = self.xj_texts[idx]
        label = self.labels[idx]

        xi_inputs = self.tokenizer.encode_plus(
            xi_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        xj_inputs = self.tokenizer.encode_plus(
            xj_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        xi_input_ids = xi_inputs['input_ids'].squeeze()
        xi_attention_mask = xi_inputs['attention_mask'].squeeze()
        xj_input_ids = xj_inputs['input_ids'].squeeze()
        xj_attention_mask = xj_inputs['attention_mask'].squeeze()

        return {
            'xi_input_ids': xi_input_ids,
            'xi_attention_mask': xi_attention_mask,
            'xj_input_ids': xj_input_ids,
            'xj_attention_mask': xj_attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

train_dataset = TextDataset(train_xi.values, train_xj.values, train_labels.values, tokenizer, max_len=128)
val_dataset = TextDataset(val_xi.values, val_xj.values, val_labels.values, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

class RoBERTaClassifier(nn.Module):
    def __init__(self, n_classes):
        super(RoBERTaClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.fc = nn.Linear(self.roberta.config.hidden_size * 2, n_classes)  # Combining two outputs
        self.drop = nn.Dropout(p=0.3)

    def forward(self, xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask):
        xi_outputs = self.roberta(input_ids=xi_input_ids, attention_mask=xi_attention_mask)
        xj_outputs = self.roberta(input_ids=xj_input_ids, attention_mask=xj_attention_mask)

        # Concatenate the [CLS] embeddings from xi and xj
        combined_output = torch.cat((xi_outputs.pooler_output, xj_outputs.pooler_output), dim=1)

        output = self.drop(combined_output)
        return self.fc(output)

model = RoBERTaClassifier(n_classes=len(label_encoder.classes_))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

early_stopping_patience = 2  
early_stopping_counter = 0
best_val_loss = float('inf')

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for data in data_loader:
        xi_input_ids = data['xi_input_ids'].to(device)
        xi_attention_mask = data['xi_attention_mask'].to(device)
        xj_input_ids = data['xj_input_ids'].to(device)
        xj_attention_mask = data['xj_attention_mask'].to(device)
        labels = data['label'].to(device)

        outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
        loss = loss_fn(outputs, labels)

        total_loss += loss.item()
        correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for data in data_loader:
            xi_input_ids = data['xi_input_ids'].to(device)
            xi_attention_mask = data['xi_attention_mask'].to(device)
            xj_input_ids = data['xj_input_ids'].to(device)
            xj_attention_mask = data['xj_attention_mask'].to(device)
            labels = data['label'].to(device)

            outputs = model(xi_input_ids, xi_attention_mask, xj_input_ids, xj_attention_mask)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            correct_predictions += (torch.argmax(outputs, dim=1) == labels).sum().item()

    return correct_predictions / len(data_loader.dataset), total_loss / len(data_loader)

epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    train_acc, train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f'Training Accuracy: {train_acc:.4f}, Training Loss: {train_loss:.4f}')
    print(f'Validation Accuracy: {val_acc:.4f}, Validation Loss: {val_loss:.4f}')

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_roberta_classifier_model_xi_xj.pth')
        print("Model saved with improved validation loss.")
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_patience:
        print(f"Early stopping triggered. Stopping training after {epoch+1} epochs.")
        break

print(f"Best Validation Loss: {best_val_loss:.4f}")


Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Training Accuracy: 0.7989, Training Loss: 0.5671
Validation Accuracy: 0.8749, Validation Loss: 0.3676
Model saved with improved validation loss.
Epoch 2/3
Training Accuracy: 0.9060, Training Loss: 0.2933
Validation Accuracy: 0.8901, Validation Loss: 0.3236


OSError: [Errno 122] Disk quota exceeded: 'best_roberta_classifier_model_xi_xj.pth'

# We have saved model of roberta above when there is a complete run on epochs, and as we see both (Roberta and Bert) are giving almost the same accuracy we are moving ahead for testing with bert classifier as much experimentation was done on bert as shown till now.

# Preparing the unseen testing data of 750 for generating classification report

In [9]:
import pandas as pd
from transformers import pipeline
import torch
from concurrent.futures import ThreadPoolExecutor

device = 0 if torch.cuda.is_available() else -1

file_path = 'data.csv'  # Replace with actual path
data = pd.read_csv(file_path)

# Take all 3500 samples for full run
texts = data['part1'].tail(150).tolist()  # Adjust to full dataset later

# Batch size for processing
batch_size = 50

# Initialize the models for sentence completion (using Hugging Face pipelines)
bert_model = pipeline('fill-mask', model='bert-base-uncased', device=device)
distilgpt2_model = pipeline('text-generation', model='distilgpt2', device=device)
gpt_neo_model = pipeline('text-generation', model='EleutherAI/gpt-neo-1.3B', device=device)
opt_model = pipeline('text-generation', model='facebook/opt-1.3b', device=device)
flan_t5_model = pipeline('text2text-generation', model='google/flan-t5-large', device=device)

# Function to clean output (remove repeated input text and clean tokens)
def clean_output(text, output):
    # Remove exact repetitions of input text
    if output.startswith(text):
        output = output[len(text):].strip()

    # Split tokens, remove duplicates, and join again
    output_tokens = output.split()
    cleaned_output = " ".join(sorted(set(output_tokens), key=output_tokens.index))

    # Handle excessive repetitions (e.g., repeating words)
    words = output.split()
    output = ' '.join([words[i] for i in range(len(words)) if i == 0 or words[i] != words[i-1]])
    
    return cleaned_output

# Function to adjust the sampling and context for better output
def improve_output(raw_output):
    if len(set(raw_output.split())) < 5:  # Check for repetitive tokens
        return "The generated text was too repetitive. Trying again with more context."
    return raw_output

# Function to get completions from different models (batch processing)
def generate_completions_batch(text_batch):
    batch_completions = []

    # Process each text in batch
    for text in text_batch:
        # Get completions from all models and clean the output
        completions = [
            [text, clean_output(text, bert_model(text + " [MASK].")[0]['sequence']), 'BERT'],
            [text, clean_output(text, distilgpt2_model(text, max_length=60, temperature=0.6, top_p=0.85, do_sample=True)[0]['generated_text']), 'DistilGPT-2'],
            [text, clean_output(text, gpt_neo_model(text, max_length=60, temperature=0.7, top_k=50, top_p=0.9, do_sample=True)[0]['generated_text']), 'GPT-Neo'],
            [text, clean_output(text, opt_model(text, max_length=60, temperature=0.7, top_k=50, do_sample=True)[0]['generated_text']), 'OPT'],
            [text, clean_output(text, flan_t5_model(text, max_length=60, temperature=0.7, do_sample=True)[0]['generated_text']), 'Flan-T5']
        ]
        batch_completions.extend(completions)
    
    return batch_completions

# Function to process the full dataset in batches using parallel processing
def process_batches(texts, batch_size):
    completions = []

    # Define a thread pool for parallel batch processing
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = []

        # Create batches and submit them for parallel execution
        for i in range(0, len(texts), batch_size):
            text_batch = texts[i:i+batch_size]
            futures.append(executor.submit(generate_completions_batch, text_batch))

        # Collect the results from all the futures
        for future in futures:
            completions.extend(future.result())

    return completions

# Process the data in batches
completions = process_batches(texts, batch_size)

# Convert the completions into a DataFrame
completions_df = pd.DataFrame(completions, columns=['Original Text', 'Completion', 'Model'])

# Save the result to a CSV file
output_file_path = 'output_check_parallel_750_final.csv'  # Replace with actual path
completions_df.to_csv(output_file_path, index=False)

print("Sentence completions generated and saved to:", output_file_path)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Sentence completions generated and saved to: output_check_parallel_750_final.csv
