In [None]:
import json
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag, word_tokenize, sent_tokenize
import re
import concurrent.futures
import threading
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import Adam
import torch.nn.functional as F

pip install transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Download the NLTK resources needed by word_tokenize / sent_tokenize / pos_tag.
# Both the plain and the '_tab' / '_eng' resource names are requested —
# presumably to cover different NLTK releases; TODO confirm which are needed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger')

In [None]:
# The dataset is a JSON Lines file: one JSON object per line.
# The encoding is given explicitly so the read does not depend on the platform default.
with open('/hellaswag_train.jsonl', 'r', encoding='utf-8') as file:
    # Parse each line as JSON; whitespace-only lines are skipped instead of crashing json.loads
    training_data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Keep only the 'ctx_a' field of every training record
data_key = 'ctx_a'
required_data = [record[data_key] for record in training_data]

In [None]:
# Data preprocessing: split the sentence at the verb
def split_at_verb(sentence):
    """
    Cut a sentence right after its first verb.

    The sentence is word-tokenized and POS-tagged; the first token whose tag
    starts with 'VB' (VB, VBD, VBG, VBN, VBP, VBZ) is the cut point.

    Returns a 2-tuple of strings:
      * the tokens up to AND INCLUDING the verb, joined by single spaces
      * the remaining tokens after the verb, joined by single spaces
    If no token is tagged as a verb, returns the untouched input and "".
    """
    tokens = word_tokenize(sentence)

    # Index of the first verb-tagged token, or None when there is none
    verb_index = next(
        (idx for idx, (_, tag) in enumerate(pos_tag(tokens)) if tag.startswith('VB')),
        None,
    )

    if verb_index is None:
        # No verb: hand back the original (un-tokenized) sentence
        return sentence, ""

    cut = verb_index + 1
    return ' '.join(tokens[:cut]), ' '.join(tokens[cut:])

def preprocess_data(training_data):
    """
    Break every text into sentences and split each sentence with split_at_verb.

    Returns a flat list with one dict per sentence, holding the two parts
    under the keys 'before_verb' and 'from_verb'.
    """
    records = []
    for text in training_data:
        for head, tail in map(split_at_verb, sent_tokenize(text)):
            records.append({'before_verb': head, 'from_verb': tail})
    return records

# Apply the sentence/verb splitting to the extracted 'ctx_a' strings
processed_data = preprocess_data(required_data)

In [2]:
# Data preprocessing: Remove any full stops, duplicates and restrict sentence length.
def preprocess_sentences(sentences):
    """
    Normalise, length-filter and de-duplicate sentences.

    Each sentence is stripped, loses one trailing full stop, and has its
    whitespace collapsed to single spaces. Only sentences with 5 to 12 words
    (inclusive) are kept, and only the first occurrence of each cleaned
    sentence is returned. Input order is preserved.
    """
    # dict keys keep insertion order, so this doubles as an ordered "seen" set
    unique = {}

    for raw in sentences:
        # Trim, drop a single final '.', then split on any whitespace
        tokens = re.sub(r'\.$', '', raw.strip()).split()

        # Keep sentences of 5..12 words only
        if not (5 <= len(tokens) <= 12):
            continue

        unique.setdefault(' '.join(tokens), None)

    return list(unique)

# Gather the verb-terminated prefix of every processed sentence
before_verb_sentences = [entry['before_verb'] for entry in processed_data]

# Clean, length-filter and de-duplicate the prefixes
cleaned_sentences = preprocess_sentences(before_verb_sentences)

In [None]:
# Data preprocessing: remove sentences with any characters other than alphabets
def filter_sentences(sentences):
    """
    Return the sentences made up solely of ASCII letters and whitespace.

    Order is preserved; empty strings and any sentence containing digits,
    punctuation or other characters are dropped.
    """
    letters_and_spaces = re.compile(r'^[A-Za-z\s]+$')
    kept = []
    for candidate in sentences:
        if letters_and_spaces.match(candidate):
            kept.append(candidate)
    return kept

# Apply the filter
cleaned_sentences = filter_sentences(cleaned_sentences)
# Cap the prompt set at the first 3500 sentences; every model cell below reads this list
cleaned_sentences = cleaned_sentences[:3500]

In [None]:
# Final GPT-2 Model
# Load pre-trained GPT-2 model and tokenizer
# Module-level `model` / `tokenizer` are what generate_text() below reads;
# later model cells rebind the same two names.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a padding token, so we assign it manually
tokenizer.pad_token = tokenizer.eos_token

# Filled by generate_text(): one {prompt: generated_text} dict per call
completed_texts_gpt2 = []

# Define the text generation function
def generate_text(input_text):
    """
    Sample one GPT-2 continuation for `input_text` and record it.

    Uses the module-level `tokenizer` and `model`. The decoded text is
    appended to `completed_texts_gpt2` as a {input_text: generated_text}
    dict and also returned.
    """
    # Token ids for the prompt, as a PyTorch tensor
    prompt_ids = tokenizer.encode(input_text, return_tensors='pt', padding=True)

    sampling_options = dict(
        # Attend to every position that is not the pad token
        attention_mask=(prompt_ids != tokenizer.pad_token_id).long(),
        max_length=30,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,  # Lower temperature for more coherent output
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,  # Stop at the end-of-sequence token
    )
    sequences = model.generate(prompt_ids, **sampling_options)

    # Only one sequence is requested, so decode the first
    completion = tokenizer.decode(sequences[0], skip_special_tokens=True)
    completed_texts_gpt2.append({input_text: completion})
    return completion

# Thread worker function
def run_model_in_thread(input_sentence):
    """Thread target: run generate_text on one sentence; the return value is discarded
    (the result is collected through completed_texts_gpt2)."""
    generate_text(input_sentence)

# Sentences to complete: the cleaned, capped prompt list
input_sentences = cleaned_sentences

# Function to run up to 10 threads concurrently
def run_with_limited_threads(sentences, max_threads=10):
    """
    Call run_model_in_thread once per sentence, `max_threads` sentences at a time.

    The sentences are taken in consecutive slices of `max_threads`; one thread
    per sentence of the slice is created, all are started, and all are joined
    before the next slice begins. Returns None.
    """
    for start in range(0, len(sentences), max_threads):
        # One worker per sentence of the current slice
        workers = [
            threading.Thread(target=run_model_in_thread, args=(text,))
            for text in sentences[start:start + max_threads]
        ]

        for worker in workers:
            worker.start()

        # Block until the whole slice has finished
        for worker in workers:
            worker.join()

# Run the model using 10 threads at a time
run_with_limited_threads(input_sentences, max_threads=10)

print("All threads completed execution.")

# One line per completion: the str() of each {prompt: generated_text} dict.
# The file is written relative to the current working directory.
with open('gpt2-completed-texts.txt', 'w') as file:
    for sentence in completed_texts_gpt2:
        file.write(str(sentence) + '\n')

In [None]:
# Final BLOOM model
# Load the BLOOM model and tokenizer
# Rebinds the module-level `tokenizer` / `model` used by complete_sentence() below
model_name = "bigscience/bloom-560m" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Filled by complete_sentence(): one {prompt: completed_text} dict per call
completed_text_bloom = []

def complete_sentence(input_sentence, max_length=30):
    """
    Sample one BLOOM continuation for `input_sentence` and record it.

    Uses the module-level `tokenizer` and `model`; `max_length` is forwarded
    to `model.generate`. The decoded text is appended to
    `completed_text_bloom` as {input_sentence: completed_text} and returned.
    """
    prompt_ids = tokenizer.encode(input_sentence, return_tensors='pt')

    generated = model.generate(
        prompt_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
    )

    # A single sequence is requested; decode it without special tokens
    first_sequence = generated[0]
    completed_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

    completed_text_bloom.append({input_sentence: completed_text})
    return completed_text

# Thread worker function
def run_model_in_thread(input_sentence):
    """Thread target: run complete_sentence on one sentence; the return value is discarded
    (the result is collected through completed_text_bloom)."""
    complete_sentence(input_sentence)

# Sentences to complete: the cleaned, capped prompt list
input_sentences = cleaned_sentences

# Function to run up to 10 threads concurrently
def run_with_limited_threads(sentences, max_threads=10):
    """
    Process `sentences` with run_model_in_thread in groups of `max_threads`.

    Each group gets one thread per sentence; the group's threads are all
    started and then all joined before the next group is touched.
    Returns None.
    """
    def _spawn(sentence):
        # Thread that hands a single sentence to the worker function
        return threading.Thread(target=run_model_in_thread, args=(sentence,))

    group_starts = range(0, len(sentences), max_threads)
    for begin in group_starts:
        group = list(map(_spawn, sentences[begin:begin + max_threads]))

        for running in group:
            running.start()

        # Wait for the whole group before moving on
        for running in group:
            running.join()

# Run the model using 10 threads at a time
run_with_limited_threads(input_sentences, max_threads=10)

print("All threads completed execution.")

# One line per completion: the str() of each {prompt: completed_text} dict.
# The file is written relative to the current working directory.
with open('bloom-completed-texts.txt', 'w') as file:
    for sentence in completed_text_bloom:
        file.write(str(sentence) + '\n')

In [None]:
# Final T5 Model
# Load FLAN-T5 large model and tokenizer
# Rebinds the module-level `tokenizer` / `model` used by complete_sentence_flan() below
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Run on the GPU when one is available; inputs are moved to the same device at call time
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Filled by complete_sentence_flan(): one {prompt: prompt + " " + output} dict per call
completed_texts_flan = []

def complete_sentence_flan(prompt, max_length=30, num_return_sequences=1):
    """
    Sample FLAN-T5 output(s) for `prompt` and record the first one.

    Uses the module-level `tokenizer`, `model` and `device`. Returns the list
    of decoded sequences. As a side effect, appends
    {prompt: prompt + " " + first_decoded_sequence} to `completed_texts_flan`.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)

    generation_args = {
        'attention_mask': encoded.attention_mask.to(device),  # passed explicitly
        'max_length': max_length,
        'num_return_sequences': num_return_sequences,
        'do_sample': True,      # sampling for diverse outputs
        'top_k': 50,            # top-k sampling
        'top_p': 0.95,          # nucleus sampling
        'temperature': 0.7,     # control creativity
        'pad_token_id': tokenizer.eos_token_id,  # pad with the EOS id
    }
    sequences = model.generate(encoded.input_ids.to(device), **generation_args)

    # Turn every generated sequence into text
    response = []
    for sequence in sequences:
        response.append(tokenizer.decode(sequence, skip_special_tokens=True))

    # Stored value is the prompt followed by the first generated text
    record = {prompt: prompt + " " + str(response[0])}
    completed_texts_flan.append(record)
    return response

# Thread worker function
def run_model_in_thread(input_sentence):
    """Thread target: run complete_sentence_flan on one sentence; the return value is
    discarded (the result is collected through completed_texts_flan)."""
    complete_sentence_flan(input_sentence, max_length=30, num_return_sequences=1)

# Sentences to complete: the cleaned, capped prompt list
input_sentences = cleaned_sentences


# Function to run up to 10 threads concurrently
def run_with_limited_threads(sentences, max_threads=10):
    """
    Feed `sentences` to run_model_in_thread, at most `max_threads` concurrently.

    Consecutive slices of `max_threads` sentences are handled one after the
    other: a thread is created per sentence, the slice's threads are started
    together and joined together. Returns None.
    """
    # Lazily produced consecutive slices of the input
    slices = (
        sentences[offset:offset + max_threads]
        for offset in range(0, len(sentences), max_threads)
    )

    for current in slices:
        pending = []
        for item in current:
            pending.append(threading.Thread(target=run_model_in_thread, args=(item,)))

        for job in pending:
            job.start()

        # The next slice starts only after every thread of this one is done
        for job in pending:
            job.join()

# Run the model using 10 threads at a time
run_with_limited_threads(input_sentences, max_threads=10)

# One line per completion: the str() of each single-item dict in completed_texts_flan.
# The file is written relative to the current working directory.
with open('flan-completed-texts.txt', 'w') as file:
    for sentence in completed_texts_flan:
        file.write(str(sentence) + '\n')

In [None]:
# Final Neo GPT Model
# Rebinds the module-level `tokenizer` / `model` used by complete_sentence_gpt_neo() below
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the padding token to be the same as the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Run on the GPU when one is available; inputs are moved to the same device at call time
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Filled by complete_sentence_gpt_neo(): one {prompt: first_output} dict per call
completed_texts_gptneo = []

def complete_sentence_gpt_neo(prompt, max_length=50, num_return_sequences=1):
    """
    Sample GPT-Neo continuation(s) for `prompt` and record the first one.

    Uses the module-level `tokenizer`, `model` and `device`. Returns the list
    of decoded sequences; {prompt: first_decoded_sequence} is appended to
    `completed_texts_gptneo`.
    """
    batch = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    prompt_ids = batch.input_ids.to(device)
    prompt_mask = batch.attention_mask.to(device)

    sampled = model.generate(
        prompt_ids,
        attention_mask=prompt_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,  # the pad token assigned at module level
    )

    # Decode every returned sequence, dropping special tokens
    response = list(
        map(lambda seq: tokenizer.decode(seq, skip_special_tokens=True), sampled)
    )
    completed_texts_gptneo.append({prompt: response[0]})
    return response


# Thread worker function
def run_model_in_thread(input_sentence):
    """Thread target: run complete_sentence_gpt_neo on one sentence (max_length=30 here,
    overriding the function default of 50); the return value is discarded."""
    complete_sentence_gpt_neo(input_sentence, max_length=30, num_return_sequences=1)

# Sentences to complete: the cleaned, capped prompt list
input_sentences = cleaned_sentences

# Function to run up to 10 threads concurrently
def run_with_limited_threads(sentences, max_threads=10):
    """
    Run run_model_in_thread over `sentences` in chunks of `max_threads`.

    For each chunk: build one thread per sentence, start them all, then join
    them all. Chunks are processed strictly one after another. Returns None.
    """
    chunk_offsets = list(range(0, len(sentences), max_threads))

    for offset in chunk_offsets:
        chunk = sentences[offset:offset + max_threads]
        chunk_threads = [
            threading.Thread(target=run_model_in_thread, args=(entry,))
            for entry in chunk
        ]

        for thread in chunk_threads:
            thread.start()

        # Barrier: every thread of the chunk must finish first
        for thread in chunk_threads:
            thread.join()

# Run the model using 10 threads at a time
run_with_limited_threads(input_sentences, max_threads=10)

# One line per completion: the str() of each single-item dict in completed_texts_gptneo.
# The file is written relative to the current working directory.
with open('gptneo-completed-texts.txt', 'w') as file:
    for sentence in completed_texts_gptneo:
        file.write(str(sentence) + '\n')

In [None]:
# DialoGPT Model
# Rebinds the module-level `tokenizer` / `model` used by complete_sentence_dialogpt() below
model_name = "microsoft/DialoGPT-medium"  
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set padding token to EOS token
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure you use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Filled by complete_sentence_dialogpt(): one {prompt: first_output} dict per call
completed_texts_dialogpt = []

def complete_sentence_dialogpt(prompt, max_length=50, num_return_sequences=1):
    """
    Sample DialoGPT continuation(s) for `prompt` and record the first one.

    Uses the module-level `tokenizer`, `model` and `device`. Returns the list
    of decoded sequences; {prompt: first_decoded_sequence} is appended to
    `completed_texts_dialogpt`.
    """
    tokens = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    ids_on_device = tokens.input_ids.to(device)
    mask_on_device = tokens.attention_mask.to(device)

    generation_config = {
        'attention_mask': mask_on_device,
        'max_length': max_length,
        'num_return_sequences': num_return_sequences,
        'do_sample': True,    # sampling for diverse outputs
        'top_k': 50,          # top-k sampling
        'top_p': 0.95,        # nucleus sampling
        'temperature': 0.7,   # control randomness
        'pad_token_id': tokenizer.pad_token_id,  # pad token was set to EOS at module level
    }
    candidates = model.generate(ids_on_device, **generation_config)

    # Decode each candidate into plain text
    response = []
    for candidate in candidates:
        response.append(tokenizer.decode(candidate, skip_special_tokens=True))

    completed_texts_dialogpt.append({prompt: response[0]})
    return response


# Thread worker function
def run_model_in_thread(input_sentence):
    """Thread target: run complete_sentence_dialogpt on one sentence (max_length=30 here,
    overriding the function default of 50); the return value is discarded."""
    complete_sentence_dialogpt(input_sentence, max_length=30, num_return_sequences=1)

# Sentences to complete: the cleaned, capped prompt list
input_sentences = cleaned_sentences


# Function to run up to 10 threads concurrently
def run_with_limited_threads(sentences, max_threads=10):
    """
    Dispatch each sentence to run_model_in_thread, `max_threads` at a time.

    The input is walked in windows of `max_threads` sentences. Within a
    window all threads are created first, then started, then joined; only
    then does the next window begin. Returns None.
    """
    n_sentences = len(sentences)

    for window_start in range(0, n_sentences, max_threads):
        window_end = window_start + max_threads

        # Create the window's threads before starting any of them
        launched = []
        for text in sentences[window_start:window_end]:
            launched.append(threading.Thread(target=run_model_in_thread, args=(text,)))

        for handle in launched:
            handle.start()

        for handle in launched:
            handle.join()

# Run the model using 10 threads at a time
run_with_limited_threads(input_sentences, max_threads=10)

# One line per completion: the str() of each single-item dict in completed_texts_dialogpt.
# The file is written relative to the current working directory.
with open('dialogpt-completed-texts.txt', 'w') as file:
    for sentence in completed_texts_dialogpt:
        file.write(str(sentence) + '\n')

In [None]:
# Storing the data from all the models in a dataframe
# Paths to all data files
# NOTE(review): these paths are absolute ('/...') while the generation cells write
# the same file names relative to the working directory, and no cell in this
# notebook writes 'opt-completed-texts.txt' — confirm the files are placed here
# before running this cell.
file_paths = ['/gpt2-completed-texts.txt', '/opt-completed-texts.txt', '/bloom-completed-texts.txt', '/flan-completed-texts.txt', '/gptneo-completed-texts.txt', '/dialogpt-completed-texts.txt']

# File-name marker -> label stored in the 'model' column.
# Checked in this order; the first marker contained in the path wins.
model_markers = [
    ('gpt2', 'GPT-2'),
    ('opt', 'OPT'),
    ('bloom', 'BLOOM'),
    ('flan', 'Flan-T5'),
    ('gptneo', 'GPT-Neo'),
    ('dialogpt', 'DialoGPT'),
]


def _strip_dict_punctuation(text):
    """Remove the braces and quote characters left over from the str(dict) line format."""
    for char in ('{', '}', '"', "'"):
        text = text.replace(char, '')
    return text


# Store all the data
data_list = []

# Open each file and read it line by line
for file_path in file_paths:
    # The label depends only on the file, so it is resolved once per file
    # (and never silently carried over from a previous file).
    model_name = next((label for marker, label in model_markers if marker in file_path), None)
    if model_name is None:
        raise ValueError(f'Cannot infer the model from file path: {file_path}')

    with open(file_path, 'r') as file:
        for line in file:
            if ':' not in line:
                # Blank or malformed line: there is no prompt/completion pair to split
                continue

            # Each line looks like {'prompt': 'completion'}; split at the first colon
            key, value = line.split(':', 1)

            key = _strip_dict_punctuation(key).strip()
            # Remove the echoed prompt, then the surrounding whitespace and the
            # trailing newline, so a completion that adds nothing is really empty.
            value = _strip_dict_punctuation(value).replace(key, '').strip()

            if value:
                # One row per non-empty completion
                data_list.append([key, value, model_name])

# Convert the list of rows into a DataFrame
df = pd.DataFrame(data_list, columns=['original text', 'completed text', 'model'])
columns_to_export = ['original text', 'completed text', 'model']

# Export the DataFrame to a CSV file, including only the specified columns
df[columns_to_export].to_csv('final_completed_text.csv', index=False)



In [10]:
# Train and testing using Roberta classifier
#INCREASED THE EPOCHS AND ADD EARLY STOPPING:

# 1. Load the dataset and check the column names
# NOTE(review): read from '/final_completed_text.csv', whereas the export cell writes
# 'final_completed_text.csv' relative to the working directory — confirm the location.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """
    Dataset of (original text, completed text) pairs with an integer label.

    Reads the 'original text', 'completed text' and 'label' columns of `df`.
    Each item is the pair encoded by `tokenizer.encode_plus`, padded or
    truncated to `max_len`, returned as a dict of 1-D tensors plus the label.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Column values are kept as arrays for positional indexing
        self.sentences_xi = df['original text'].values
        self.sentences_xj = df['completed text'].values
        self.labels = df['label'].values

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        # str() guards against non-string cells
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])
        target = self.labels[idx]

        # Encode the two texts as one sentence pair
        encoded = self.tokenizer.encode_plus(
            first_text,
            second_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Flatten each encoded tensor to 1-D
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# 4. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# One output class per distinct value of the 'model' column, taken from the
# LabelEncoder fitted earlier in this cell instead of a hard-coded count.
# RobertaForSequenceClassification comes from the notebook's top-level transformers import.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

# 5. Split into training and testing sets (80/20, fixed seed for a repeatable split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

# Only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 6. Optimizer and Loss
# No separate loss object here: train_model/eval_model read the loss from the model outputs
optimizer = AdamW(model.parameters(), lr=2e-5)

# 7. Training Loop

# `device` is read as a global by train_model/eval_model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Early Stopping Class
class EarlyStopping:
    """
    Track validation loss and flag when it has stopped improving.

    A call counts as an improvement when `val_loss` is below the best loss
    seen so far minus `delta`. Each non-improving call increments `counter`;
    once `counter` reaches `patience`, `early_stop` becomes True. An
    improvement resets `counter` to 0 and updates `best_loss`.
    """

    def __init__(self, patience=3, delta=0):
        self.patience, self.delta = patience, delta
        self.counter = 0
        self.best_loss = np.inf  # any finite first loss counts as an improvement
        self.early_stop = False

    def __call__(self, val_loss):
        improved = val_loss < self.best_loss - self.delta
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            return

        # No sufficient improvement this time
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Initialize EarlyStopping: stop after 3 consecutive epochs whose validation loss
# does not beat the best one by more than 0.001
early_stopping = EarlyStopping(patience=3, delta=0.001)

def train_model(model, data_loader, optimizer):
    """
    Run one training pass over `data_loader`.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'; they
    are moved to the global `device`. The model is called with the labels and
    its output's `.loss` is back-propagated; `.logits` give the predictions.

    Returns (mean batch loss, accuracy) as Python floats.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_examples = 0

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        result = model(input_ids=ids, attention_mask=mask, labels=targets)
        batch_loss = result.loss

        # Predicted class = index of the largest logit
        predicted = result.logits.max(dim=1).indices
        n_correct = n_correct + (predicted == targets).sum()
        n_examples += targets.size(0)

        running_loss += batch_loss.item()

        # Parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(data_loader)
    accuracy = n_correct.double() / n_examples
    return mean_loss, accuracy.item()

def eval_model(model, data_loader):
    """
    Evaluate `model` on `data_loader` without gradient tracking.

    Batches are read exactly as in training ('input_ids', 'attention_mask',
    'labels', moved to the global `device`); the loss is taken from the model
    output's `.loss`, and predictions from `.logits`.

    Returns (mean batch loss, accuracy) as Python floats.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            out = model(input_ids=ids, attention_mask=mask, labels=gold)

            # Index of the largest logit per row is the predicted class
            guessed = out.logits.max(dim=1).indices
            hits = hits + (guessed == gold).sum()
            seen += gold.size(0)

            loss_sum += out.loss.item()

    mean_loss = loss_sum / len(data_loader)
    return mean_loss, (hits.double() / seen).item()

# 8. Training and Evaluation
epochs = 10  # Increased epochs (upper bound; early stopping may end the loop sooner)

for epoch in range(epochs):
    # Training phase
    train_loss, train_accuracy = train_model(model, train_loader, optimizer)

    # Validation phase
    # NOTE: the held-out test split doubles as the validation set here, so the
    # reported validation numbers also drive the early-stopping decision.
    val_loss, val_accuracy = eval_model(model, test_loader)

    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    # Early stopping check (updates the tracker's counter / best loss)
    early_stopping(val_loss)

    # If early stopping is triggered, stop training
    if early_stopping.early_stop:
        print("Early stopping triggered. Stopping training.")
        break


In [None]:
#Implementing dropouts

# 1. Load the dataset and check the column names
# NOTE(review): read from '/final_completed_text.csv', whereas the export cell writes
# 'final_completed_text.csv' relative to the working directory — confirm the location.
df = pd.read_csv('/final_completed_text.csv') 
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """
    Sentence-pair dataset built from a DataFrame.

    Expects the columns 'original text', 'completed text' and 'label'.
    Indexing returns the tokenizer encoding of the pair (fixed length
    `max_len`) as 1-D tensors, together with the label as a long tensor.
    """

    def __init__(self, df, tokenizer, max_len):
        self.sentences_xi = df['original text'].values
        self.sentences_xj = df['completed text'].values
        self.labels = df['label'].values
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        # Cast both texts to str before tokenizing
        xi, xj = str(self.sentences_xi[idx]), str(self.sentences_xj[idx])
        label = self.labels[idx]

        # Options forwarded to the tokenizer for the sentence pair
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        inputs = self.tokenizer.encode_plus(xi, xj, **encode_options)

        input_ids = inputs['input_ids'].flatten()
        attention_mask = inputs['attention_mask'].flatten()
        token_type_ids = inputs['token_type_ids'].flatten()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': torch.tensor(label, dtype=torch.long),
        }

# 4. Custom Model with Dropout Layers
class CustomRobertaClassifier(nn.Module):
    """
    RoBERTa encoder followed by dropout and a linear classification head.

    `forward` returns raw logits of shape (batch, num_labels) computed from
    the second element of the encoder output. `token_type_ids` is accepted
    for interface compatibility but is not passed to the encoder.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.3)  # 30% dropout before the head
        hidden_size = self.roberta.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is the pooled representation
        pooled = self.dropout(encoder_out[1])
        return self.classifier(pooled)

# Initialize the custom model with dropout layers
# One output unit per distinct value of the 'model' column, taken from the
# LabelEncoder fitted earlier in this cell instead of a hard-coded count.
num_labels = len(label_encoder.classes_)
model = CustomRobertaClassifier(num_labels)

# 5. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 6. Split into training and testing sets (80/20, fixed seed for a repeatable split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

# Only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 7. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
# The custom model returns raw logits, so the loss is computed outside the model
loss_fn = nn.CrossEntropyLoss()

# 8. Training Loop

# `device` is read as a global by train_model/eval_model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn):
    """
    Run one training pass over `data_loader` for a model that returns logits.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'; they
    are moved to the global `device`. The loss is `loss_fn(logits, labels)`.

    Returns (mean batch loss, accuracy) as Python floats.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_examples = 0

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        scores = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(scores, targets)

        # Predicted class = index of the largest logit
        predicted = scores.max(dim=1).indices
        n_correct = n_correct + (predicted == targets).sum()
        n_examples += targets.size(0)

        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(data_loader)
    accuracy = n_correct.double() / n_examples
    return mean_loss, accuracy.item()

def eval_model(model, data_loader, loss_fn):
    """
    Evaluate a logits-returning `model` on `data_loader` without gradients.

    Batches provide 'input_ids', 'attention_mask' and 'labels' (moved to the
    global `device`); the loss is `loss_fn(logits, labels)`.

    Returns (mean batch loss, accuracy) as Python floats.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold = batch['labels'].to(device)

            scores = model(input_ids=ids, attention_mask=mask)
            batch_loss = loss_fn(scores, gold)

            # Index of the largest logit per row is the predicted class
            guessed = scores.max(dim=1).indices
            hits = hits + (guessed == gold).sum()
            seen += gold.size(0)

            loss_sum += batch_loss.item()

    mean_loss = loss_sum / len(data_loader)
    return mean_loss, (hits.double() / seen).item()

# 9. Training and Evaluation
# Fixed number of epochs; this cell does not use early stopping
epochs = 5
for epoch in range(epochs):
    # Training phase
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, loss_fn)

    # Validation phase (evaluated on the held-out test split)
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
# Changing learning rate
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Load the dataset and check the column names
# NOTE(review): read from '/final_completed_text.csv', whereas the export cell writes
# 'final_completed_text.csv' relative to the working directory — confirm the location.
df = pd.read_csv('/final_completed_text.csv') 
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6)  # Adjust 'num_labels' to match the number of unique LLMs

# 5. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 6. Optimizer, Learning Rate, and Scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)  # Changed learning rate to 1e-5

# Total steps for learning rate scheduler
# NOTE: the literal 5 duplicates `epochs` defined in the training loop further
# down; the two must be kept in sync by hand.
total_steps = len(train_loader) * 5  # 5 is the number of epochs

# Scheduler to gradually decrease the learning rate
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # No warmup
    num_training_steps=total_steps
)

# Loss function
# Passed to train_model/eval_model below, which read the loss from the model
# output instead of calling it.
loss_fn = torch.nn.CrossEntropyLoss()

# 7. Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, scheduler, loss_fn):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Accumulate plain Python numbers so the final division never depends
        # on a tensor having been created inside the loop.
        predictions = outputs.logits.argmax(dim=1)
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()
        optimizer.step()
        scheduler.step()  # Adjust the learning rate after each batch

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            predictions = outputs.logits.argmax(dim=1)
            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += outputs.loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# 8. Training and Evaluation
# NOTE: `epochs` must match the literal 5 used to compute total_steps for the
# scheduler above, otherwise the linear decay will not span the whole run.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader, stepping the scheduler per batch
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, scheduler, loss_fn)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
# Changing batch size

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping; its size must match num_labels below
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6)  # Adjust 'num_labels' to match the number of unique LLMs

# 5. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

# 6. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
# Passed to train_model/eval_model below, which read the loss from the model
# output instead of calling it.
loss_fn = torch.nn.CrossEntropyLoss()

# 7. Variable Batch Sizes
# This is the knob this cell varies: different batch sizes for the two loaders.
train_batch_size = 8   # Smaller batch size for training for more gradient updates
valid_batch_size = 32  # Larger batch size for validation to speed up the process

train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=valid_batch_size, shuffle=False)

# Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Calculate accuracy with plain Python counters so the final division
        # never depends on a tensor having been created inside the loop.
        predictions = outputs.logits.argmax(dim=1)
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()
        optimizer.step()

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            predictions = outputs.logits.argmax(dim=1)
            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += outputs.loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# Training and Evaluation Loop
# Run a fixed number of epochs; each epoch does one training pass followed by
# one evaluation pass and prints the four metrics on a single line.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader (batch size train_batch_size)
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, loss_fn)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
# Implemented gradient clipping

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping; its size must match num_labels below
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6)  # Adjust 'num_labels' to match the number of unique LLMs

# 5. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 6. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
# Passed to train_model/eval_model below, which read the loss from the model
# output instead of calling it.
loss_fn = torch.nn.CrossEntropyLoss()

# 7. Gradient clipping threshold
# This is the knob this cell adds; train_model passes it to clip_grad_norm_.
max_grad_norm = 1.0  # Max norm for gradient clipping

# 8. Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn, max_grad_norm):
    """Run one training epoch with gradient-norm clipping.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.
        max_grad_norm: Maximum total gradient norm, applied to all model
            parameters before every optimizer step.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Accumulate plain Python numbers so the final division never depends
        # on a tensor having been created inside the loop.
        predictions = outputs.logits.argmax(dim=1)
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()

        # Apply gradient clipping between backward() and step()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Unused (the loss is read from ``outputs.loss``); kept so
            existing call sites keep working.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            predictions = outputs.logits.argmax(dim=1)
            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += outputs.loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# 9. Training and Evaluation
# Run a fixed number of epochs; each epoch does one training pass followed by
# one evaluation pass and prints the four metrics on a single line.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader with gradients clipped to max_grad_norm
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, loss_fn, max_grad_norm)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
#softmax activation function

#BASE MODEL

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping; its size must match num_labels below
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=6)  # Adjust 'num_labels' to match the number of unique LLMs

# 5. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 6. Optimizer and Loss
# No separate loss object is created in this cell: train_model/eval_model read
# the loss from the model output.
optimizer = AdamW(model.parameters(), lr=2e-5)

# 7. Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Predictions are taken as the arg-max of the softmax probabilities.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose most probable class equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        # Apply softmax to get probabilities; the loss itself still comes
        # from the model output, not from these probabilities.
        probs = F.softmax(outputs.logits, dim=1)
        predictions = probs.argmax(dim=1)

        # Accumulate plain Python numbers so the final division never depends
        # on a tensor having been created inside the loop.
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()
        optimizer.step()

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Predictions are taken as the arg-max of the softmax probabilities.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose most probable class equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Apply softmax to get probabilities, then pick the most probable class
            probs = F.softmax(outputs.logits, dim=1)
            predictions = probs.argmax(dim=1)

            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += outputs.loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# 8. Training and Evaluation
# Run a fixed number of epochs; each epoch does one training pass followed by
# one evaluation pass and prints the four metrics on a single line.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader
    train_loss, train_accuracy = train_model(model, train_loader, optimizer)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
#relu

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping; its size must match num_labels below
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Custom Model with ReLU Activation
class CustomRobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout, ReLU and a linear classification head."""

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Dropout (p=0.3) and ReLU are applied, in that order, to the pooled
        # encoder output before the linear head.
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return raw class scores; ``token_type_ids`` is accepted but not used."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is taken as the pooled representation
        features = self.relu(self.dropout(encoder_out[1]))
        return self.classifier(features)

# Initialize the custom model with ReLU
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
num_labels = 6  # Adjust 'num_labels' to match the number of unique labels in your dataset
model = CustomRobertaClassifier(num_labels)

# 5. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 6. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 7. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
# The custom model returns logits only, so this loss is applied explicitly in
# train_model/eval_model below.
loss_fn = torch.nn.CrossEntropyLoss()

# 8. Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns a logits tensor.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss
            tensor.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, labels)

        # Accumulate plain Python numbers so the final division never depends
        # on a tensor having been created inside the loop.
        predictions = logits.argmax(dim=1)
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()
        optimizer.step()

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns a logits tensor.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss
            tensor.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)

            predictions = logits.argmax(dim=1)
            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# 9. Training and Evaluation
# Run a fixed number of epochs; each epoch does one training pass followed by
# one evaluation pass and prints the four metrics on a single line.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, loss_fn)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
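# Sigmoid activation function experiment. NOTE: no sigmoid is applied below; the model returns raw logits, which CrossEntropyLoss consumes directly.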

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping; its size must match num_labels below
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from a DataFrame.

    Reads the ``'original text'``, ``'completed text'`` and ``'label'`` columns
    and tokenizes each (original, completed) pair with a fixed ``max_len``.
    """

    def __init__(self, df, tokenizer, max_len):
        # Pull the three columns out as arrays so item lookup is positional
        # and does not depend on the DataFrame index.
        self.sentences_xi, self.sentences_xj, self.labels = (
            df[column].values for column in ('original text', 'completed text', 'label')
        )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences_xi)

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as flattened tensors plus its label."""
        first_text = str(self.sentences_xi[idx])
        second_text = str(self.sentences_xj[idx])

        # Pad/truncate the encoded pair to max_len and request PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(first_text, second_text, **tokenizer_options)

        # flatten() collapses each tokenizer output to a 1-D tensor
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        # Labels are stored as long integers for classification
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 4. Custom Model (No Sigmoid/Softmax needed)
class CustomRobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout and a linear classification head."""

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Dropout (p=0.3) is the only layer between the pooled output and the head
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return raw class scores; ``token_type_ids`` is accepted but not used."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is taken as the pooled representation
        features = self.dropout(encoder_out[1])
        # Logits are returned as-is: no softmax/sigmoid is applied here
        return self.classifier(features)

# Initialize the custom model
# num_labels is hard-coded; it has to equal the number of classes shown in the
# label mapping printed above.
num_labels = 6  # Adjust 'num_labels' to match the number of unique labels in your dataset
model = CustomRobertaClassifier(num_labels)

# 5. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 6. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 7. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
# The custom model returns logits only, so this loss is applied explicitly in
# train_model/eval_model below.
loss_fn = torch.nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class classification

# 8. Training Loop

# Prefer the GPU when one is available; the model is moved once, here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn):
    """Run one training epoch and return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns a logits tensor.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss
            tensor.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level ``device`` global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, labels)

        # Index of the largest logit per sample (logits are unnormalised
        # scores, not log-probabilities); counters stay plain Python numbers.
        predictions = logits.argmax(dim=1)
        correct_count += (predictions == labels).sum().item()
        sample_count += labels.size(0)
        loss_sum += loss.item()
        batch_count += 1

        loss.backward()
        optimizer.step()

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

def eval_model(model, data_loader, loss_fn):
    """Evaluate ``model`` without gradient tracking; return ``(mean batch loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns a logits tensor.
        data_loader: Iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss
            tensor.

    Returns:
        Two Python floats: the loss averaged over batches and the fraction of
        samples whose arg-max logit equals the label.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level ``device``
    # global that can be stale after re-running cells.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    batch_count = 0
    correct_count = 0
    sample_count = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)

            predictions = logits.argmax(dim=1)
            correct_count += (predictions == labels).sum().item()
            sample_count += labels.size(0)
            loss_sum += loss.item()
            batch_count += 1

    if sample_count == 0:
        raise ValueError('data_loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / batch_count, correct_count / sample_count

# 9. Training and Evaluation
# Run a fixed number of epochs; each epoch does one training pass followed by
# one evaluation pass and prints the four metrics on a single line.
epochs = 5
for epoch in range(epochs):
    # Training phase: one pass over train_loader
    train_loss, train_accuracy = train_model(model, train_loader, optimizer, loss_fn)

    # Validation phase: the "Validation" numbers are computed on test_loader,
    # i.e. the 20% split held out above
    val_loss, val_accuracy = eval_model(model, test_loader, loss_fn)

    # epoch is zero-based, hence the +1 in the log line
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


In [None]:
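# Leaky ReLU experiment. NOTE(review): the classifier defined below instantiates nn.ReLU(), not nn.LeakyReLU() — confirm which activation was intended.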

# 1. Load the dataset and check the column names
# NOTE(review): the path is absolute (filesystem root) — presumably a file
# uploaded to a hosted runtime; confirm before running elsewhere.
# The dataset class below reads 'original text' and 'completed text', and the
# encoder reads 'model', so the CSV must provide those three columns.
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)  # Print the columns to ensure correct column names

# 2. Apply LabelEncoder to the 'model' column to generate numerical labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])  # Create a new 'label' column
# Show the class-name -> integer-id mapping
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))  # Check the label mapping
print(df.head())  # Print the first few rows to ensure 'label' column exists

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from the completions DataFrame.

    Each item tokenizes the row's ``'original text'`` together with its
    ``'completed text'`` as one sequence pair and attaches the integer found
    in the ``'label'`` column.

    Args:
        df: DataFrame providing the ``'original text'``, ``'completed text'``
            and ``'label'`` columns.
        tokenizer: Callable tokenizer (the notebook passes a RobertaTokenizer)
            that accepts ``(text, text_pair, **options)`` and returns a mapping
            with ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Length every encoded pair is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.sentences_xi = df['original text'].values
        self.sentences_xj = df['completed text'].values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.sentences_xi)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing cells to ``''``.

        A bare ``str()`` would turn NaN/None into the literal words
        ``'nan'``/``'None'``, which would then be tokenized as if they were
        real text.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def __getitem__(self, idx):
        """Return the encoded pair and label for row ``idx``.

        Returns:
            dict with 1-D ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors plus a scalar ``torch.long``
            ``'labels'`` tensor.
        """
        xi = self._as_text(self.sentences_xi[idx])
        xj = self._as_text(self.sentences_xj[idx])
        label = self.labels[idx]

        # Encode the sentence pair by calling the tokenizer directly; this is
        # the supported replacement for the deprecated `encode_plus`.
        inputs = self.tokenizer(
            xi,
            xj,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() drops it so the DataLoader can stack the items.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 4. Custom Model with Leaky ReLU Activation
class CustomRobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout, Leaky ReLU and a linear head.

    This cell is the "Leaky relu" experiment, so the activation applied to
    the pooled encoder output is ``nn.LeakyReLU`` (default negative slope)
    rather than plain ``nn.ReLU``.

    Args:
        num_labels: Number of output classes, i.e. the width of the logits.
    """

    def __init__(self, num_labels):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.3)  # Dropout layer to prevent overfitting
        self.leaky_relu = nn.LeakyReLU()  # Leaky ReLU keeps a small slope for negative inputs
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Accepted for interface compatibility; it is not
                passed to the encoder.

        Returns:
            Tensor of logits with ``num_labels`` entries per example.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # Second element of the encoder output (the pooled output)
        pooled_output = self.dropout(pooled_output)  # Apply dropout
        pooled_output = self.leaky_relu(pooled_output)  # Apply Leaky ReLU activation
        logits = self.classifier(pooled_output)  # Final classification layer
        return logits

# Initialize the custom model
# Size the classification head from the fitted LabelEncoder instead of a
# hard-coded count, so the logits always cover every label id that
# CrossEntropyLoss can receive.
num_labels = len(label_encoder.classes_)
model = CustomRobertaClassifier(num_labels)

# 5. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 6. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

# Only the training split is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# 7. Optimizer and Loss
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# 8. Training Loop

# Use the GPU when one is available; train_model/eval_model read this global.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, loss_fn):
    """Run one training pass over ``data_loader`` and update ``model``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer whose gradients are reset before, and stepped
            after, every batch.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar
            tensor that supports ``backward()``.

    Returns:
        ``(avg_loss, accuracy)``: the sum of the per-batch losses divided by
        the number of batches, and the fraction of examples whose
        highest-scoring class equals the label (a Python float). Both are
        measured with the weights in effect before each batch's update.
    """
    model.train()
    running_loss = 0
    hits = 0
    seen = 0

    for batch in data_loader:
        # Tensors are moved to the notebook-level `device`.
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        scores = model(input_ids=ids, attention_mask=mask)
        batch_loss = loss_fn(scores, targets)

        # Index of the largest score in each row is the predicted class.
        hits += (scores.max(dim=1).indices == targets).sum()
        seen += targets.size(0)
        running_loss += batch_loss.item()

        batch_loss.backward()
        optimizer.step()

    mean_loss = running_loss / len(data_loader)
    return mean_loss, (hits.double() / seen).item()

def eval_model(model, data_loader, loss_fn):
    """Score ``model`` on every batch of ``data_loader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar
            tensor.

    Returns:
        ``(avg_loss, accuracy)``: the sum of the per-batch losses divided by
        the number of batches, and the fraction of examples whose
        highest-scoring class equals the label (a Python float).
    """
    model.eval()
    running_loss = 0
    hits = 0
    seen = 0

    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in data_loader:
            # Tensors are moved to the notebook-level `device`.
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            scores = model(input_ids=ids, attention_mask=mask)
            running_loss += loss_fn(scores, targets).item()

            # Index of the largest score in each row is the predicted class.
            predicted = scores.max(dim=1).indices
            hits += (predicted == targets).sum()
            seen += targets.size(0)

    mean_loss = running_loss / len(data_loader)
    return mean_loss, (hits.double() / seen).item()

# 9. Training and Evaluation
epochs = 5
for epoch in range(epochs):
    # One optimisation pass over the training split
    train_loss, train_accuracy = train_model(
        model, train_loader, optimizer, loss_fn
    )

    # Score the held-out split with the weights as they stand after this epoch
    val_loss, val_accuracy = eval_model(
        model, test_loader, loss_fn
    )

    # Per-epoch summary line (epoch numbers are shown 1-based)
    print(
        f'Epoch {epoch + 1}, '
        f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
        f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}'
    )


In [None]:
# FINAL OPTIMISED MODEL

# 1. Read the completions CSV and show its columns so the names used below can be verified
df = pd.read_csv('/final_completed_text.csv')
print(df.columns)

# 2. Encode the 'model' column as integer class ids stored in a new 'label' column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['model'])

# Show the class-name -> id mapping produced by the fitted encoder
print({
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
})

# Preview the first rows, which now include the 'label' column
print(df.head())

# 3. Dataset and Preprocessing

class LLMClassifierDataset(Dataset):
    """Sentence-pair dataset built from the completions DataFrame.

    Each item tokenizes the row's ``'original text'`` together with its
    ``'completed text'`` as one sequence pair and attaches the integer found
    in the ``'label'`` column.

    Args:
        df: DataFrame providing the ``'original text'``, ``'completed text'``
            and ``'label'`` columns.
        tokenizer: Callable tokenizer (the notebook passes a RobertaTokenizer)
            that accepts ``(text, text_pair, **options)`` and returns a mapping
            with ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'``.
        max_len: Length every encoded pair is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.sentences_xi = df['original text'].values
        self.sentences_xj = df['completed text'].values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.sentences_xi)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing cells to ``''``.

        A bare ``str()`` would turn NaN/None into the literal words
        ``'nan'``/``'None'``, which would then be tokenized as if they were
        real text.
        """
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def __getitem__(self, idx):
        """Return the encoded pair and label for row ``idx``.

        Returns:
            dict with 1-D ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors plus a scalar ``torch.long``
            ``'labels'`` tensor.
        """
        xi = self._as_text(self.sentences_xi[idx])
        xj = self._as_text(self.sentences_xj[idx])
        label = self.labels[idx]

        # Encode the sentence pair by calling the tokenizer directly; this is
        # the supported replacement for the deprecated `encode_plus`.
        inputs = self.tokenizer(
            xi,
            xj,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() drops it so the DataLoader can stack the items.
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 4. Custom Model with Activation Functions and Dropout
class CustomRobertaClassifier(nn.Module):
    """RoBERTa encoder followed by dropout, Leaky ReLU and a linear head.

    Args:
        num_labels: Number of output classes, i.e. the width of the logits.
    """

    def __init__(self, num_labels):
        super().__init__()
        encoder = RobertaModel.from_pretrained('roberta-base')
        self.roberta = encoder
        self.dropout = nn.Dropout(0.3)  # regularisation applied to the pooled features
        self.relu = nn.ReLU()  # defined here but not used by forward()
        self.leaky_relu = nn.LeakyReLU()  # activation actually applied in forward()
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class logits for a batch.

        ``token_type_ids`` is accepted for interface compatibility but is not
        passed to the encoder.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Second element of the encoder output (the pooled output), then
        # dropout followed by Leaky ReLU.
        features = self.leaky_relu(self.dropout(encoded[1]))
        return self.classifier(features)

# 5. Tokenizer and Model Initialization
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Size the classification head from the fitted LabelEncoder instead of a
# hard-coded count, so the logits always cover every label id that
# CrossEntropyLoss can receive.
num_labels = len(label_encoder.classes_)
model = CustomRobertaClassifier(num_labels=num_labels)

# 6. Split into training and testing sets
# Fixed random_state keeps the 80/20 split identical across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = LLMClassifierDataset(train_df, tokenizer, max_len=128)
test_dataset = LLMClassifierDataset(test_df, tokenizer, max_len=128)

# Only the training split is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # Optimized batch size
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 7. Optimizer, Learning Rate, and Scheduler
optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)  # Learning rate and weight decay
# The scheduler is stepped once per training batch, so the schedule length is
# batches-per-epoch times the epoch count. The 10 here must match `epochs` in
# the training loop below.
total_steps = len(train_loader) * 10

# Linear decay of the learning rate over `total_steps`, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# 8. Early Stopping Class
class EarlyStopping:
    """Flag when the validation loss has stopped improving.

    A call counts as an improvement only if the new loss is lower than the
    best loss seen so far by more than ``delta``. After ``patience``
    consecutive calls without an improvement, ``early_stop`` becomes True.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        delta: Minimum decrease in loss that counts as an improvement.
    """

    def __init__(self, patience=3, delta=0):
        self.patience, self.delta = patience, delta
        self.counter = 0
        self.best_loss = float('inf')  # any finite first loss beats this
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        improved = val_loss < self.best_loss - self.delta
        if improved:
            # New best: remember it and restart the patience countdown.
            self.best_loss = val_loss
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# Stop once the validation loss fails to improve by more than 0.001 for 3 epochs in a row
early_stopping = EarlyStopping(delta=0.001, patience=3)

# 9. Gradient Clipping
max_grad_norm = 1.0  # gradient-norm ceiling handed to train_model

# 10. Training Loop

# Loss applied to the logits returned by the classifier
loss_fn = nn.CrossEntropyLoss()

# Use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

def train_model(model, data_loader, optimizer, scheduler, loss_fn, max_grad_norm):
    """Run one training epoch with gradient clipping and per-batch LR scheduling.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of logits per example.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar
            tensor that supports ``backward()``.
        max_grad_norm: Maximum total gradient norm; gradients are rescaled to
            this norm before each optimizer step.

    Returns:
        ``(avg_loss, accuracy)``: the sum of the per-batch losses divided by
        the number of batches, and the fraction of examples predicted
        correctly (a Python float). Both are measured with the weights in
        effect before each batch's update.
    """
    model.train()
    total_loss = 0
    correct_predictions = 0
    total = 0

    for batch in data_loader:
        # Tensors are moved to the notebook-level `device`.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(logits, labels)

        # Softmax is monotonic, so the arg-max of the raw logits is already
        # the predicted class; no need to normalise first.
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total += labels.size(0)

        total_loss += loss.item()

        loss.backward()

        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total
    return avg_loss, accuracy

def eval_model(model, data_loader, loss_fn):
    """Score ``model`` on every batch of ``data_loader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of logits per example.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable taking ``(logits, labels)`` and returning a scalar
            tensor.

    Returns:
        ``(avg_loss, accuracy)``: the sum of the per-batch losses divided by
        the number of batches, and the fraction of examples predicted
        correctly (a Python float).
    """
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total = 0

    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in data_loader:
            # Tensors are moved to the notebook-level `device`.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(logits, labels)

            # Softmax is monotonic, so the arg-max of the raw logits is
            # already the predicted class; no need to normalise first.
            predictions = logits.argmax(dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total += labels.size(0)

            total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total
    return avg_loss, accuracy

# 11. Training and Evaluation
epochs = 10  # Increased epochs
for epoch in range(epochs):
    # One optimisation pass over the training split (clipped gradients, scheduled LR)
    train_loss, train_accuracy = train_model(
        model, train_loader, optimizer, scheduler, loss_fn, max_grad_norm
    )

    # Score the held-out split with the weights as they stand after this epoch
    val_loss, val_accuracy = eval_model(
        model, test_loader, loss_fn
    )

    # Per-epoch summary line (epoch numbers are shown 1-based)
    print(
        f'Epoch {epoch + 1}, '
        f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
        f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}'
    )

    # Feed the validation loss to the early-stopping monitor and keep
    # training unless it has run out of patience.
    early_stopping(val_loss)
    if not early_stopping.early_stop:
        continue
    print("Early stopping triggered.")
    break