In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from transformers import BertModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
from sklearn.metrics import classification_report
#from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional

In [2]:
# Read the raw CSV into a DataFrame; the 'wiki_intro' and 'generated_intro'
# columns are pulled out of it in the following cell.
csv_path = "GPT-wiki-intro.csv"
df = pd.read_csv(csv_path)

In [3]:
# Split the frame into its two text columns (one per class of text).
human_text, generated_text = df['wiki_intro'], df['generated_intro']

In [4]:
# Keep only the leading 30000 entries of each column; the same slice object is
# applied to both so the two subsets stay the same length.
leading_rows = slice(None, 30000)
human_text_subset = human_text[leading_rows]
generated_text_subset = generated_text[leading_rows]

In [5]:
# Class 0 labels: one integer zero per row of the human-text subset (30000 rows).
human_text_label = np.zeros(30000, dtype=int)

In [6]:
# Class 1 labels: one integer one per row of the generated-text subset (30000 rows).
generated_text_label = np.ones(30000, dtype=int)

In [7]:
# Join the two classes end to end: human rows first, generated rows second.
# Texts and labels are concatenated in the same order so they stay aligned.
text_data = np.concatenate((human_text_subset, generated_text_subset), axis=0)
text_labels = np.concatenate((human_text_label, generated_text_label), axis=0)

In [8]:
# Turn both 1-D arrays into column vectors. Using -1 lets NumPy infer the row
# count, so this cell no longer breaks if the subset size is changed upstream.
text_data = text_data.reshape((-1, 1))
text_labels = text_labels.reshape((-1, 1))

# Place each text next to its label so that shuffling rows keeps them paired.
data_and_labels = np.hstack((text_data, text_labels))

# Shuffle the rows in place with a fixed seed. An unseeded shuffle here would
# make the later train_test_split(random_state=42) non-reproducible, because
# the split's input order would differ on every kernel restart.
np.random.default_rng(42).shuffle(data_and_labels)

In [9]:
# Column 0 holds the texts and column 1 the labels; unpacking the transpose
# yields exactly those two columns, still in shuffled row order.
shuffled_data, shuffled_labels = data_and_labels.T

In [10]:
# Hold out 30% of the rows for testing; random_state pins the split itself.
x_train, x_test, y_train, y_test = train_test_split(
    shuffled_data,
    shuffled_labels,
    train_size=0.7,
    random_state=42,
)

My code starts here:

In [11]:
class BertMLPClassifier(nn.Module):
    """Encoder followed by a small two-layer MLP classification head.

    Args:
        bert_model: Encoder to wrap. It must expose ``config.hidden_size`` and,
            when called with ``input_ids`` and ``attention_mask`` keyword
            arguments, return an object carrying a ``pooler_output`` attribute.
        num_classes: Number of logits produced by the output layer.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        encoder_width = bert_model.config.hidden_size
        self.bert = bert_model
        # Light regularisation applied to the pooled encoder output.
        self.dropout = nn.Dropout(p=0.1)
        # Hidden projection (encoder width -> 256), then the output layer.
        self.fc1 = nn.Linear(encoder_width, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalised) class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of ``fc2``: one row of ``num_classes`` logits per input row.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)
        hidden = self.fc1(features).relu()
        return self.fc2(hidden)

In [12]:
def evaluate(y_true, y_pred, average="binary"):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. The default ``"binary"`` keeps
            the previous two-class behaviour; pass e.g. ``"macro"`` or
            ``"weighted"`` when the classifier has more than two classes,
            where ``"binary"`` is rejected by scikit-learn.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # Accuracy has no averaging mode; the other three share the same one.
    precision = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)
    f1 = f1_score(y_true, y_pred, average=average)
    return accuracy, precision, recall, f1

In [13]:
# Pretrained encoder that the classification head is stacked on.
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Two output logits, matching the 0/1 labels built in the data cells.
num_classes = 2
classifier_model = BertMLPClassifier(bert_model=bert_model, num_classes=num_classes)

# Loss and optimiser. The optimiser receives every parameter of the classifier,
# which includes the wrapped encoder as well as the MLP head.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=classifier_model.parameters(), lr=1e-5)


# **Main Function**

In [14]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input data
x_train_tokenized = tokenizer(list(x_train), padding=True, truncation=True, return_tensors='pt')
x_test_tokenized = tokenizer(list(x_test), padding=True, truncation=True, return_tensors='pt')

In [15]:
# Convert labels to tensors.
# The labels are class indices, and nn.CrossEntropyLoss expects int64 targets
# for class indices, so build int64 tensors directly instead of detouring
# through float32. This also makes evaluation report classes as 0/1 rather
# than 0.0/1.0.
y_train_tensor = torch.tensor(y_train.astype(np.int64))
y_test_tensor = torch.tensor(y_test.astype(np.int64))

In [16]:
# Create TensorDatasets
train_dataset = TensorDataset(x_train_tokenized['input_ids'], x_train_tokenized['attention_mask'], y_train_tensor)
test_dataset = TensorDataset(x_test_tokenized['input_ids'], x_test_tokenized['attention_mask'], y_test_tensor)

# Create DataLoader
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [29]:
# Train the model
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classifier_model.to(device)
classifier_model.train()
num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = batch

        # Move the batch to the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        # CrossEntropyLoss needs int64 class indices as targets.
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = classifier_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean per-batch loss over the epoch.
    print(f"Epoch {epoch+1} - Training Loss: {running_loss / len(train_loader)}")

Epoch 1 - Training Loss: 0.06629696579819215
Epoch 2 - Training Loss: 0.012409298639915287
Epoch 3 - Training Loss: 0.0067199887259302435
Epoch 4 - Training Loss: 0.0049644896053169215
Epoch 5 - Training Loss: 0.00316264938477005
Epoch 6 - Training Loss: 0.002938236287965552
Epoch 7 - Training Loss: 0.004203557468234812
Epoch 8 - Training Loss: 0.0022267006658984534
Epoch 9 - Training Loss: 0.0018690036982108933
Epoch 10 - Training Loss: 0.001029825019924575


In [30]:
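# NOTE: the lines below are a commented-out, single-pass variant of the training
# loop in the previous cell; they are not executed and are kept for reference only.
# for batch in train_loader: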
#     input_ids, attention_mask, labels = batch

#     # Move tensors to the appropriate device (e.g., GPU)
#     input_ids = input_ids.to('cuda')
#     attention_mask = attention_mask.to('cuda')
#     labels = labels.to('cuda')

#     # Perform operations with the batch
#     optimizer.zero_grad()
#     outputs = classifier_model(input_ids, attention_mask)
#     loss = criterion(outputs, labels)
#     loss.backward()
#     optimizer.step()
#     running_loss += loss.item()
# print(f"Epoch {epoch+1} - Training Loss: {running_loss / len(train_loader)}")

In [33]:
# Evaluate the model
classifier_model.eval()
# Run on whatever device the model currently lives on rather than assuming CUDA.
device = next(classifier_model.parameters()).device
y_true = []
y_pred = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = classifier_model(input_ids, attention_mask)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        # Labels are only collected for the metrics, so they stay on the CPU;
        # casting to int keeps them the same type as the predictions.
        y_true.extend(labels.long().tolist())
        y_pred.extend(predicted.cpu().tolist())

# Calculate evaluation metrics
accuracy, precision, recall, f1 = evaluate(y_true, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

Accuracy: 0.9883888888888889
Precision: 0.9779806659505907
Recall: 0.9995608738610166
F1 Score: 0.9886530213366632
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      8891
         1.0       0.98      1.00      0.99      9109

    accuracy                           0.99     18000
   macro avg       0.99      0.99      0.99     18000
weighted avg       0.99      0.99      0.99     18000

