In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import random_split
from transformers import BertTokenizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data/AI_Human.csv')
df.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [3]:
# Assuming df is your original dataframe
df_zero = df[df['generated'] == 0]
df_one = df[df['generated'] == 1]

# Sample 5000 rows from each dataframe
df_zero_sampled = df_zero.sample(5000, random_state=1)
df_one_sampled = df_one.sample(5000, random_state=1)

# Concatenate the two dataframes
df = pd.concat([df_zero_sampled, df_one_sampled])
df.reset_index(inplace=True)

In [4]:
# Determining the max length (in words) of rows of the data
maxlen = df['text'].apply(lambda x: len(x.split())).max()
print(f'Max length: {maxlen}')

Max length: 1642


In [5]:
class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = dataframe.generated
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # Calculate the length of the sequence before padding
        text_length = len([token for token in ids if token != self.tokenizer.pad_token_id])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            'text_lengths': torch.tensor(text_length, dtype=torch.long)
        }

In [6]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the Dataset
dataset = TextDataset(df, tokenizer, max_len=maxlen)

In [7]:
def create_data_loaders(dataset, train_prop=0.8, val_prop=0.1, test_prop=0.1, batch_size=128):
    train_len = int(train_prop * len(dataset))
    val_len = int(val_prop * len(dataset))
    test_len = int(test_prop * len(dataset))

    train_set, val_set, test_set = random_split(dataset, [train_len, val_len, test_len])

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [8]:
train_loader, val_loader, test_loader = create_data_loaders(dataset)

In [9]:
class LSTMClassifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, dimension=32):
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=False)
        self.drop = nn.Dropout(p=0.5)

        # self.fc = nn.Linear(2*dimension, 1)
        self.fc = nn.Linear(dimension, 1)

    def forward(self, text, text_len):

        text_emb = self.embedding(text)

        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        # out_reverse = output[:, 0, self.dimension:]
        # out_reduced = torch.cat((out_forward, out_reverse), 1)
        # text_fea = self.drop(out_reduced)

        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        text_fea = self.drop(out_forward)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

In [22]:
# Define hyperparameters
vocab_size = len(tokenizer.vocab)
embedding_dim = 100

print(f'Vocab size: {vocab_size}')

# Create the model
model = LSTMClassifier(vocab_size, embedding_dim)

Vocab size: 30522


In [23]:
# Check if CUDA is available and set device to GPU if it is, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [24]:
# Defining loss function
criterion = nn.BCELoss()

# Defining the optimizer
optimizer = torch.optim.Adam(model.parameters())

# Moving the model and loss function to same device
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Training

epochs = 10

for epoch in range(epochs):
    model.train()

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch')

    for batch in progress_bar:
        # Zero the gradients
        optimizer.zero_grad()

        # Sort the sequences by length in descending order
        text_lengths, sorted_idx = batch['text_lengths'].sort(descending=True)
        ids = batch['ids'][sorted_idx]
        targets = batch['targets'][sorted_idx]

        ids = ids.to(device)
        text_lengths = text_lengths.to(device)
        targets = targets.to(device)

        # Forward pass
        predictions = model(ids, text_lengths)
        loss = criterion(predictions, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Validation loop
    model.eval()
    with torch.no_grad():
        total_loss, total_correct, total_labels = 0, 0, 0
        for batch in val_loader:
            text_lengths, sorted_idx = batch['text_lengths'].sort(descending=True)
            ids = batch['ids'][sorted_idx]
            targets = batch['targets'][sorted_idx]

            ids = ids.to(device)
            text_lengths = text_lengths.to(device)
            targets = targets.to(device)

            predictions = model(ids, text_lengths)
            loss = criterion(predictions, targets)

            total_loss += loss.item() * ids.size(0)
            total_correct += (predictions.round() == targets).sum().item()
            total_labels += ids.size(0)

        avg_loss = total_loss / total_labels
        avg_acc = total_correct / total_labels
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()), 'validation_loss': '{:.3f}'.format(avg_loss), 
                                  'validation_accuracy': '{:.3f}'.format(avg_acc)})

In [None]:
# # Saving in Colab
# path = "/content/drive/My Drive/AI classification/model.pth"

# # Save the model
# torch.save(model.state_dict(), path)

In [11]:
# Loading saved model
path = 'model.pth'
model = LSTMClassifier(len(tokenizer.vocab), 100)
model.load_state_dict(torch.load(path))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMClassifier(
  (embedding): Embedding(30522, 100)
  (lstm): LSTM(100, 32, batch_first=True)
  (drop): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

In [12]:
from sklearn.metrics import accuracy_score

# Set the model to evaluation mode
model.eval()

# Initialize the list to store the targets and predictions
all_targets = []
all_predictions = []

# Start the evaluation loop
with torch.no_grad():
    for batch in test_loader:
        # Sort the sequences by length in descending order
        text_lengths, sorted_idx = batch['text_lengths'].sort(descending=True)
        ids = batch['ids'][sorted_idx]
        targets = batch['targets'][sorted_idx]

        ids = ids.to(device)
        text_lengths = text_lengths.to(device)
        targets = targets.to(device)

        # Forward pass
        predictions = model(ids, text_lengths).squeeze()

        # Convert the predictions and targets to the same data type and device
        predictions = torch.round(torch.sigmoid(predictions)).cpu().numpy()
        targets = targets.cpu().numpy()

        # Store the targets and predictions
        all_targets.extend(targets)
        all_predictions.extend(predictions)

# Calculate the accuracy
accuracy = accuracy_score(all_targets, all_predictions)
print(f'Accuracy: {accuracy}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Accuracy: 0.517
