# In [None]:
########################
### MODEL

import pandas as pd
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import gc

# Load your dataset
df = pd.read_csv('/content/sampled_model_df_burstiness2.csv')  # Replace with your dataset path

# Hand-crafted numeric columns that are fed to the model next to the text.
FEATURE_COLUMNS = [
    'avg line length',
    'word density',
    'mean_perplexity',
    'burstiness2',
    'flesch_kincaid_score',
    'gunning_fog_score',
]

# Drop rows with a missing value in ANY column that is consumed below, not
# only 'text': a NaN in a numeric feature would be passed to the network
# unchanged (and typically turns the loss into NaN), and a NaN label cannot be
# represented as an integer class.
df = df.dropna(subset=['text', 'label'] + FEATURE_COLUMNS)

# Shuffle the dataset (fixed seed so the order is reproducible)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels
texts = df['text'].tolist()
additional_features = df[FEATURE_COLUMNS].values
# Cast explicitly: if the raw column contained NaNs, pandas loaded it as float,
# and float labels cannot be converted to an int32 tensor later on.
labels = df['label'].astype(int).tolist()

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFAutoModel.from_pretrained("bert-base-uncased")

# Set the maximum sequence length
MAX_SEQ_LENGTH = 512

# Tokenize the text data in batches to save memory
def batch_tokenize(texts, tokenizer, batch_size=32, max_length=MAX_SEQ_LENGTH):
    """Tokenize ``texts`` chunk by chunk and stitch the results together.

    Args:
        texts: Sliceable sequence of texts to encode.
        tokenizer: Callable tokenizer; it is invoked once per chunk with
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='tf'``.
        batch_size: Number of texts handed to the tokenizer per call.
        max_length: Length passed to the tokenizer as ``max_length``.

    Returns:
        A ``(input_ids, attention_masks)`` pair; each element is the
        concatenation along axis 0 of the corresponding per-chunk tensors.
    """
    ids_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='tf',
        )
        # Keep only the two fields that are returned, so the rest of the
        # encoding can be released before the next chunk is processed.
        ids_chunks.append(encoded['input_ids'])
        mask_chunks.append(encoded['attention_mask'])
    return tf.concat(ids_chunks, axis=0), tf.concat(mask_chunks, axis=0)

# Encode the whole corpus (batch_tokenize walks through it in chunks)
input_ids, attention_masks = batch_tokenize(texts, tokenizer)

# Labels as an int32 tensor
labels = tf.convert_to_tensor(labels, dtype=tf.int32)

# 80/20 train/test split with a fixed seed; the tensors are handed to
# scikit-learn as NumPy arrays
(
    input_ids_train, input_ids_test,
    attention_mask_train, attention_mask_test,
    add_features_train, add_features_test,
    y_train, y_test,
) = train_test_split(
    input_ids.numpy(),
    attention_masks.numpy(),
    additional_features,
    labels.numpy(),
    test_size=0.20,
    random_state=0,
)

# The un-split copies are not needed any more; release them to save memory
del input_ids, attention_masks, additional_features, labels
gc.collect()

# Wrap the training split as tensors again
input_ids_train = tf.convert_to_tensor(input_ids_train)
attention_mask_train = tf.convert_to_tensor(attention_mask_train)
add_features_train = tf.convert_to_tensor(add_features_train)
y_train = tf.convert_to_tensor(y_train)

# Same for the test split
input_ids_test = tf.convert_to_tensor(input_ids_test)
attention_mask_test = tf.convert_to_tensor(attention_mask_test)
add_features_test = tf.convert_to_tensor(add_features_test)
y_test = tf.convert_to_tensor(y_test)

# Create TensorFlow datasets
BATCH_SIZE = 4  # Further reduced batch size to fit into memory

def map_fn(input_ids, attention_mask, add_features, labels):
    """Repackage a 4-tuple of dataset elements as ``(features, labels)``.

    Args:
        input_ids: Stored under the ``'input_ids'`` key.
        attention_mask: Stored under the ``'attention_mask'`` key.
        add_features: Stored under the ``'additional_features'`` key.
        labels: Returned unchanged as the second tuple element.

    Returns:
        A ``(dict, labels)`` tuple whose dict carries the three model inputs.
    """
    model_inputs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        additional_features=add_features,
    )
    return model_inputs, labels

# Training dataset: slice per example, shuffle with a 1000-element buffer,
# batch, then repackage every batch as (feature dict, labels) via map_fn
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (input_ids_train, attention_mask_train, add_features_train, y_train)
    )
    .shuffle(1000)
    .batch(BATCH_SIZE)
    .map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

# Testing dataset: same pipeline without shuffling, so the example order is kept
test_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (input_ids_test, attention_mask_test, add_features_test, y_test)
    )
    .batch(BATCH_SIZE)
    .map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)

class BERTWithAdditionalFeatures(tf.keras.Model):
    """Keras model that joins a BERT output with extra numeric features.

    The extra features go through a 768-unit ReLU dense layer and dropout
    (rate 0.1), are concatenated with element ``[1]`` of the BERT output, and
    the result is fed to a sigmoid-activated dense classifier.
    """

    def __init__(self, bert_model, num_additional_features, num_classes):
        """Create the layers.

        Args:
            bert_model: Model called with the token ids and attention mask.
            num_additional_features: Accepted for interface purposes; it is
                not read by this constructor.
            num_classes: Number of output units of the classifier layer.
        """
        super().__init__()
        self.bert = bert_model
        self.additional_features_layer = tf.keras.layers.Dense(units=768, activation='relu')
        self.dropout = tf.keras.layers.Dropout(rate=0.1)  # Applied to the projected features only
        self.concat_layer = tf.keras.layers.Concatenate()
        self.classifier = tf.keras.layers.Dense(units=num_classes, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Mapping with the keys ``'input_ids'``,
                ``'attention_mask'`` and ``'additional_features'``.

        Returns:
            The output of the sigmoid classifier layer.
        """
        token_ids = inputs['input_ids']
        mask = inputs['attention_mask']
        extra = inputs['additional_features']

        # Index 1 of the BERT output; presumably the pooled sequence
        # representation for Hugging Face BERT models -- TODO confirm.
        pooled = self.bert(token_ids, attention_mask=mask)[1]

        projected = self.dropout(self.additional_features_layer(extra))
        merged = self.concat_layer([pooled, projected])
        return self.classifier(merged)

num_additional_features = 6
num_classes = 1

model = BERTWithAdditionalFeatures(
    bert_model=bert_model,
    num_additional_features=num_additional_features,
    num_classes=num_classes,
)

# Adjusted hyperparameters
learning_rate = 2e-5
epochs = 5

# Adam + binary cross-entropy; accuracy, precision and recall are tracked
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
    ],
)

# Train on the training dataset only (no validation data is passed)
history = model.fit(x=train_dataset, epochs=epochs)

# Evaluate on the held-out split; the first four entries are printed as
# loss, accuracy, precision and recall
results = model.evaluate(x=test_dataset)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}, Test Precision: {results[2]}, Test Recall: {results[3]}")

# Make predictions and threshold the model outputs at 0.5 to get 0/1 labels
predictions = model.predict(x=test_dataset)
predicted_labels = (predictions > 0.5).astype(int)

# Compute the confusion matrix and classification report
cm = confusion_matrix(y_true=y_test, y_pred=predicted_labels)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_true=y_test, y_pred=predicted_labels)
print("Classification Report:")
print(report)

# Save the model
model.save(filepath='ai_vs_human_model')
