In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [None]:
# Read the preprocessed essay CSV from the Kaggle input directory and preview its first rows.
# NOTE(review): `date_set` looks like a typo for `data_set`; renaming it requires
# updating every later cell that reads this variable.
date_set = pd.read_csv("/kaggle/input/ai-generated-text-preprocessed/preprocessed_data.csv")
date_set.head()

In [None]:
# Fixed seed so the shuffle, and therefore the working subset, is identical on
# every "Restart & Run All".
SHUFFLE_SEED = 42

# Shuffle every row, then renumber the index from 0.
df_train_essays_final_shuffled = (
    date_set
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# Keep positions 100000..179999 of the shuffled frame. `.iloc` makes the
# positional slice explicit; `.copy()` detaches the subset from its parent frame.
df_train_essays_final = df_train_essays_final_shuffled.iloc[100000:180000].copy()

# Fail loudly here rather than in a later cell if the CSV has too few rows.
assert not df_train_essays_final.empty, "dataset has fewer than 100001 rows; the 100000:180000 slice is empty"

In [None]:
# Show how many rows of the working subset carry each value of the `generated` label.
df_train_essays_final['generated'].value_counts()

In [None]:
# Split the working subset into the rows labelled 0 and the rows labelled 1.
generated_label = df_train_essays_final['generated']
only_zero_rows = df_train_essays_final[generated_label == 0]
only_one_rows = df_train_essays_final[generated_label == 1]

In [None]:
# Number of rows labelled 1.
len(only_one_rows)

In [None]:
# Size of the smaller class; both classes are cut down to this many rows later.
rows = min(len(only_one_rows), len(only_zero_rows))
print(rows)

In [None]:
# Keep the first `rows` examples of each class so both labels are equally represented.
equal_one_rows, equal_zero_rows = only_one_rows[:rows], only_zero_rows[:rows]

# Stack the two equally sized halves (1s first, then 0s) into one balanced dataset
# and display the resulting per-label counts.
balanced_halves = [equal_one_rows, equal_zero_rows]
combined_dataset = pd.concat(balanced_halves)
combined_dataset['generated'].value_counts()

In [None]:
# Bar chart of the per-label row counts in the balanced dataset.
balanced_label_counts = combined_dataset['generated'].value_counts()
balanced_label_counts.plot(kind='bar', rot=0, color=['blue', 'orange'])

plt.title('Bar Plot of 1s and 0s')
plt.xlabel('Values')
plt.ylabel('Count')

plt.show()

In [None]:
# Split essays and labels into train and test parts (default 75% / 25%).
# `random_state` makes the split reproducible across kernel restarts;
# `stratify` preserves the balanced label ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    combined_dataset['essay'],
    combined_dataset['generated'],
    stratify=combined_dataset['generated'],
    random_state=42,
)

## Ensembling BERT with DistilBERT

### Load the saved bert model 

In [None]:
# TF Hub layers used by the BERT classifier built in the next cell:
# `bert_preprocess` is applied to the raw string input, and `bert_encoder`
# (uncased, L-12 / H-768 / A-12 per its URL) consumes the preprocessed result.
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
# Metrics tracked by the BERT classifier.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Rebuild the architecture the saved weights were produced with:
# raw string -> BERT preprocessing -> BERT encoder.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout on the pooled output, then one sigmoid unit.
pooled_output = outputs['pooled_output']
dropout_output = tf.keras.layers.Dropout(0.1, name="dropout")(pooled_output)
sigmoid_output = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(dropout_output)

new_model = tf.keras.Model(inputs=[text_input], outputs=[sigmoid_output])
new_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

# Restore the previously trained weights into the freshly built graph.
new_model.load_weights('/kaggle/input/model-weights/my_model_weights.h5')

### Load the saved distilbert model 

In [None]:
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Checkpoint name, also used below to load the matching tokenizer.
MODEL_NAME = 'distilbert-base-uncased'
N_EPOCHS = 10

# Optimizer and loss handed to compile(); the loss is configured for raw logits.
optimizer = optimizers.Adam(learning_rate=3e-5)
loss_fn = losses.SparseCategoricalCrossentropy(from_logits=True)

# Build the sequence-classification model from the base checkpoint, compile it,
# then overwrite its weights with the saved fine-tuned ones.
model_distilbert = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
model_distilbert.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)
model_distilbert.load_weights('/kaggle/input/distilbert-ml-project/new_distilbert_model_weights.h5')


### Prepare data for distilbert model

In [None]:
def construct_encodings(x, tkzr, trucation=True, padding=True):
    """Tokenize `x` by calling the tokenizer `tkzr`.

    Args:
        x: Text input forwarded unchanged as the tokenizer's first argument.
        tkzr: Callable tokenizer accepting `truncation` and `padding` keywords.
        trucation: Forwarded as the tokenizer's `truncation` keyword. The
            misspelled parameter name is kept so existing keyword callers still work.
        padding: Forwarded as the tokenizer's `padding` keyword.

    Returns:
        Whatever `tkzr` returns for the given input and options.
    """
    tokenizer_options = {"truncation": trucation, "padding": padding}
    return tkzr(x, **tokenizer_options)

def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer encodings, and optionally labels, in a `tf.data.Dataset`.

    Args:
        encodings: Mapping-like tokenizer output; converted with `dict(...)`
            before being sliced into dataset elements.
        y: Optional labels aligned with `encodings`. May be a list, numpy
            array or pandas Series. `None` means "no labels".

    Returns:
        A dataset of `(features, label)` pairs when `y` is given, otherwise a
        dataset of feature dicts only.
    """
    features = dict(encodings)
    # Compare against None explicitly: truth-testing a multi-element numpy
    # array or Series raises, and an empty label list is falsy and would be
    # silently treated as "no labels".
    if y is not None:
        return tf.data.Dataset.from_tensor_slices((features, y))
    # Unlabelled case: used when predicting on unseen samples.
    return tf.data.Dataset.from_tensor_slices(features)

# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the DistilBERT model.
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

def create_tf_set(dataX, dataY, tkzr):
    """Tokenize `dataX` with `tkzr` and pair the encodings with `dataY`.

    Args:
        dataX: Texts passed to `construct_encodings`.
        dataY: Labels passed to `construct_tfdataset` alongside the encodings.
        tkzr: Tokenizer forwarded to `construct_encodings`.

    Returns:
        The dataset produced by `construct_tfdataset`.
    """
    return construct_tfdataset(construct_encodings(dataX, tkzr), dataY)


In [None]:
# Convert each split to a plain Python list before it is passed to create_tf_set.
X_train_list, y_train_list = X_train.tolist(), y_train.tolist()
X_test_list, y_test_list = X_test.tolist(), y_test.tolist()


In [None]:
# Number of examples per batch fed to DistilBERT.
BATCH_SIZE = 16

# Tokenize each split and wrap it, together with its labels, in a dataset.
tf_test = create_tf_set(X_test_list, y_test_list, tkzr)
tf_train = create_tf_set(X_train_list, y_train_list, tkzr)

# Batch without shuffling, so row order still matches X_test / X_train.
tfdataset_train = tf_train.batch(BATCH_SIZE)
tfdataset_test = tf_test.batch(BATCH_SIZE)

In [None]:
# Evaluate DistilBERT on the held-out split and print the metrics dict.
# `tfdataset_test` is already batched with BATCH_SIZE, so `batch_size` is not
# passed: Keras documents that it must not be specified for dataset inputs.
val = model_distilbert.evaluate(tfdataset_test, return_dict=True)
print(val)

## Ensembling by averaging the two models' predictions

In [None]:
# The BERT head ends in a single sigmoid unit, so predict() already returns one
# probability per row, as an (n, 1) column.
preds_model1 = new_model.predict(X_test)

# DistilBERT returns raw logits: softmax them, take the class-1 column and
# reshape to (n, 1) so it lines up with the BERT column.
distilbert_logits = model_distilbert.predict(tfdataset_test).logits
preds_model2 = tf.nn.softmax(distilbert_logits, axis=-1)[:, 1].numpy().reshape(-1, 1)

# Soft ensemble: element-wise mean of the two probability columns.
ensemble_preds = (preds_model1 + preds_model2) / 2
# print(ensemble_preds[0])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

def calc_accuracy(ensemble_preds, y_test):
    """Print the accuracy and plot the confusion matrix of binary predictions.

    Args:
        ensemble_preds: Array-like of probabilities or hard votes (bool or 0/1)
            of any shape. It is flattened, and values strictly greater than 0.5
            are treated as class 1.
        y_test: True 0/1 labels in the same order as the flattened predictions.

    Returns:
        None. The accuracy is printed and the confusion matrix is shown as a heatmap.
    """
    class_labels = [0, 1]

    # np.asarray accepts lists and tensors as well as ndarrays.
    y_pred = (np.asarray(ensemble_preds).ravel() > 0.5).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    # Pin the label set so the matrix is always 2x2 and matches the tick labels,
    # even when only one class occurs in the labels and predictions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    print(f'Accuracy: {accuracy * 100:.2f}%')

    # Plot the confusion matrix using seaborn.
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

In [None]:
# Report accuracy and the confusion matrix for the current `ensemble_preds`
# (at this point: the averaged probabilities) against the test labels.
calc_accuracy(ensemble_preds, y_test)

## Ensembling using voting 

In [None]:
# Hard 0/1 vote from BERT: threshold its sigmoid output at 0.5.
preds_model1 = (new_model.predict(X_test) > 0.5).astype(int)

# Hard 0/1 vote from DistilBERT: softmax the logits, take the class-1 column
# as an (n, 1) array, then threshold at 0.5.
distilbert_probs = tf.nn.softmax(model_distilbert.predict(tfdataset_test).logits, axis=-1)
preds_model2 = (distilbert_probs[:, 1].numpy().reshape(-1, 1) > 0.5).astype(int)

# With only two voters, "at least one vote" labels a row 1 whenever either
# model votes 1, i.e. this is a logical OR of the two votes.
ensemble_preds = (preds_model1 + preds_model2) >= 1

In [None]:
# Report accuracy and the confusion matrix for the voting ensemble's boolean predictions.
calc_accuracy(ensemble_preds, y_test)

## Ensembling using stacking

In [None]:
# preds_model1_train = new_model.predict(X_train)
# preds_model2_train = loaded_xlnet.predict(train_input_ids)

# meta_model_input = np.concatenate([preds_model1_train, preds_model2_train], axis=1)

# meta_model = tf.keras.Sequential([
#     tf.keras.layers.Dense(1, activation='sigmoid', input_dim=2),  # Adjust input_dim based on the number of base models
# ])

# meta_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# meta_model.fit(meta_model_input, y_train, epochs=10, batch_size=32)

# # Make predictions on test data
# preds_model1_test = new_model.predict(X_test)
# preds_model2_test = loaded_xlnet.predict(test_input_ids)

# meta_model_input_test = np.concatenate([preds_model1_test, preds_model2_test], axis=1)

# ensemble_preds = meta_model.predict(meta_model_input_test)


In [None]:
# --- Base-model predictions on the training split (inputs to the meta-model) ---
preds_model1_train = new_model.predict(X_train)

# DistilBERT logits -> softmax -> class-1 column as an (n, 1) array.
preds_model2_train = tf.nn.softmax(
    model_distilbert.predict(tfdataset_train).logits, axis=-1
)[:, 1].numpy().reshape(-1, 1)

# One row per essay, one column per base model.
meta_model_input = np.concatenate((preds_model1_train, preds_model2_train), axis=1)

# --- Meta-model: a single sigmoid unit over the two base-model probabilities ---
meta_model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, activation='sigmoid', input_dim=2),  # input_dim = number of base models
])
meta_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
meta_model.fit(meta_model_input, y_train, epochs=10, batch_size=32)

# --- Base-model predictions on the test split, stacked the same way ---
preds_model1_test = new_model.predict(X_test)
preds_model2_test = tf.nn.softmax(
    model_distilbert.predict(tfdataset_test).logits, axis=-1
)[:, 1].numpy().reshape(-1, 1)

meta_model_input_test = np.concatenate((preds_model1_test, preds_model2_test), axis=1)

# Final stacked prediction: the meta-model's output for each test row.
ensemble_preds = meta_model.predict(meta_model_input_test)


In [None]:
# Report accuracy and the confusion matrix for the stacked meta-model's predictions.
calc_accuracy(ensemble_preds, y_test)