In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Read the training CSV into a DataFrame; later cells use its 'text' and
# 'label' columns.
# NOTE(review): absolute path under /content — presumably a Colab runtime; confirm.
file_path = "/content/train_v2_drcat_02.csv"
data = pd.read_csv(file_path)

# Define a preprocessing function
def preprocess_text(text):
    """Normalize raw text before vectorization / tokenization.

    The text is lowercased, every character that is not an ASCII letter
    (a-z) or whitespace is removed (digits, punctuation and accented
    letters are dropped), and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace stripped.

    Args:
        text: The raw text. Missing values (``None`` or float NaN, which is
            what ``pd.read_csv`` produces for empty cells) are treated as an
            empty string; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-z\s]", "", text)  # Remove special characters and numbers
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace
    return text

# Clean every essay once and store the result alongside the raw text
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Features are the cleaned essays, targets are the labels
X, y = data['cleaned_text'], data['label']

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vocabulary is learned from the training split only and then reused
# unchanged for the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the width of the resulting feature matrix
print(f"Number of features extracted: {X_train_tfidf.shape[1]}")

Number of features extracted: 5000


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a class-weight-balanced logistic regression on the TF-IDF features
# (fit returns the estimator itself, so construction and fitting chain)
model = LogisticRegression(max_iter=500, class_weight='balanced').fit(
    X_train_tfidf, y_train
)

# Predict labels for the held-out split
y_pred = model.predict(X_test_tfidf)

# Overall accuracy followed by the per-class precision/recall/F1 table
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9933140182750168

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5474
           1       0.99      0.99      0.99      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974



In [None]:
# Function to preprocess the input text
def preprocess_custom_text(text):
    """Clean a user-supplied string the same way the training text was cleaned.

    This is a thin wrapper around ``preprocess_text`` (defined in the first
    cell) so that inference-time cleaning cannot drift from the cleaning
    applied before the vectorizer was fitted.

    Args:
        text: The raw input text.

    Returns:
        The cleaned text, exactly as ``preprocess_text`` returns it.
    """
    return preprocess_text(text)

# Human-readable names for the two class labels
label_map = {0: "Human-Written", 1: "AI-Generated"}

# Sample input to classify
custom_text = "Alex"

# Clean the sample, then project it into the fitted TF-IDF space
# (transform expects an iterable of documents, hence the one-element list)
processed_text = preprocess_custom_text(custom_text)
text_tfidf = tfidf_vectorizer.transform([processed_text])

# predict returns one label per input row; there is exactly one row here
predicted_label = model.predict(text_tfidf)[0]

# Report the result
print(f"The text is classified as: {label_map[predicted_label]}")

NameError: name 'tfidf_vectorizer' is not defined

In [None]:
!pip install keras



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Tokenizer settings shared by training and inference
vocab_size = 5000  # Limit the vocabulary size
max_length = 200  # Maximum length of a sequence

# Learn the word index from the training split only; words outside the
# vocabulary are mapped to the "<OOV>" token
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integer word ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad/truncate both splits identically. pad_sequences truncates from the
# front by default (truncating='pre'), so truncating='post' must be passed
# for the test split as well; otherwise long test essays would keep their
# last max_length tokens while training essays keep their first max_length.
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')

In [None]:
# Build the LSTM classifier.
# Note: this rebinds the notebook-level name `model`, replacing the
# logistic regression estimator created in an earlier cell.
embedding_dim = 128
model = Sequential([
    # Declare the input shape explicitly. Embedding's `input_length`
    # argument is deprecated in Keras 3; an Input layer is the supported way
    # to fix the sequence length and have the model built before summary().
    Input(shape=(max_length,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),  # Output layer for binary classification
])

# Compile the model for binary classification and print its architecture
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Fit the network for five epochs in mini-batches of 64.
# The held-out test split is passed as validation data, so its loss and
# accuracy are reported at the end of every epoch.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_padded, y_test),
    verbose=1,
)

Epoch 1/5
[1m275/561[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m1:42[0m 359ms/step - accuracy: 0.9806 - loss: 0.0670

In [None]:
# Score the trained network on the held-out split. The model was compiled
# with a single 'accuracy' metric, so evaluate yields the loss followed by
# the accuracy.
loss, accuracy = model.evaluate(x=X_test_padded, y=y_test, verbose=1)
print(f"Test Accuracy: {accuracy}")

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 94ms/step - accuracy: 0.9874 - loss: 0.0408
Test Accuracy: 0.9880766868591309


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Reload the saved LSTM classifier from disk (rebinds `model`)
model = load_model('/content/lstm_text_classifier.keras')

# Sample input to classify
custom_text = "Alex"

# Clean the text with preprocess_text first: the tokenizer was fitted on
# cleaned text, so raw input must go through the same cleaning before it is
# converted to word ids. Padding/truncation mirrors the training settings.
custom_seq = tokenizer.texts_to_sequences([preprocess_text(custom_text)])
custom_padded = pad_sequences(custom_seq, maxlen=max_length, padding='post', truncating='post')

# Predict: the sigmoid output is a single value for the single input row;
# values above 0.5 are mapped to label 1
prediction = model.predict(custom_padded)[0][0]
label = 1 if prediction > 0.5 else 0
label_map = {0: "Human-Written", 1: "AI-Generated"}
print(f"The text is classified as: {label_map[label]}")

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
The text is classified as: AI-Generated


In [None]:
# Save to the exact path that the load_model(...) cells read from, so
# saving and loading agree regardless of the current working directory.
model.save('/content/lstm_text_classifier.keras')

Dataset Preview

In [None]:
from IPython.display import display

# Re-read the CSV from disk for the preview.
# Note: this rebinds `data`, so the 'cleaned_text' column added to the
# earlier DataFrame is not present on the freshly loaded one.
file_path = '/content/train_v2_drcat_02.csv'
data = pd.read_csv(file_path)

# Rich (table) rendering of the first few rows
display(data.head())

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


## ***In-Class Live Demo***

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Reload the saved LSTM classifier from disk (rebinds `model`)
model = load_model('/content/lstm_text_classifier.keras')

# The label names never change, so build the mapping once outside the loop
label_map = {0: "Human-Written", 1: "AI-Generated"}

while True:
    # Get input text from the user
    user_input = input("Enter text (or type 'exit' to quit): ")

    # Accept 'exit' in any letter case and with surrounding whitespace
    if user_input.strip().lower() == 'exit':
        print("Exiting...")
        break

    # Clean the input the same way the training text was cleaned
    processed_text = preprocess_text(user_input)
    if not processed_text:
        # Nothing is left after cleaning (empty input, or only digits and
        # punctuation); the model would only see padding, so ask again.
        print("Please enter some text containing letters.")
        continue

    # Tokenize and pad with the same settings used for the training data:
    # the shared max_length and truncation from the end of the sequence
    text_seq = tokenizer.texts_to_sequences([processed_text])
    text_padded = pad_sequences(text_seq, maxlen=max_length, padding='post', truncating='post')

    # The sigmoid output is a single value for the single input row;
    # values above 0.5 are mapped to label 1
    prediction = model.predict(text_padded)[0][0]
    label = 1 if prediction > 0.5 else 0
    print(f"The text is classified as: {label_map[label]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428ms/step
The text is classified as: Human-Written
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
The text is classified as: AI-Generated
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
The text is classified as: AI-Generated
