In [1]:
import pandas as pd
import string
import pickle
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# The data was gathered from two Kaggle sources; each CSV is read into its own
# DataFrame here and the two are merged in a later step.
data1 = pd.read_csv("/kaggle/input/ai-vs-human-text/AI_Human.csv")
data2 = pd.read_csv("/kaggle/input/daigt-v4-train-dataset/train_v4_drcat_01.csv")

In [3]:
# Inspect the second dataset's column names before aligning them with data1.
data2.columns

Index(['text', 'label', 'prompt_name', 'source', 'RDizzl3_seven', 'model'], dtype='object')

In [4]:
# Reduce both sources to the same two columns. The second dataset calls its
# target column 'label', so it is renamed to 'generated' to match the first.
COMMON_COLUMNS = ['text', 'generated']

data1 = data1[COMMON_COLUMNS]
data2 = data2.rename(columns={'label': 'generated'})[COMMON_COLUMNS]

# Stack the two frames and renumber the rows from 0.
data = pd.concat([data1, data2], ignore_index=True)

In [5]:
# Lower-case the text first so rows that differ only in capitalisation are
# treated as duplicates, then drop fully duplicated rows (first one is kept).
data = data.assign(text=data['text'].str.lower()).drop_duplicates()

In [6]:
# Translation table that deletes every character in string.punctuation.
# Built once so it is not recreated for each document.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def rm_punc(txt):
    """Return ``txt`` with every character in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    other characters (letters, digits, whitespace, non-ASCII symbols such as
    typographic quotes) are left untouched.

    Args:
        txt: The string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    # str.translate does the deletion in a single pass, which is far cheaper
    # than testing each character against the punctuation string in Python.
    return txt.translate(_PUNCT_TABLE)

# Strip ASCII punctuation from every document.
data['text'] = data['text'].apply(rm_punc)

# The English stop-word list ships as separate NLTK data. If it is not
# installed, stopwords.words() raises LookupError, so fetch it once and retry
# instead of failing on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [7]:
def rm_stop(txt):
    """Drop every whitespace-separated token of ``txt`` found in ``stop_words``.

    The text is split on whitespace, tokens present in the module-level
    ``stop_words`` set are discarded, and the remaining tokens are joined
    back together with single spaces.
    """
    return ' '.join(token for token in txt.split() if token not in stop_words)

# Remove stop words from every document (runs after punctuation stripping).
data['text'] = data['text'].apply(rm_stop)


In [8]:
# 60/40 train/test split, stratified on the label so both partitions keep the
# same class proportions; the fixed seed makes the split reproducible.
features = data['text']
labels = data['generated']

x_tr, x_te, y_tr, y_te = train_test_split(
    features,
    labels,
    test_size=0.4,
    random_state=56,
    stratify=labels,
)

In [9]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
]
model_pipeline = Pipeline(steps=pipeline_steps)

# fit() returns the pipeline itself, so the cell still displays the fitted model.
model_pipeline.fit(x_tr, y_tr)

In [10]:
# Evaluate on the held-out split: per-class precision, recall and F1.
y_pred = model_pipeline.predict(x_te)
report = classification_report(y_te, y_pred)
print(report)


              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95    122339
         1.0       0.98      0.88      0.93     84969

    accuracy                           0.94    207308
   macro avg       0.95      0.93      0.94    207308
weighted avg       0.95      0.94      0.94    207308



In [11]:
# Save the trained model: the whole fitted pipeline (vectorizer, TF-IDF
# transformer and classifier) is pickled into the current working directory.
with open('ai_text_detector_model.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)

In [12]:
# Reload the pipeline from the same relative path it was saved to, so the
# save/load round-trip does not depend on the working directory being
# /kaggle/working.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('ai_text_detector_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [4]:
def test_model(input_text):
    """Classify a piece of text as AI generated or human written.

    The text is preprocessed the same way as the training data (lower-cased,
    punctuation stripped with ``rm_punc``, stop words removed with
    ``rm_stop``) and then scored by the unpickled pipeline ``loaded_model``.

    Args:
        input_text: Raw text to classify.

    Returns:
        A ``(label, confidence)`` tuple where ``label`` is ``"AI generated"``
        when the predicted class equals 1 and ``"Human Written"`` otherwise,
        and ``confidence`` is the predicted class's probability.
    """
    # Preprocess input exactly like the training corpus.
    cleaned_text = rm_stop(rm_punc(input_text.lower()))

    # Pipeline.predict_proba runs vectorizer -> tfidf -> classifier in a single
    # pass, so the text is not transformed twice (once for predict, once by hand).
    pred_prob = loaded_model.predict_proba([cleaned_text])[0]

    # The most probable class is the prediction; its probability is the confidence.
    best = int(np.argmax(pred_prob))
    pred = loaded_model.classes_[best]
    confidence = pred_prob[best]

    label = "AI generated" if pred == 1 else "Human Written"

    return label, confidence

# Example usage: classify a short sample paragraph and print the predicted
# label together with the probability of that label.
input_text = "Yesterday, I walked to the park to clear my head. The air was crisp, and the leaves crunched underfoot. I couldn’t help but feel a sense of nostalgia, remembering how my siblings and I used to play here when we were kids. It’s funny how places can hold so many memories."
label, confidence = test_model(input_text)
print(f"Prediction: {label}, Confidence: {confidence:.2f}")

Prediction: Human Written, Confidence: 0.58
