1️⃣ DATA LOADING & PREPROCESSING

In [1]:
import pandas as pd
import re
import string


In [2]:
# Load the spreadsheet. Per the preview below, its columns are
# Human-Generated, ChatGPT-Generated, Mixed Text and Domain, plus a few
# stray "Unnamed: N" columns that are blank in the previewed rows.
df = pd.read_excel("AIGTxt.xlsx")
df.head()


Unnamed: 0,Human-Generated,ChatGPT-Generated,Mixed Text,Domain,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Photo-realistic image rendering using standard...,Achieving photo-realistic image rendering thro...,Achieving photo-realistic image rendering thro...,Computer Science and Artificial Intelligence,,,
1,The Large Hadron Collider is exploring physics...,The Large Hadron Collider (LHC) serves as a fr...,The Large Hadron Collider (LHC) serves as a fr...,Computer Science and Artificial Intelligence,,,
2,With the rapid development of mobile Internet ...,In the era of swift advancements in mobile Int...,In the era of swift advancements in mobile Int...,Computer Science and Artificial Intelligence,,,
3,Most contour tracking methods can be grouped i...,Contour tracking methods can be broadly catego...,Contour tracking methods can be broadly catego...,Computer Science and Artificial Intelligence,,,
4,Year 2010 is regarded as the breakthrough year...,The pivotal year 2010 marked a significant bre...,The pivotal year 2010 marked a significant bre...,Computer Science and Artificial Intelligence,,,


In [17]:
# Keep only the named columns (drops every column whose name starts with
# "Unnamed"). Only columns are removed, so the rows are untouched; the index
# reset is just a safeguard that the frame ends up with a 0..n-1 index.
df = (
    df.loc[:, ~df.columns.str.contains("^Unnamed")]
      .reset_index(drop=True)
)

# Per-column count of missing values.
# NOTE(review): the recorded output lists a `clean_text` column that is only
# created in a later cell, so this notebook was executed out of order --
# restart the kernel and run all cells to refresh the outputs.
print(df.isnull().sum())

Human-Generated      0
ChatGPT-Generated    0
Mixed Text           0
Domain               0
clean_text           0
dtype: int64


In [19]:
# Preview the frame again now that the "Unnamed" columns have been dropped.
df.head()

Unnamed: 0,Human-Generated,ChatGPT-Generated,Mixed Text,Domain,clean_text
0,Photo-realistic image rendering using standard...,Achieving photo-realistic image rendering thro...,Achieving photo-realistic image rendering thro...,Computer Science and Artificial Intelligence,photorealistic image rendering using standard ...
1,The Large Hadron Collider is exploring physics...,The Large Hadron Collider (LHC) serves as a fr...,The Large Hadron Collider (LHC) serves as a fr...,Computer Science and Artificial Intelligence,the large hadron collider is exploring physics...
2,With the rapid development of mobile Internet ...,In the era of swift advancements in mobile Int...,In the era of swift advancements in mobile Int...,Computer Science and Artificial Intelligence,with the rapid development of mobile internet ...
3,Most contour tracking methods can be grouped i...,Contour tracking methods can be broadly catego...,Contour tracking methods can be broadly catego...,Computer Science and Artificial Intelligence,most contour tracking methods can be grouped i...
4,Year 2010 is regarded as the breakthrough year...,The pivotal year 2010 marked a significant bre...,The pivotal year 2010 marked a significant bre...,Computer Science and Artificial Intelligence,year is regarded as the breakthrough year of d...


In [20]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every clean_text() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize raw text before vectorization.

    Steps, in order: lowercase, remove every run of digits, delete ASCII
    punctuation (``string.punctuation``), collapse whitespace runs into a
    single space and trim both ends.

    Punctuation is deleted, not replaced by a space, so hyphenated words
    are fused ("photo-realistic" -> "photorealistic").

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, e.g. an empty
        spreadsheet cell) give ``""`` instead of raising; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a cleaned copy of the human-written text only. The feature-extraction
# cell below re-applies clean_text to both text columns itself, so this
# column is not read by the visible downstream code.
df['clean_text'] = df['Human-Generated'].apply(clean_text)

2️⃣ FEATURE EXTRACTION (TF-IDF)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
import pandas as pd

# Task: binary classification of human-written vs. ChatGPT-written text.
# Each source column is cleaned and wrapped in its own (text, label) frame:
# label 1 = Human-Generated, label 0 = ChatGPT-Generated.
# The "Mixed Text" column is not used here.
human_data, ai_data = (
    pd.DataFrame({'text': df[source_column].apply(clean_text), 'label': class_id})
    for source_column, class_id in (('Human-Generated', 1), ('ChatGPT-Generated', 0))
)

# Stack both frames into a single dataset with a fresh 0..n-1 index.
combined_df = pd.concat([human_data, ai_data], ignore_index=True)

# Model input (cleaned text) and target (0 = AI, 1 = Human).
X, y = combined_df['text'], combined_df['label']

# TF-IDF features, limited to at most 5000 terms.
# NOTE(review): the vectorizer is fit on the full dataset, before the
# train/test split in the next section, so the vocabulary and IDF weights
# are computed with the test rows included. Consider splitting the raw text
# first and fitting on the training part only.
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

3️⃣ TRAIN–TEST SPLIT

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
# NOTE(review): no stratify= is passed, so the class ratio in each part is
# left to the shuffle -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


In [24]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 529486 stored elements and shape (5771, 5000)>

4️⃣ MODEL TRAINING

In [25]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with scikit-learn's default
# settings (no arguments are passed), fit on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(X_train, y_train)


5️⃣ MODEL EVALUATION

In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Overall accuracy, then per-class precision/recall/F1, then the confusion
# matrix. Labels: 0 = ChatGPT-generated, 1 = human-generated. In the matrix,
# rows are the true labels and columns the predicted ones (each row sums to
# that class's support in the report).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9404019404019404

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       704
           1       0.94      0.95      0.94       739

    accuracy                           0.94      1443
   macro avg       0.94      0.94      0.94      1443
weighted avg       0.94      0.94      0.94      1443


Confusion Matrix:
 [[656  48]
 [ 38 701]]


6️⃣ SAVE MODEL & VECTORIZER

In [29]:
import pickle
# Persist the fitted classifier and the fitted vectorizer, in that order.
# The vectorizer was fit on text that had gone through clean_text, so input
# at prediction time needs the same cleaning before tfidf.transform().
for _artifact, _filename in (
    (model, "aig_detector_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    with open(_filename, "wb") as f:
        pickle.dump(_artifact, f)
