### Lina's notebook: baseline classifiers for AI-generated vs. human-written text

In [1]:
# Import necessary libraries
import os
from pathlib import Path

import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
# Directory that holds the raw datasets.
# Set the DETECT_AI_DATA_DIR environment variable to point at your own copy of
# the data; the fallback is the original author's absolute local path, which
# only exists on that machine.
base_path = Path(os.environ.get(
    'DETECT_AI_DATA_DIR',
    '/Users/lina/code/yukaberry/detect_ai_content/raw_data/new_datasets',
))

# The modelling cells below read the 'text' and 'generated' columns of this frame.
df_ai_human = pd.read_csv(base_path / 'AI_Human.csv')

###  Model 1: MultinomialNB

In [3]:
# Feature extraction and label preparation.
# NOTE(review): the vectorizer is fit on the full corpus *before* the split, so
# its vocabulary and IDF weights are also computed from the held-out rows.
# Fitting it on the training split only would give a stricter estimate.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_ai_human['text'])  # text -> TF-IDF features
y = df_ai_human['generated']  # target labels

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Multinomial Naive Bayes on the training split
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate.
# The report and matrix are stored under *_nb names on purpose: later cells
# import `classification_report` and `confusion_matrix` from sklearn.metrics as
# functions, and binding results to those same names would shadow them whenever
# cells are re-run out of order.
accuracy = metrics.accuracy_score(y_test, y_pred)
classification_report_nb = metrics.classification_report(y_test, y_pred)
confusion_matrix_nb = metrics.confusion_matrix(y_test, y_pred)

print("Model: Multinomial Naive Bayes")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_nb)
print("Confusion Matrix:\n", confusion_matrix_nb)

Model: Multinomial Naive Bayes
Accuracy: 0.9492852524962287
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.99      0.96     61112
         1.0       0.98      0.88      0.93     36335

    accuracy                           0.95     97447
   macro avg       0.96      0.94      0.94     97447
weighted avg       0.95      0.95      0.95     97447

Confusion Matrix:
 [[60484   628]
 [ 4314 32021]]


### Model 2: TfidfVectorizer + Linear Support Vector Machine (LinearSVC) on a 10% sample

In [5]:

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

# A classic kernel SVM was too heavy for this machine on the full dataset, so
# this cell trains LinearSVC on a 10% random sample for faster experimentation.
# Because both training and evaluation use that sample, the scores below come
# from a different (smaller) test split than the other models in this notebook
# and are not directly comparable with them.
df_sample = df_ai_human.sample(frac=0.1, random_state=42)

# Feature extraction and label preparation (on the sample only).
# NOTE(review): as in the Naive Bayes cell, TF-IDF is fit before the split, so
# the held-out rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_sample['text'])
y = df_sample['generated']

# Hold out 20% of the sampled rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear SVM
model = LinearSVC()
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate.
# Results use *_svc names so they do not shadow the `classification_report` /
# `confusion_matrix` functions that later cells import from sklearn.metrics.
accuracy = metrics.accuracy_score(y_test, y_pred)
classification_report_svc = metrics.classification_report(y_test, y_pred)
confusion_matrix_svc = metrics.confusion_matrix(y_test, y_pred)

print("Model: LinearSVC with Sampled Data")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_svc)
print("Confusion Matrix:\n", confusion_matrix_svc)

Model: LinearSVC with Sampled Data
Accuracy: 0.9936377629553618
Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      1.00      0.99      6144
         1.0       0.99      0.99      0.99      3601

    accuracy                           0.99      9745
   macro avg       0.99      0.99      0.99      9745
weighted avg       0.99      0.99      0.99      9745

Confusion Matrix:
 [[6118   26]
 [  36 3565]]


### Model 3: XGBoost Setup for Gradient Boosting

In [9]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.2


In [12]:
# Gradient boosting is done with XGBoost here: it is optimized for large
# datasets and for high-dimensional inputs such as the TF-IDF text vectors.

from xgboost import XGBClassifier
from sklearn.metrics import classification_report


# Feature extraction with TF-IDF on the full dataset.
# X_tfidf and y are reused by the Random Forest cell further down, so this cell
# has to run before that one.
# NOTE(review): the vectorizer is fit before the split, so the held-out rows
# contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df_ai_human['text'])  # text -> TF-IDF features
y = df_ai_human['generated']  # target labels

# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Small, fast XGBoost configuration used as a first baseline
xgb_model = XGBClassifier(
    n_estimators=50,         # start with a low number of trees
    max_depth=3,             # shallow trees for speed
    learning_rate=0.1,       # lower than XGBoost's default of 0.3; can be tuned
    eval_metric='logloss'    # standard metric for binary classification
)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     61112
         1.0       0.96      0.90      0.93     36335

    accuracy                           0.95     97447
   macro avg       0.95      0.94      0.95     97447
weighted avg       0.95      0.95      0.95     97447



### Model 4: Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [19]:
# This cell reuses X_tfidf and y created in the XGBoost cell, so that cell must
# have been run first. The split arguments are identical to the ones used
# there (same inputs, test_size and seed).
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Random Forest: 100 depth-limited trees with a fixed seed
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    class_weight='balanced'  # weight classes inversely to their frequency
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = rf_model.predict(X_test)

# Evaluate the model.
# Accuracy is computed from the predictions already made above instead of
# rf_model.score(X_test, y_test), which would run a second prediction pass over
# the test set to obtain the same number. This also matches how accuracy is
# computed for the earlier models.
accuracy = metrics.accuracy_score(y_test, y_pred)
classification_report_rf = classification_report(y_test, y_pred)
confusion_matrix_rf = confusion_matrix(y_test, y_pred)

# Print the results
print("Model: Random Forest")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_report_rf)
print("Confusion Matrix:\n", confusion_matrix_rf)

Model: Random Forest
Accuracy: 0.9512247683356081
Classification Report:
               precision    recall  f1-score   support

         0.0       0.94      0.98      0.96     61112
         1.0       0.96      0.90      0.93     36335

    accuracy                           0.95     97447
   macro avg       0.95      0.94      0.95     97447
weighted avg       0.95      0.95      0.95     97447

Confusion Matrix:
 [[59907  1205]
 [ 3548 32787]]
