In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import stem
from nltk.corpus import stopwords
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, RegexpParser
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import BertTokenizer, BertModel
from nltk.util import ngrams
import nltk
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('df.csv')

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def preprocess_data(df):
    # Separate the target variable and features
    X = df.drop(columns=['is_ai'])
    y = df['is_ai']
    
    # Drop the 'tokens' and 'lemmas' columns as you don't want them
    X = X.drop(columns=['tokens', 'lemmas'])
    
    # Extract text column and other numerical columns
    X_text = X['text']  # Text column
    X_features = X.drop(columns=['text'])  # Other features

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer(max_features=5000)
    X_text_vec = vectorizer.fit_transform(X_text)

    # Handle non-numeric columns in X_features
    X_numeric_features = X_features.select_dtypes(include=[np.number])

    # Scale the numeric features
    scaler = StandardScaler()
    X_numeric_scaled = scaler.fit_transform(X_numeric_features)

    # Combine the vectorized text features and scaled numeric features
    X_combined = hstack((X_text_vec, X_numeric_scaled))

    # Create a new DataFrame with combined features and target variable
    df_processed = pd.DataFrame.sparse.from_spmatrix(X_combined)

    # Add back the non-text columns (keeping the ones you want)
    non_numeric_columns = X_features.select_dtypes(exclude=[np.number]).columns
    df_processed[non_numeric_columns] = X_features[non_numeric_columns]

    # Add the target variable back to the DataFrame
    df_processed['is_ai'] = y

    with open('tfidf_vectorizer.pkl', 'wb') as file:
        pickle.dump(vectorizer, file)
        
    return df_processed

In [9]:
df_processed = preprocess_data(df)

In [71]:
df_processed.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5006,5007,5008,5009,5010,5011,5012,5013,5014,is_ai
0,0,0,0,0,0,0,0,0,0,0,...,2.410252,-0.14851,-0.393955,-0.00391,-0.228649,1.824225,6.387063,-0.040032,0.708213,0


## **SVM**

In [72]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Assuming df_processed is your processed DataFrame
X = df_processed.drop(columns=['is_ai'])  # Features
y = df_processed['is_ai']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LinearSVC model (which can handle sparse matrices)
svm_model = LinearSVC()

# Train the model using the sparse matrix
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       987
           1       0.91      0.92      0.92      1013

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



In [77]:
with open('svm_model_1.pkl', 'wb') as f:
    pickle.dump(svm_model, f)

## **RF**

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

# Assuming df_processed is the output of your preprocess_data function
# df_processed = preprocess_data(df_balanced)

# Separate the target variable and features
X = df_processed.drop(columns=['is_ai'])
y = df_processed['is_ai']

# Extract the text features (sparse matrix)
X_text_vec = X.iloc[:, :-1].sparse.to_coo()  # Exclude the target column 'is_ai' and keep sparse format

# Extract numerical features (make sure they are sparse as well)
X_numeric_scaled = X.iloc[:, -1:].sparse.to_coo()  # Assuming last column is the numerical features

# Combine the sparse matrix for text features and numerical features
X_combined = hstack((X_text_vec, X_numeric_scaled))  # Keep sparse format

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.93


In [78]:
with open('rf_model_1.pkl', 'wb') as f:
    pickle.dump(rf_model, f)

## **Convultional Neural Network (CNN)**

In [80]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the DataFrame (assuming df is already loaded)
# Assuming 'df' has your feature columns (0 to 5013) and the target column 'is_ai'

# Split data into input features and target variable
X = df_processed.drop(columns=['is_ai']).values  # All columns except 'is_ai'
y = df_processed['is_ai'].values  # Target variable (whether it's AI or not)

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Reshape input data for CNN (this is important for CNN's expected input)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))  # Reshaping for CNN (samples, features, channels)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))  # Same for the test data

# Step 4: Define the CNN Model
model = models.Sequential([
    layers.Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], 1)),  # Conv1D layer for text features
    layers.MaxPooling1D(2),  # Max pooling layer
    layers.Conv1D(128, 3, activation='relu'),  # Another Conv1D layer for more feature extraction
    layers.MaxPooling1D(2),
    layers.Flatten(),  # Flatten the 1D features into a vector
    layers.Dense(128, activation='relu'),  # Fully connected layer
    layers.Dense(1, activation='sigmoid')  # Output layer (sigmoid for binary classification)
])

# Step 5: Compile the Model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Step 6: Train the Model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Step 7: Evaluate the Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 549ms/step - accuracy: 0.7838 - loss: 0.4104 - val_accuracy: 0.8930 - val_loss: 0.2341
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 491ms/step - accuracy: 0.9332 - loss: 0.1534 - val_accuracy: 0.9060 - val_loss: 0.2161
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 446ms/step - accuracy: 0.9728 - loss: 0.0716 - val_accuracy: 0.9030 - val_loss: 0.2627
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 445ms/step - accuracy: 0.9912 - loss: 0.0280 - val_accuracy: 0.9045 - val_loss: 0.3410
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 445ms/step - accuracy: 0.9958 - loss: 0.0142 - val_accuracy: 0.8865 - val_loss: 0.4735
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 445ms/step - accuracy: 0.9962 - loss: 0.0130 - val_accuracy: 0.8975 - val_loss: 0.4983
Epoc

In [81]:
model.save('cnn_model_1.keras')

## **Voting Classifier**

In [86]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib

# Load the pre-trained Random Forest and SVM models
rf_model = joblib.load('rf_model_1.pkl')     # Random Forest model
svm_model = joblib.load('svm_model_1.pkl')   # SVM model

# Prepare your data (X, y). Assuming X and y are already preprocessed
# Example of splitting the data if not done already:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Voting Classifier with only Random Forest and SVM models
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),     # Random Forest model
        ('svm', svm_model)    # SVM model
    ],
    voting='hard'  # Use 'hard' for majority class voting (you can also use 'soft' for probability-based voting)
)

# Fit the Voting Classifier on the training data
voting_clf.fit(X_train, y_train)

# Make predictions using the Voting Classifier
y_pred = voting_clf.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.4f}")

# Print detailed classification report for precision, recall, and f1-score
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)


Voting Classifier Accuracy: 0.9200
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       987
           1       0.96      0.88      0.92      1013

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

