In [5]:
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

RANDOM_STATE = 42

df = pd.read_csv('analyzed_data.csv')


features = df[['average_sentence_length', 'average_word_length', 'comma_frequency',
               'punctuation_frequency', 'unique_word_count', 'zipf_ratio']]
df['label'] = df['file_name'].apply(lambda x: "AI" if 'ai/' in x else "Human")
target = df['label']

print(df)

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Feature Importance
importances = clf.feature_importances_
feature_names = features.columns

# Plotting Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances, color='skyblue')
plt.xlabel('Importance Score')
plt.title('Feature Importance in Random Forest Model')
plt.show()

ModuleNotFoundError: No module named 'shap'

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()        # Initialize scaler
X_train_scaled = scaler.fit_transform(X_train) # Standardize training features
X_test_scaled = scaler.transform(X_test) # Standardize testing features (using scaler.transform)


model = LogisticRegression(random_state=RANDOM_STATE)
model.fit(X_train_scaled, y_train)

pred = model.predict(X_test_scaled)  # Classify test set samples
acc_logreg_clf = accuracy_score(y_test, pred) # Compute accuracy scores


print(f"Logistic Regression model accuracy: {np.round(acc_logreg_clf, 3)}")

print("Class order:", model.classes_)


Logistic Regression model accuracy: 1.0
Class order: ['AI' 'Human']


In [None]:
import joblib

# Save the model and scaler
joblib.dump(model, "logistic_regression_model.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train_scaled, y_train)

y_pred = knn.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

KNN Model Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

          AI       1.00      1.00      1.00         8
       human       1.00      1.00      1.00        12

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20



In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

tree_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
tree_clf.fit(X_train, y_train)


y_tree_pred = tree_clf.predict(X_test)


print("Decision Tree Accuracy:", accuracy_score(y_test, y_tree_pred))

Decision Tree Accuracy: 0.95
