In [3]:
import joblib
import numpy as np
import os
import spacy
import pandas as pd

from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import ngrams
from scipy.stats import entropy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [4]:
df_csv = pd.read_csv("../data/dataset_2/AIGTxt_DataSet.csv")
df = pd.melt(df_csv, id_vars=['Domain'], value_vars=['Human-generated text', 'ChatGPT-generated text', 'Mixed text'],
                  var_name='Class', value_name='Text')
df.head()

Unnamed: 0,Domain,Class,Text
0,Astrophysics and Astronomy,Human-generated text,Low-thrust electric propulsion offers a higher...
1,Astrophysics and Astronomy,Human-generated text,Relative navigation is one of the key enable t...
2,Astrophysics and Astronomy,Human-generated text,"In recent years, the technology of space tethe..."
3,Astrophysics and Astronomy,Human-generated text,"In celestial mechanics and mathematics, dynami..."
4,Astrophysics and Astronomy,Human-generated text,The study of attitude dynamics within a three-...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Domain  3000 non-null   object
 1   Class   3000 non-null   object
 2   Text    3000 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


In [6]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc 
        if (token.is_alpha and not token.is_stop and len(token) > 2)
    ]
    return tokens

In [7]:
df['tokens'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,Domain,Class,Text,tokens
0,Astrophysics and Astronomy,Human-generated text,Low-thrust electric propulsion offers a higher...,"[low, thrust, electric, propulsion, offer, hig..."
1,Astrophysics and Astronomy,Human-generated text,Relative navigation is one of the key enable t...,"[relative, navigation, key, enable, technology..."
2,Astrophysics and Astronomy,Human-generated text,"In recent years, the technology of space tethe...","[recent, year, technology, space, tether, draw..."
3,Astrophysics and Astronomy,Human-generated text,"In celestial mechanics and mathematics, dynami...","[celestial, mechanic, mathematic, dynamical, s..."
4,Astrophysics and Astronomy,Human-generated text,The study of attitude dynamics within a three-...,"[study, attitude, dynamic, body, problem, esse..."


In [8]:
def type_token_ratio(words):
    return len(set(words)) / len(words) if words else 0

def calculate_entropy(words):
    word_freq = Counter(words)
    probabilities = np.array(list(word_freq.values())) / len(words)
    return entropy(probabilities)

def perplexity(words):
    return np.exp(calculate_entropy(words))

def sentence_paragraph_variability(text, words):
    sentences = sent_tokenize(text)
    sentence_lengths = [len(word_tokenize(sent)) for sent in sentences]
    paragraphs = text.split("\n")
    paragraph_lengths = [len(word_tokenize(para)) for para in paragraphs if para.strip()]
    return np.std(sentence_lengths), np.std(paragraph_lengths)

def ngram_distribution(words, n=2):
    n_grams = list(ngrams(words, n))
    return Counter(n_grams).most_common(5)

def abstractness(words):
    abstract_words = {"concept", "idea", "theory", "model", "framework", "paradigm"}
    return len([word for word in words if word in abstract_words]) / len(words) if words else 0

def repetitiveness(words):
    word_freq = Counter(words)
    return max(word_freq.values()) / len(words) if words else 0

def syntactic_complexity(text):
    doc = nlp(text)
    complex_sentences = sum(1 for sent in doc.sents if sum(1 for token in sent if token.dep_ in ['advcl', 'ccomp', 'xcomp']) > 0)
    return complex_sentences / len(list(doc.sents)) if doc.sents else 0

def avg_word_sentence_length(words, text):
    sentences = sent_tokenize(text)
    avg_word_len = np.mean([len(word) for word in words]) if words else 0
    avg_sent_len = np.mean([len(word_tokenize(sent)) for sent in sentences]) if sentences else 0
    return avg_word_len, avg_sent_len

In [9]:
df['tokens'] = df['Text'].apply(clean_text)
df['ttr'] = df['tokens'].apply(type_token_ratio)
df['entropy'] = df['tokens'].apply(calculate_entropy)
df['perplexity'] = df['tokens'].apply(perplexity)
df['sentence_var'], df['paragraph_var'] = zip(*df.apply(
    lambda row: sentence_paragraph_variability(row['Text'], row['tokens']), axis=1))
df['abstractness'] = df['tokens'].apply(abstractness)
df['repetitiveness'] = df['tokens'].apply(repetitiveness)
df['syntactic_complexity'] = df['Text'].apply(syntactic_complexity)
df['avg_word_length'], df['avg_sentence_length'] = zip(*df.apply(
    lambda row: avg_word_sentence_length(row['tokens'], row['Text']), axis=1))

In [13]:
df.head()

Unnamed: 0,Domain,Class,Text,tokens,ttr,entropy,perplexity,sentence_var,paragraph_var,bigram_distribution,trigram_distribution,abstractness,repetitiveness,syntactic_complexity,avg_word_length,avg_sentence_length
0,Astrophysics and Astronomy,Human-generated text,Low-thrust electric propulsion offers a higher...,"[low, thrust, electric, propulsion, offer, hig...",0.735632,3.991781,54.151263,18.562327,0.000000,"[((low, thrust), 2), ((mission, design), 2), (...","[((low, thrust, electric), 1), ((thrust, elect...",0.000000,0.091954,0.250000,7.034483,38.800000
1,Astrophysics and Astronomy,Human-generated text,Relative navigation is one of the key enable t...,"[relative, navigation, key, enable, technology...",0.648438,4.222171,68.181373,9.904276,0.000000,"[((relative, navigation), 5), ((non, cooperati...","[((non, cooperative, target), 4), ((angles, re...",0.007812,0.039062,0.294118,7.312500,19.538462
2,Astrophysics and Astronomy,Human-generated text,"In recent years, the technology of space tethe...","[recent, year, technology, space, tether, draw...",0.740000,4.122402,61.707279,6.318098,0.000000,"[((space, mission), 2), ((end, body), 2), ((sp...","[((recent, year, technology), 1), ((year, tech...",0.000000,0.090000,0.181818,6.800000,30.714286
3,Astrophysics and Astronomy,Human-generated text,"In celestial mechanics and mathematics, dynami...","[celestial, mechanic, mathematic, dynamical, s...",0.758621,4.316672,74.938812,13.891882,6.500000,"[((configuration, primary), 3), ((body, proble...","[((celestial, mechanic, mathematic), 1), ((mec...",0.017241,0.051724,0.222222,7.387931,31.375000
4,Astrophysics and Astronomy,Human-generated text,The study of attitude dynamics within a three-...,"[study, attitude, dynamic, body, problem, esse...",0.884058,4.058210,57.870636,13.928388,0.000000,"[((study, attitude), 1), ((attitude, dynamic),...","[((study, attitude, dynamic), 1), ((attitude, ...",0.000000,0.043478,0.750000,7.304348,56.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Astrophysics and Astronomy,Human-generated text,After two attempts on the evenings of July 19 ...,"[attempt, evening, july, july, observatory, la...",0.767857,4.333932,76.243515,7.071068,4.123525,"[((space, shuttle), 2), ((attempt, evening), 1...","[((attempt, evening, july), 1), ((evening, jul...",0.000000,0.044643,0.333333,6.330357,23.000000
96,Astrophysics and Astronomy,Human-generated text,The spectropolarimeter described by Miller et ...,"[spectropolarimeter, describe, miller, achroma...",0.827160,4.111034,61.009768,9.781282,5.940539,"[((dual, beam), 2), ((frequency, high), 2), ((...","[((spectropolarimeter, describe, miller), 1), ...",0.000000,0.061728,0.500000,7.802469,24.428571
97,Astrophysics and Astronomy,Human-generated text,Astronomical optical telescopes use glass or c...,"[astronomical, optical, telescope, use, glass,...",0.739130,4.036732,56.640947,15.563880,3.623381,"[((liquid, mirror), 4), ((observe, strip), 2),...","[((astronomical, optical, telescope), 1), ((op...",0.000000,0.076087,0.500000,6.989130,28.625000
98,Astrophysics and Astronomy,Human-generated text,A continuous scanning of the sky using CCDs im...,"[continuous, scanning, sky, ccds, imply, speci...",0.813725,4.331952,76.092691,9.764576,6.127889,"[((high, resolution), 2), ((continuous, scanni...","[((continuous, scanning, sky), 1), ((scanning,...",0.000000,0.049020,0.428571,6.411765,31.714286


In [11]:
df.to_csv('../data/dataset_2/preprocessed_df_custom_features.csv', index=False)

In [14]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=42, stratify=df['Class'])

df_train.shape, df_test.shape

((2100, 16), (900, 16))

In [15]:
feature_columns = ['ttr', 'entropy', 'perplexity', 'sentence_var', 'paragraph_var', 'abstractness', 'repetitiveness', 'syntactic_complexity', 'avg_word_length', 'avg_sentence_length']
X_train, X_test = df_train[feature_columns], df_test[feature_columns]

In [16]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Class'])
y_test = label_encoder.transform(df_test['Class'])

joblib.dump(label_encoder, '../models/dataset_2/label_encoder.joblib')
print("Klasy:", label_encoder.classes_)

Klasy: ['ChatGPT-generated text' 'Human-generated text' 'Mixed text']


In [18]:
models_and_parameters = {
    'model_name': 'LR',
    'estimator': LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['saga', 'lbfgs']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: LR
Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\weran\PycharmProjects\

Najlepsze parametry dla LR: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Najlepsza dokładność (CV) dla LR: 0.4581
Model LR zapisany jako: ../models/dataset_2\LR_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.4622


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
models_and_parameters = {
    'model_name': 'DT',
    'estimator': DecisionTreeClassifier(random_state=42),
    'param_grid': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: DT
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Najlepsze parametry dla DT: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 20}
Najlepsza dokładność (CV) dla DT: 0.5267
Model DT zapisany jako: ../models/dataset_2\DT_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.5056


In [21]:
models_and_parameters = {
    'model_name': 'RF',
    'estimator': RandomForestClassifier(random_state=42),
    'param_grid': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: RF
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Najlepsze parametry dla RF: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50}
Najlepsza dokładność (CV) dla RF: 0.5314
Model RF zapisany jako: ../models/dataset_2\RF_best_model_custom_dataset2.pkl
Dokładność na zbiorze testowym: 0.4956


In [22]:
models_and_parameters = {
    'model_name': 'NB',
    'estimator': GaussianNB(),
    'param_grid': {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: NB
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Najlepsze parametry dla NB: {'var_smoothing': 1e-07}
Najlepsza dokładność (CV) dla NB: 0.4433
Model NB zapisany jako: ../models/dataset_2\NB_best_model_custom_dataset2.pkl
Dokładność na zbiorze testowym: 0.4356


In [23]:
models_and_parameters = {
    'model_name': 'KNN',
    'estimator': KNeighborsClassifier(),
    'param_grid': {
        'n_neighbors': [5, 10, 20],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: KNN
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Najlepsze parametry dla KNN: {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'uniform'}
Najlepsza dokładność (CV) dla KNN: 0.4995
Model KNN zapisany jako: ../models/dataset_2\KNN_best_model_custom_dataset2.pkl
Dokładność na zbiorze testowym: 0.4811


In [24]:
models_and_parameters = {
    'model_name': 'SVM',
    'estimator': SVC(random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'decision_function_shape': ['ovo', 'ovr']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: SVM
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Najlepsze parametry dla SVM: {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}
Najlepsza dokładność (CV) dla SVM: 0.4652
Model SVM zapisany jako: ../models/dataset_2\SVM_best_model_custom_dataset2.pkl
Dokładność na zbiorze testowym: 0.4544


In [26]:
models_and_parameters = {
    'model_name': 'MLP',
        'estimator': MLPClassifier(random_state=42, early_stopping=True, n_iter_no_change=5),
        'param_grid': {
            'hidden_layer_sizes': [(100,), (50,50), (100,50,25)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive', 'invscaling']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_custom_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: MLP
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Najlepsze parametry dla MLP: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50, 25), 'learning_rate': 'constant', 'solver': 'lbfgs'}
Najlepsza dokładność (CV) dla MLP: 0.5471
Model MLP zapisany jako: ../models/dataset_2\MLP_best_model_custom_dataset2.pkl
Dokładność na zbiorze testowym: 0.5367


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
