In [2]:
import joblib
import numpy as np
import os
import spacy
import pandas as pd

from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import ngrams
from scipy.stats import entropy
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [3]:
df_init = pd.read_excel('../data/dataset_1/ieee-init.xlsx')
df_init['Class'] = 'Human-generated text'

df_polish = pd.read_excel('../data/dataset_1/ieee-chatgpt-polish.xlsx')
df_polish['Class'] = 'ChatGPT-polish text'

df_generation = pd.read_excel('../data/dataset_1/ieee-chatgpt-generation.xlsx')
df_generation['Class'] = 'ChatGPT-generated text'

df_fusion = pd.read_excel('../data/dataset_1/ieee-chatgpt-fusion.xlsx')
df_fusion['Class'] = 'Mixed text'

df = pd.concat([df_init, df_generation, df_polish, df_fusion], ignore_index=True)
df.head()

Unnamed: 0.2,id,title,keyword,abstract,Class,Unnamed: 0.1,Unnamed: 0,index
0,8600003,An Improved Variable-Node-Based BP Decoding Al...,"""Flash memories"",""Reliability"",""Decoding"",""Par...",To solve the problems of the data reliability ...,Human-generated text,,,
1,8600004,Mobile Robot Location Algorithm Based on Impro...,"""Sociology"",""Statistics"",""Simultaneous localiz...",To solve the simultaneous localization and map...,Human-generated text,,,
2,8600008,Vertical Handoff Decision Algorithm for Hetero...,"""Entropy"",""Handover"",""Wireless networks"",""Deci...",In the future scenario of multiple wireless ne...,Human-generated text,,,
3,8600013,Robust offline trained neural network for TDOA...,"""Microphones"",""Artificial neural networks"",""Po...",Passive sound source localization (SSL) using ...,Human-generated text,,,
4,8600014,Gaussian MAC with Feedback and Strictly Causal...,"""Encoding"",""Transmitters"",""Decoding"",""Indexes""...",We consider a two user Gaussian multiple acces...,Human-generated text,,,


In [4]:
df = df.drop(columns=['index', 'Unnamed: 0', 'Unnamed: 0.1'])
df = df.rename(columns={'abstract': 'Text'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50699 entries, 0 to 50698
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       50699 non-null  int64 
 1   title    50699 non-null  object
 2   keyword  50699 non-null  object
 3   Text     50699 non-null  object
 4   Class    50699 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


In [5]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_ for token in doc 
        if (token.is_alpha and not token.is_stop and len(token) > 2)
    ]
    return tokens

In [6]:
df['tokens'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,id,title,keyword,Text,Class,tokens
0,8600003,An Improved Variable-Node-Based BP Decoding Al...,"""Flash memories"",""Reliability"",""Decoding"",""Par...",To solve the problems of the data reliability ...,Human-generated text,"[solve, problem, data, reliability, nand, flas..."
1,8600004,Mobile Robot Location Algorithm Based on Impro...,"""Sociology"",""Statistics"",""Simultaneous localiz...",To solve the simultaneous localization and map...,Human-generated text,"[solve, simultaneous, localization, mapping, s..."
2,8600008,Vertical Handoff Decision Algorithm for Hetero...,"""Entropy"",""Handover"",""Wireless networks"",""Deci...",In the future scenario of multiple wireless ne...,Human-generated text,"[future, scenario, multiple, wireless, network..."
3,8600013,Robust offline trained neural network for TDOA...,"""Microphones"",""Artificial neural networks"",""Po...",Passive sound source localization (SSL) using ...,Human-generated text,"[passive, sound, source, localization, ssl, ti..."
4,8600014,Gaussian MAC with Feedback and Strictly Causal...,"""Encoding"",""Transmitters"",""Decoding"",""Indexes""...",We consider a two user Gaussian multiple acces...,Human-generated text,"[consider, user, gaussian, multiple, access, c..."


In [7]:
def type_token_ratio(words):
    return len(set(words)) / len(words) if words else 0

def calculate_entropy(words):
    word_freq = Counter(words)
    probabilities = np.array(list(word_freq.values())) / len(words)
    return entropy(probabilities)

def perplexity(words):
    return np.exp(calculate_entropy(words))

def sentence_paragraph_variability(text, words):
    sentences = sent_tokenize(text)
    sentence_lengths = [len(word_tokenize(sent)) for sent in sentences]
    paragraphs = text.split("\n")
    paragraph_lengths = [len(word_tokenize(para)) for para in paragraphs if para.strip()]
    return np.std(sentence_lengths), np.std(paragraph_lengths)

def ngram_distribution(words, n=2):
    n_grams = list(ngrams(words, n))
    return Counter(n_grams).most_common(5)

def abstractness(words):
    abstract_words = {"concept", "idea", "theory", "model", "framework", "paradigm"}
    return len([word for word in words if word in abstract_words]) / len(words) if words else 0

def repetitiveness(words):
    word_freq = Counter(words)
    return max(word_freq.values()) / len(words) if words else 0

def syntactic_complexity(text):
    doc = nlp(text)
    complex_sentences = sum(1 for sent in doc.sents if sum(1 for token in sent if token.dep_ in ['advcl', 'ccomp', 'xcomp']) > 0)
    return complex_sentences / len(list(doc.sents)) if doc.sents else 0

def avg_word_sentence_length(words, text):
    sentences = sent_tokenize(text)
    avg_word_len = np.mean([len(word) for word in words]) if words else 0
    avg_sent_len = np.mean([len(word_tokenize(sent)) for sent in sentences]) if sentences else 0
    return avg_word_len, avg_sent_len

In [8]:
df['tokens'] = df['Text'].apply(clean_text)
df['ttr'] = df['tokens'].apply(type_token_ratio)
df['entropy'] = df['tokens'].apply(calculate_entropy)
df['perplexity'] = df['tokens'].apply(perplexity)
df['sentence_var'], df['paragraph_var'] = zip(*df.apply(
    lambda row: sentence_paragraph_variability(row['Text'], row['tokens']), axis=1))
df['abstractness'] = df['tokens'].apply(abstractness)
df['repetitiveness'] = df['tokens'].apply(repetitiveness)
df['syntactic_complexity'] = df['Text'].apply(syntactic_complexity)
df['avg_word_length'], df['avg_sentence_length'] = zip(*df.apply(
    lambda row: avg_word_sentence_length(row['tokens'], row['Text']), axis=1))

In [9]:
df.to_csv('../data/dataset_1/preprocessed_df_custom_features.csv', index=False)

In [10]:
df.head()

Unnamed: 0,id,title,keyword,Text,Class,tokens,ttr,entropy,perplexity,sentence_var,paragraph_var,abstractness,repetitiveness,syntactic_complexity,avg_word_length,avg_sentence_length
0,8600003,An Improved Variable-Node-Based BP Decoding Al...,"""Flash memories"",""Reliability"",""Decoding"",""Par...",To solve the problems of the data reliability ...,Human-generated text,"[solve, problem, data, reliability, nand, flas...",0.714286,3.891327,48.975853,8.01561,0.0,0.0,0.051948,1.0,6.974026,33.5
1,8600004,Mobile Robot Location Algorithm Based on Impro...,"""Sociology"",""Statistics"",""Simultaneous localiz...",To solve the simultaneous localization and map...,Human-generated text,"[solve, simultaneous, localization, mapping, s...",0.710145,3.761766,43.024356,7.303899,0.0,0.0,0.086957,0.857143,7.507246,22.714286
2,8600008,Vertical Handoff Decision Algorithm for Hetero...,"""Entropy"",""Handover"",""Wireless networks"",""Deci...",In the future scenario of multiple wireless ne...,Human-generated text,"[future, scenario, multiple, wireless, network...",0.76,3.963848,52.659546,7.605261,0.0,0.0,0.04,0.4,7.32,29.4
3,8600013,Robust offline trained neural network for TDOA...,"""Microphones"",""Artificial neural networks"",""Po...",Passive sound source localization (SSL) using ...,Human-generated text,"[passive, sound, source, localization, ssl, ti...",0.728395,3.939887,51.412767,5.314593,0.0,0.0,0.074074,0.285714,6.864198,21.571429
4,8600014,Gaussian MAC with Feedback and Strictly Causal...,"""Encoding"",""Transmitters"",""Decoding"",""Indexes""...",We consider a two user Gaussian multiple acces...,Human-generated text,"[consider, user, gaussian, multiple, access, c...",0.720588,3.730491,41.699563,4.611432,0.0,0.014706,0.073529,0.285714,7.205882,19.142857


In [11]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=42, stratify=df['Class'])

df_train.shape, df_test.shape

((35489, 16), (15210, 16))

In [12]:
feature_columns = ['ttr', 'entropy', 'perplexity', 'sentence_var', 'paragraph_var', 'abstractness', 'repetitiveness', 'syntactic_complexity', 'avg_word_length', 'avg_sentence_length']
X_train, X_test = df_train[feature_columns], df_test[feature_columns]

In [13]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Class'])
y_test = label_encoder.transform(df_test['Class'])

joblib.dump(label_encoder, '../models/dataset_1/label_encoder.joblib')
print("Klasy:", label_encoder.classes_)

Klasy: ['ChatGPT-generated text' 'ChatGPT-polish text' 'Human-generated text'
 'Mixed text']


In [14]:
models_and_parameters = {
    'model_name': 'LR',
    'estimator': LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['saga', 'lbfgs']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: LR
Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\weran\PycharmProjects\

Najlepsze parametry dla LR: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Najlepsza dokładność (CV) dla LR: 0.6369
Model LR zapisany jako: ../models/dataset_1\LR_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6313


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
models_and_parameters = {
    'model_name': 'DT',
    'estimator': DecisionTreeClassifier(random_state=42),
    'param_grid': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: DT
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Najlepsze parametry dla DT: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Najlepsza dokładność (CV) dla DT: 0.6434
Model DT zapisany jako: ../models/dataset_1\DT_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6388


In [16]:
models_and_parameters = {
    'model_name': 'RF',
    'estimator': RandomForestClassifier(random_state=42),
    'param_grid': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: RF
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Najlepsze parametry dla RF: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
Najlepsza dokładność (CV) dla RF: 0.6669
Model RF zapisany jako: ../models/dataset_1\RF_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6594


In [17]:
models_and_parameters = {
    'model_name': 'NB',
    'estimator': GaussianNB(),
    'param_grid': {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: NB
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Najlepsze parametry dla NB: {'var_smoothing': 1e-07}
Najlepsza dokładność (CV) dla NB: 0.4409
Model NB zapisany jako: ../models/dataset_1\NB_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.4385


In [18]:
models_and_parameters = {
    'model_name': 'KNN',
    'estimator': KNeighborsClassifier(),
    'param_grid': {
        'n_neighbors': [5, 10, 20],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: KNN
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Najlepsze parametry dla KNN: {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
Najlepsza dokładność (CV) dla KNN: 0.6322
Model KNN zapisany jako: ../models/dataset_1\KNN_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6218


In [21]:
models_and_parameters = {
    'model_name': 'SVM',
    'estimator': SVC(random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'decision_function_shape': ['ovo', 'ovr']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: SVM
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Najlepsze parametry dla SVM: {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}
Najlepsza dokładność (CV) dla SVM: 0.6510
Model SVM zapisany jako: ../models/dataset_1\SVM_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6428


In [20]:
models_and_parameters = {
    'model_name': 'MLP',
        'estimator': MLPClassifier(random_state=42, early_stopping=True, n_iter_no_change=5),
        'param_grid': {
            'hidden_layer_sizes': [(100,), (50,50), (100,50,25)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive', 'invscaling']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_custom_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: MLP
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Najlepsze parametry dla MLP: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50, 25), 'learning_rate': 'constant', 'solver': 'adam'}
Najlepsza dokładność (CV) dla MLP: 0.6604
Model MLP zapisany jako: ../models/dataset_1\MLP_best_model_custom_dataset1.pkl
Dokładność na zbiorze testowym: 0.6538
