In [1]:
import joblib
import numpy as np
import os
import spacy
import pandas as pd

from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
# from xgboost import XGBClassifier

In [2]:
df_init = pd.read_excel('../data/dataset_1/ieee-init.xlsx')
df_init['Class'] = 'Human-generated text'

df_polish = pd.read_excel('../data/dataset_1/ieee-chatgpt-polish.xlsx')
df_polish['Class'] = 'ChatGPT-polish text'

df_generation = pd.read_excel('../data/dataset_1/ieee-chatgpt-generation.xlsx')
df_generation['Class'] = 'ChatGPT-generated text'

df_fusion = pd.read_excel('../data/dataset_1/ieee-chatgpt-fusion.xlsx')
df_fusion['Class'] = 'Mixed text'

df = pd.concat([df_init, df_generation, df_polish, df_fusion], ignore_index=True)
df.head()

Unnamed: 0.2,id,title,keyword,abstract,Class,Unnamed: 0.1,Unnamed: 0,index
0,8600003,An Improved Variable-Node-Based BP Decoding Al...,"""Flash memories"",""Reliability"",""Decoding"",""Par...",To solve the problems of the data reliability ...,Human-generated text,,,
1,8600004,Mobile Robot Location Algorithm Based on Impro...,"""Sociology"",""Statistics"",""Simultaneous localiz...",To solve the simultaneous localization and map...,Human-generated text,,,
2,8600008,Vertical Handoff Decision Algorithm for Hetero...,"""Entropy"",""Handover"",""Wireless networks"",""Deci...",In the future scenario of multiple wireless ne...,Human-generated text,,,
3,8600013,Robust offline trained neural network for TDOA...,"""Microphones"",""Artificial neural networks"",""Po...",Passive sound source localization (SSL) using ...,Human-generated text,,,
4,8600014,Gaussian MAC with Feedback and Strictly Causal...,"""Encoding"",""Transmitters"",""Decoding"",""Indexes""...",We consider a two user Gaussian multiple acces...,Human-generated text,,,


In [3]:
df = df.drop(columns=['index', 'Unnamed: 0', 'Unnamed: 0.1'])
df = df.rename(columns={'abstract': 'Text'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50699 entries, 0 to 50698
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       50699 non-null  int64 
 1   title    50699 non-null  object
 2   keyword  50699 non-null  object
 3   Text     50699 non-null  object
 4   Class    50699 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.9+ MB


In [5]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha or token.is_digit]
    return tokens

In [6]:
df['tokens'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,id,title,keyword,Text,Class,tokens
0,8600003,An Improved Variable-Node-Based BP Decoding Al...,"""Flash memories"",""Reliability"",""Decoding"",""Par...",To solve the problems of the data reliability ...,Human-generated text,"[to, solve, the, problems, of, the, data, reli..."
1,8600004,Mobile Robot Location Algorithm Based on Impro...,"""Sociology"",""Statistics"",""Simultaneous localiz...",To solve the simultaneous localization and map...,Human-generated text,"[to, solve, the, simultaneous, localization, a..."
2,8600008,Vertical Handoff Decision Algorithm for Hetero...,"""Entropy"",""Handover"",""Wireless networks"",""Deci...",In the future scenario of multiple wireless ne...,Human-generated text,"[in, the, future, scenario, of, multiple, wire..."
3,8600013,Robust offline trained neural network for TDOA...,"""Microphones"",""Artificial neural networks"",""Po...",Passive sound source localization (SSL) using ...,Human-generated text,"[passive, sound, source, localization, ssl, us..."
4,8600014,Gaussian MAC with Feedback and Strictly Causal...,"""Encoding"",""Transmitters"",""Decoding"",""Indexes""...",We consider a two user Gaussian multiple acces...,Human-generated text,"[we, consider, a, two, user, gaussian, multipl..."


In [7]:
df.to_csv('../data/dataset_1/preprocessed_df_doc2vec.csv', index=False)

In [8]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=42, stratify=df['Class'])

df_train.shape, df_test.shape

((35489, 6), (15210, 6))

In [9]:
class EpochLogger(CallbackAny2Vec):
    """Callback do śledzenia treningu"""
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print(f"Start epoki {self.epoch}")

    def on_epoch_end(self, model):
        print(f"Koniec epoki {self.epoch}")
        self.epoch += 1


epoch_logger = EpochLogger()

In [10]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_train['tokens'])]

doc2vec_model = Doc2Vec(vector_size=200, window=10, epochs=200, min_count=1, dm=1, workers=4)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs, callbacks=[epoch_logger])

Start epoki 0
Koniec epoki 0
Start epoki 1
Koniec epoki 1
Start epoki 2
Koniec epoki 2
Start epoki 3
Koniec epoki 3
Start epoki 4
Koniec epoki 4
Start epoki 5
Koniec epoki 5
Start epoki 6
Koniec epoki 6
Start epoki 7
Koniec epoki 7
Start epoki 8
Koniec epoki 8
Start epoki 9
Koniec epoki 9
Start epoki 10
Koniec epoki 10
Start epoki 11
Koniec epoki 11
Start epoki 12
Koniec epoki 12
Start epoki 13
Koniec epoki 13
Start epoki 14
Koniec epoki 14
Start epoki 15
Koniec epoki 15
Start epoki 16
Koniec epoki 16
Start epoki 17
Koniec epoki 17
Start epoki 18
Koniec epoki 18
Start epoki 19
Koniec epoki 19
Start epoki 20
Koniec epoki 20
Start epoki 21
Koniec epoki 21
Start epoki 22
Koniec epoki 22
Start epoki 23
Koniec epoki 23
Start epoki 24
Koniec epoki 24
Start epoki 25
Koniec epoki 25
Start epoki 26
Koniec epoki 26
Start epoki 27
Koniec epoki 27
Start epoki 28
Koniec epoki 28
Start epoki 29
Koniec epoki 29
Start epoki 30
Koniec epoki 30
Start epoki 31
Koniec epoki 31
Start epoki 32
Koniec epoki 

In [11]:
doc2vec_model_path = '../models/dataset_1/doc2vec_model_ds1.model'
doc2vec_model.save(doc2vec_model_path)

In [12]:
def get_doc_vector(model, tokens):
    return model.infer_vector(tokens)

def extract_features(df_, model):
    features = df_['tokens'].apply(lambda tokens: get_doc_vector(model, tokens))
    X = np.vstack(features.values)
    return X

In [13]:
X_train = extract_features(df_train, doc2vec_model)
X_test = extract_features(df_test, doc2vec_model)

In [14]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Class'])
y_test = label_encoder.transform(df_test['Class'])

joblib.dump(label_encoder, '../models/dataset_1/label_encoder_d2v.joblib')
print("Klasy:", label_encoder.classes_)

Klasy: ['ChatGPT-generated text' 'ChatGPT-polish text' 'Human-generated text'
 'Mixed text']


In [15]:
models_and_parameters = {
    'model_name': 'LR',
    'estimator': LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['saga', 'lbfgs']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: LR
Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/weronika.najda/PycharmProjects/ml_dataset2/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/weronika.najda/PycharmProjects/ml_dataset2/venv/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/weronika.najda/PycharmProjects/ml_dataset2/venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1193, in f

Najlepsze parametry dla LR: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Najlepsza dokładność (CV) dla LR: 0.7582
Model LR zapisany jako: ../models/dataset_1/LR_best_model_d2v_dataset1.pkl
Dokładność na zbiorze testowym: 0.7491


In [18]:
models_and_parameters = {
    'model_name': 'DT',
    'estimator': DecisionTreeClassifier(random_state=42),
    'param_grid': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: DT
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Najlepsze parametry dla DT: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Najlepsza dokładność (CV) dla DT: 0.4506
Model DT zapisany jako: ../models/dataset_1/DT_best_model_d2v_dataset1.pkl
Dokładność na zbiorze testowym: 0.4426


In [19]:
models_and_parameters = {
    'model_name': 'RF',
    'estimator': RandomForestClassifier(random_state=42),
    'param_grid': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: RF
Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END .....................C=0.1, penalty=l1, solver=saga; total time= 2.3min
[CV] END .......................C=1, penalty=l2, solver=saga; total time= 1.8min
[CV] END ......................C=10, penalty=l2, solver=saga; total time= 1.4min
[CV] END ................................var_smoothing=1e-08; total time=   0.4s
[CV] END ..metric=minkowski, n_neighbors=5, weights=distance; total time=   6.3s
[CV] END ..metric=minkowski, n_neighbors=10, weights=uniform; total time=   5.8s
[CV] END .metric=minkowski, n_neighbors=20, weights=distance; total time=   6.2s
[CV] END ..metric=euclidean, n_neighbors=5, weights=distance; total time=   6.0s
[CV] END .metric=euclidean, n_neighbors=10, weights=distance; total time=   6.1s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time= 1.1min
[CV] END ..metric=manhattan, n_neighbors=10, weights=uniform; total time= 1.0min
[CV] END ..metric=ma



[CV] END .....................C=0.1, penalty=l1, solver=saga; total time= 2.3min
[CV] END .......................C=1, penalty=l2, solver=saga; total time= 1.6min
[CV] END ......................C=10, penalty=l2, solver=saga; total time= 1.3min
[CV] END ................................var_smoothing=1e-09; total time=   0.4s
[CV] END ...metric=minkowski, n_neighbors=5, weights=uniform; total time=   6.4s
[CV] END .metric=minkowski, n_neighbors=10, weights=distance; total time=   5.8s
[CV] END .metric=minkowski, n_neighbors=20, weights=distance; total time=   6.2s
[CV] END ..metric=euclidean, n_neighbors=10, weights=uniform; total time=   6.6s
[CV] END ..metric=euclidean, n_neighbors=20, weights=uniform; total time=   6.8s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time= 1.1min
[CV] END ..metric=manhattan, n_neighbors=20, weights=uniform; total time= 1.0min
[CV] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=  21.2s
[CV] END

In [16]:
models_and_parameters = {
    'model_name': 'NB',
    'estimator': GaussianNB(),
    'param_grid': {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: NB
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Najlepsze parametry dla NB: {'var_smoothing': 1e-09}
Najlepsza dokładność (CV) dla NB: 0.6341
Model NB zapisany jako: ../models/dataset_1/NB_best_model_d2v_dataset1.pkl
Dokładność na zbiorze testowym: 0.5233


In [17]:
models_and_parameters = {
    'model_name': 'KNN',
    'estimator': KNeighborsClassifier(),
    'param_grid': {
        'n_neighbors': [5, 10, 20],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: KNN
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Najlepsze parametry dla KNN: {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'uniform'}
Najlepsza dokładność (CV) dla KNN: 0.3081
Model KNN zapisany jako: ../models/dataset_1/KNN_best_model_d2v_dataset1.pkl
Dokładność na zbiorze testowym: 0.3606


In [None]:
models_and_parameters = {
    'model_name': 'SVM',
    'estimator': SVC(random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'decision_function_shape': ['ovo', 'ovr']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')

In [22]:
models_and_parameters = {
    'model_name': 'MLP',
        'estimator': MLPClassifier(random_state=42, early_stopping=True, n_iter_no_change=5),
        'param_grid': {
            'hidden_layer_sizes': [(100,), (50,50), (100,50,25)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive', 'invscaling']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_1', f'{model_name}_best_model_d2v_dataset1.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: MLP
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Najlepsze parametry dla MLP: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (100, 50, 25), 'learning_rate': 'constant', 'solver': 'adam'}
Najlepsza dokładność (CV) dla MLP: 0.7584
Model MLP zapisany jako: ../models/dataset_1/MLP_best_model_d2v_dataset1.pkl
Dokładność na zbiorze testowym: 0.7210
