In [1]:
import joblib
import numpy as np
import os
import spacy
import pandas as pd

from gensim.models.callbacks import CallbackAny2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

In [2]:
df_csv = pd.read_csv("../data/dataset_2/AIGTxt_DataSet.csv")
df = pd.melt(df_csv, id_vars=['Domain'], value_vars=['Human-generated text', 'ChatGPT-generated text', 'Mixed text'],
                  var_name='Class', value_name='Text')
df.head()

Unnamed: 0,Domain,Class,Text
0,Astrophysics and Astronomy,Human-generated text,Low-thrust electric propulsion offers a higher...
1,Astrophysics and Astronomy,Human-generated text,Relative navigation is one of the key enable t...
2,Astrophysics and Astronomy,Human-generated text,"In recent years, the technology of space tethe..."
3,Astrophysics and Astronomy,Human-generated text,"In celestial mechanics and mathematics, dynami..."
4,Astrophysics and Astronomy,Human-generated text,The study of attitude dynamics within a three-...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Domain  3000 non-null   object
 1   Class   3000 non-null   object
 2   Text    3000 non-null   object
dtypes: object(3)
memory usage: 70.4+ KB


In [4]:
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    doc = nlp(text.lower())
    tokens = [token.text for token in doc if token.is_alpha or token.is_digit]
    return tokens

In [5]:
df['tokens'] = df['Text'].apply(clean_text)
df.head()

Unnamed: 0,Domain,Class,Text,tokens
0,Astrophysics and Astronomy,Human-generated text,Low-thrust electric propulsion offers a higher...,"[low, thrust, electric, propulsion, offers, a,..."
1,Astrophysics and Astronomy,Human-generated text,Relative navigation is one of the key enable t...,"[relative, navigation, is, one, of, the, key, ..."
2,Astrophysics and Astronomy,Human-generated text,"In recent years, the technology of space tethe...","[in, recent, years, the, technology, of, space..."
3,Astrophysics and Astronomy,Human-generated text,"In celestial mechanics and mathematics, dynami...","[in, celestial, mechanics, and, mathematics, d..."
4,Astrophysics and Astronomy,Human-generated text,The study of attitude dynamics within a three-...,"[the, study, of, attitude, dynamics, within, a..."


In [6]:
df.to_csv('../data/dataset_2/preprocessed_df_doc2vec.csv', index=False)

In [7]:
df_train, df_test = train_test_split(df, train_size=0.7, random_state=42, stratify=df['Class'])

df_train.shape, df_test.shape

((2100, 4), (900, 4))

In [8]:
class EpochLogger(CallbackAny2Vec):
    """Callback do śledzenia treningu"""
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print(f"Start epoki {self.epoch}")

    def on_epoch_end(self, model):
        print(f"Koniec epoki {self.epoch}")
        self.epoch += 1


epoch_logger = EpochLogger()

In [9]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_train['tokens'])]

doc2vec_model = Doc2Vec(vector_size=200, window=10, epochs=200, min_count=1, dm=1, workers=4)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(documents, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs, callbacks=[epoch_logger])

Start epoki 0
Koniec epoki 0
Start epoki 1
Koniec epoki 1
Start epoki 2
Koniec epoki 2
Start epoki 3
Koniec epoki 3
Start epoki 4
Koniec epoki 4
Start epoki 5
Koniec epoki 5
Start epoki 6
Koniec epoki 6
Start epoki 7
Koniec epoki 7
Start epoki 8
Koniec epoki 8
Start epoki 9
Koniec epoki 9
Start epoki 10
Koniec epoki 10
Start epoki 11
Koniec epoki 11
Start epoki 12
Koniec epoki 12
Start epoki 13
Koniec epoki 13
Start epoki 14
Koniec epoki 14
Start epoki 15
Koniec epoki 15
Start epoki 16
Koniec epoki 16
Start epoki 17
Koniec epoki 17
Start epoki 18
Koniec epoki 18
Start epoki 19
Koniec epoki 19
Start epoki 20
Koniec epoki 20
Start epoki 21
Koniec epoki 21
Start epoki 22
Koniec epoki 22
Start epoki 23
Koniec epoki 23
Start epoki 24
Koniec epoki 24
Start epoki 25
Koniec epoki 25
Start epoki 26
Koniec epoki 26
Start epoki 27
Koniec epoki 27
Start epoki 28
Koniec epoki 28
Start epoki 29
Koniec epoki 29
Start epoki 30
Koniec epoki 30
Start epoki 31
Koniec epoki 31
Start epoki 32
Koniec epoki 

In [10]:
doc2vec_model_path = '../models/dataset_2/doc2vec_model_ds2.model'
doc2vec_model.save(doc2vec_model_path)

In [11]:
def get_doc_vector(model, tokens):
    return model.infer_vector(tokens)

def extract_features(df_, model):
    features = df_['tokens'].apply(lambda tokens: get_doc_vector(model, tokens))
    X = np.vstack(features.values)
    return X

In [12]:
X_train = extract_features(df_train, doc2vec_model)
X_test = extract_features(df_test, doc2vec_model)

In [13]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Class'])
y_test = label_encoder.transform(df_test['Class'])

joblib.dump(label_encoder, '../models/dataset_2/label_encoder_d2v.joblib')
print("Klasy:", label_encoder.classes_)

Klasy: ['ChatGPT-generated text' 'Human-generated text' 'Mixed text']


In [14]:
models_and_parameters = {
    'model_name': 'LR',
    'estimator': LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['saga', 'lbfgs']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: LR
Fitting 5 folds for each of 12 candidates, totalling 60 fits


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\weran\PycharmProjects\magisterka\venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\weran\PycharmProjects\

Najlepsze parametry dla LR: {'C': 0.1, 'penalty': 'l2', 'solver': 'saga'}
Najlepsza dokładność (CV) dla LR: 0.5748
Model LR zapisany jako: ../models/dataset_2\LR_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.5589


In [15]:
models_and_parameters = {
    'model_name': 'DT',
    'estimator': DecisionTreeClassifier(random_state=42),
    'param_grid': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: DT
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Najlepsze parametry dla DT: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}
Najlepsza dokładność (CV) dla DT: 0.3948
Model DT zapisany jako: ../models/dataset_2\DT_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.4000


In [16]:
models_and_parameters = {
    'model_name': 'RF',
    'estimator': RandomForestClassifier(random_state=42),
    'param_grid': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: RF
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Najlepsze parametry dla RF: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 200}
Najlepsza dokładność (CV) dla RF: 0.4724
Model RF zapisany jako: ../models/dataset_2\RF_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.4978


In [17]:
models_and_parameters = {
    'model_name': 'NB',
    'estimator': GaussianNB(),
    'param_grid': {
    'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: NB
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Najlepsze parametry dla NB: {'var_smoothing': 1e-09}
Najlepsza dokładność (CV) dla NB: 0.5419
Model NB zapisany jako: ../models/dataset_2\NB_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.3700


In [18]:
models_and_parameters = {
    'model_name': 'KNN',
    'estimator': KNeighborsClassifier(),
    'param_grid': {
        'n_neighbors': [5, 10, 20],
        'metric': ['minkowski', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: KNN
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Najlepsze parametry dla KNN: {'metric': 'minkowski', 'n_neighbors': 20, 'weights': 'uniform'}
Najlepsza dokładność (CV) dla KNN: 0.3348
Model KNN zapisany jako: ../models/dataset_2\KNN_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.4233


In [19]:
models_and_parameters = {
    'model_name': 'SVM',
    'estimator': SVC(random_state=42),
    'param_grid': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'decision_function_shape': ['ovo', 'ovr']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: SVM
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Najlepsze parametry dla SVM: {'C': 0.1, 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear'}
Najlepsza dokładność (CV) dla SVM: 0.5652
Model SVM zapisany jako: ../models/dataset_2\SVM_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.5778


In [23]:
models_and_parameters = {
    'model_name': 'XGB',
    'estimator': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'param_grid': {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.5, 0.75, 1.0],
        'colsample_bytree': [0.5, 0.75, 1.0],
        'objective': ['multi:softmax', 'multi:softprob']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')

In [21]:
models_and_parameters = {
    'model_name': 'MLP',
        'estimator': MLPClassifier(random_state=42, early_stopping=True, n_iter_no_change=5),
        'param_grid': {
            'hidden_layer_sizes': [(100,), (50,50), (100,50,25)],
            'activation': ['relu', 'tanh', 'logistic'],
            'solver': ['adam', 'sgd', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01],
            'learning_rate': ['constant', 'adaptive', 'invscaling']
    }
}

model_name = models_and_parameters['model_name']

print(f"\nPrzetwarzanie modelu: {model_name}")
estimator = models_and_parameters['estimator']
param_grid = models_and_parameters['param_grid']

grid = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

print(f"Najlepsze parametry dla {model_name}: {grid.best_params_}")
print(f"Najlepsza dokładność (CV) dla {model_name}: {grid.best_score_:.4f}")

model_filename = os.path.join('../models/dataset_2', f'{model_name}_best_model_d2v_dataset2.pkl')
joblib.dump(grid.best_estimator_, model_filename)
print(f"Model {model_name} zapisany jako: {model_filename}")

best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Dokładność na zbiorze testowym: {accuracy:.4f}')


Przetwarzanie modelu: MLP
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Najlepsze parametry dla MLP: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}
Najlepsza dokładność (CV) dla MLP: 0.6148
Model MLP zapisany jako: ../models/dataset_2\MLP_best_model_d2v_dataset2.pkl
Dokładność na zbiorze testowym: 0.5900
