# **AG_News**

Данный код базируется на коде автора: https://github.com/marco-siino/text_preprocessing_impact/blob/main/20N_DS/LR_20N_TextPreProImpact_NB.ipynb. Научная статья разработчика указана в списке использованной литературы и специально дублируется здесь: https://www.sciencedirect.com/science/article/pii/S0306437923001783?ref=cra_js_challenge&fr=RR-1

Здесь воспроизводится эксперимент с изменениями в процессе загрузки датасетов и некоторыми вытекающими правками в коде с целью получить оптимальные комбинации для моделей и датасетов, обозначенных в работе.

# Импорт модулей

In [1]:
import matplotlib.pyplot as plt
import ast
import os
import random
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import nltk
import pandas as pd

from numpy.random import seed
import numpy as np
from pathlib import Path
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

from datasets import load_dataset
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import recall_score

from collections import defaultdict

nltk.download('stopwords')
nltk.download('punkt')

os.environ['TF_CUDNN_DETERMINISTIC']='true'
os.environ['TF_DETERMINISTIC_OPS']='true'

import textblob.download_corpora as dl
dl.download_all()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/stepan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/stepan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_

# Загружаем датасет

In [2]:
test_dir = '../data/raw/ag_news/test.csv'
train_dir = '../data/raw/ag_news/train.csv'

In [3]:
test_df = pd.read_csv(test_dir)
train_df = pd.read_csv(train_dir)

# Создаем обучающую выборку

In [4]:
X_train_text = train_df["text"].astype(str).values
y_train = train_df["label"].values
X_test_text = test_df["text"].astype(str).values
y_test = test_df["label"].values

In [5]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test))

# Функции предобработки текста

In [6]:
# Do-Nothing preprocessing function.
def DON(input_data):
  tag_open_CDATA_removed = tf.strings.regex_replace(input_data, r'<\!\[CDATA\[', ' ')
  tag_closed_CDATA_removed = tf.strings.regex_replace(tag_open_CDATA_removed, r'\]{1,}>', ' ')
  tag_author_lang_en_removed = tf.strings.regex_replace(tag_closed_CDATA_removed,'', ' ')
  tag_closed_author_removed = tf.strings.regex_replace(tag_author_lang_en_removed,'', ' ')
  tag_open_documents_removed = tf.strings.regex_replace(tag_closed_author_removed, r'\n(\t){0,2}', '')
  output_data = tf.strings.regex_replace(tag_open_documents_removed, r'\n(\t){0,2}', ' ')
  return output_data

# Lowercasing preprocessing function.
def LOW(input_data):
  return tf.strings.lower(DON(input_data))

# Removing Stop Words function.
def RSW(input_data):
  output_data = DON(input_data)

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

# Porter Stemmer preprocessing function.
def STM(input_data):
  output_data = DON(input_data)
  stemmer = PorterStemmer()

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

In [7]:
## SECTION WITH PAIRS OF PREPRO FUNCTIONS. APPLICATION ORDER MATTERS (...IN FOLLOWING SECTIONS TOO).
#...5
def LOW_RSW(input_data):
  return RSW(LOW(input_data))

# 6
def LOW_STM(input_data):
  return STM(LOW(input_data))

# 7
def RSW_LOW(input_data):
  return LOW(RSW(input_data))

# 8
def RSW_STM(input_data):
  return STM(RSW(input_data))

# 9
def STM_LOW(input_data):
  return LOW(STM(input_data))

# 10
def STM_RSW(input_data):
  return RSW(STM(input_data))

# 11
def LOW_STM_RSW(input_data):
  return RSW(STM(LOW(input_data)))

# 12
def LOW_RSW_STM(input_data):
  return STM(RSW(LOW(input_data)))

# 13
def STM_LOW_RSW(input_data):
  return RSW(LOW(STM(input_data)))

# 14
def STM_RSW_LOW(input_data):
  return LOW(RSW(STM(input_data)))

# 15
def RSW_LOW_STM(input_data):
  return STM(LOW(RSW(input_data)))

# 16
def RSW_STM_LOW(input_data):
  return LOW(STM(RSW(input_data)))

In [8]:
max_features = 0
def preprocess_and_adapt_ts(preprocessing_function,training_set):
  # Set a large sequence length to find the longest sample in the training set.
  sequence_length = 15000
  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)

  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  model.add(vectorize_layer)

  longest_sample_length=1

  for element in training_set:
    authorDocument=element[0]
    label=element[1]

    author_batch = tf.expand_dims(authorDocument, axis=0)
    out = model(author_batch)
    # Convert token list to numpy array.
    token_list = out.numpy()[0]
    token_list = np.trim_zeros(token_list,'b')
    if longest_sample_length < len(token_list):
      longest_sample_length = len(token_list)

  print("Length of the longest sample is:", longest_sample_length)

  # After tokenization longest_sample_length covers all the document lenghts in our dataset.
  sequence_length = longest_sample_length

  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  # Finally adapt the vectorize layer.
  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)
  global max_features
  max_features=len(vectorize_layer.get_vocabulary()) + 1
  return vectorize_layer

In [9]:
model_results = defaultdict(lambda: defaultdict(list))
prepro_functions_dict_base = {
    'DON':DON,
    'LOW':LOW,
    'RSW':RSW,
    'STM':STM
    }

# 3 prepro functions = 15 combs...+1 for do_nothing

prepro_functions_dict_comb = {
    # 1. Do nothing
    'DON': DON,
    # 2. Lowercasing
    'LOW':LOW,
    # 3. Removing Stopwords
    'RSW':RSW,
    # 4. Porter Stemming
    'STM':STM,
    # 5. LOW->RSW
    'LOW_RSW':LOW_RSW,
    # 6. LOW->STM
    'LOW_STM':LOW_STM,
    # 7. RSW->LOW
    'RSW_LOW':RSW_LOW,
    # 8. RSW->STM
    'RSW_STM':RSW_STM,
    # 9. STM->LOW
    'STM_LOW':STM_LOW,
    # 10. STM->RSW
    'STM_RSW':STM_RSW,
    # 11. LOW->STM->RSW
    'LOW_STM_RSW':LOW_STM_RSW,
    # 12. LOW->RSW->STM
    'LOW_RSW_STM':LOW_RSW_STM,
    # 13. STM->LOW->RSW
    'STM_LOW_RSW':STM_LOW_RSW,
    # 14. STM->RSW->LOW
    'STM_RSW_LOW':STM_RSW_LOW,
    # 15. RSW->LOW->STM
    'RSW_LOW_STM':RSW_LOW_STM,
    # 16. RSW->STM->LOW
    'RSW_STM_LOW':RSW_STM_LOW
}

In [10]:
for key in prepro_functions_dict_comb:
    print("\n\n* * * * EVALUATION USING", key, "AS PREPROCESSING FUNCTION * * * *")

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test)).shuffle(buffer_size=len(test_df), seed=1, reshuffle_each_iteration=False).batch(1).take(50)

    # Preprocess training set to build a dictionary.
    vectorize_layer = preprocess_and_adapt_ts(prepro_functions_dict_comb[key],train_ds)
    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    print("\n\n***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******")
    # Print a raw and a preprocessed sample.
    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      print("Sample considered is: ", author_batch)
      print("Preprocessed: ", str(prepro_functions_dict_comb[key](author_batch.numpy())))
      break

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    training_labels=[]
    training_samples=[]

    max_features=len(vectorize_layer.get_vocabulary()) + 1

    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)
        
      text_vect_out = vectorize_layer(author_batch)

      training_labels.append(label.numpy())
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      training_samples.append(current_sample)

    training_labels=np.array(training_labels)
    training_samples=np.array(training_samples)

    test_labels=[]
    test_samples=[]

    for element in test_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      text_vect_out = vectorize_layer(author_batch)

      test_labels.append(label.numpy())
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      test_samples.append(current_sample)

    test_labels=np.array(test_labels)
    test_samples=np.array(test_samples)

    models = {
        "NB": MultinomialNB(),
        "SVM": LinearSVC(random_state=0, max_iter=5000),
        "LR": LogisticRegression(random_state=0, max_iter=5000),
        "AdaBoost": AdaBoostClassifier(random_state=0),
        "XGBoost": XGBClassifier(random_state=0),
        "RandomForest": RandomForestClassifier(random_state=0),
        "DecisionTree": DecisionTreeClassifier(random_state=0)
    }

    for name, model in models.items():
        model.fit(training_samples, training_labels.ravel())
        acc = model.score(test_samples, test_labels)
        model_results[key][name].append(acc)
        print(f"{name} Accuracy on Test set ->", acc)



* * * * EVALUATION USING DON AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:13:51.477912: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-05 19:13:53.004250: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 651


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   G   e   r   m   a   n       t   e   e   n       w   h   o       m   a   d   e       S   a   s   s   e   r       w   o   r   m       h   i   r   e   d       b   y       c   o   m   p   u   t   e   r       s   e   c   u   r   i   t   y       f   i   r   m       (   A   F   P   )       A   F   P       -       A       G   e   r   m   a   n       t   e   e   n   a   g   e   r       a   c   c   u   s   e   d       o   f       c   r   e   a   t   i   n   g       t   h   e       S   a   s   s   e   r 

2025-05-05 19:13:53.597390: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


NB Accuracy on Test set -> 0.44
SVM Accuracy on Test set -> 0.42
LR Accuracy on Test set -> 0.48
AdaBoost Accuracy on Test set -> 0.36
XGBoost Accuracy on Test set -> 0.38
RandomForest Accuracy on Test set -> 0.38
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING LOW AS PREPROCESSING FUNCTION * * * *




Length of the longest sample is: 651


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   g   e   r   m   a   n       t   e   e   n       w   h   o       m   a   d   e       s   a   s   s   e   r       w   o   r   m       h   i   r   e   d       b   y       c   o   m   p   u   t   e   r       s   e   c   u   r   i   t   y       f   i   r   m       (   a   f   p   )       a   f   p       -       a       g   e   r   m   a   n       t   e   e   n   a   g   e   r       a   c   c   u   s   e   d       o   f       c   r   e   a   t   i   n   g       t   h   e       s   a   s   s   e   r 

2025-05-05 19:13:55.886487: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


NB Accuracy on Test set -> 0.36
SVM Accuracy on Test set -> 0.44
LR Accuracy on Test set -> 0.46
AdaBoost Accuracy on Test set -> 0.32
XGBoost Accuracy on Test set -> 0.38
RandomForest Accuracy on Test set -> 0.36
DecisionTree Accuracy on Test set -> 0.32


* * * * EVALUATION USING RSW AS PREPROCESSING FUNCTION * * * *




Length of the longest sample is: 346


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'G e r n e e n w h e S e r w r h r e b c p u e r e c u r f r A F P A F P A G e r n e e n g e r c c u e f c r e n g h e S e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.42
SVM Accuracy on Test set -> 0.42
LR Accuracy on Test set -> 0.46
AdaBoost Accuracy on Test set -> 0.32




XGBoost Accuracy on Test set -> 0.46
RandomForest Accuracy on Test set -> 0.44
DecisionTree Accuracy on Test set -> 0.38


* * * * EVALUATION USING STM AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:14:04.273095: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 634


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r m a n t e e n w h o m a d e s a s s e r w o r m h i r e d b y c o m p u t e r s e c u r i t y f i r m a f p a f p a g e r m a n t e e n a g e r a c c u s e d o f c r e a t i n g t h e s a s s e r w o r m t h a t i n f e c t e d m i l l i o n s o f c o m p u t e r s a r o u n d t h e w o r l d i s b e i n g t a u g h t t o b e c o m e a s e c u r i t y s o f t w a r e p r o g r a m m e r t h e c o m p a n y t h a t h i r e d h i m s a i d']], shape=(1, 1), dtype=string)
NB Accuracy on Test s



XGBoost Accuracy on Test set -> 0.34
RandomForest Accuracy on Test set -> 0.32
DecisionTree Accuracy on Test set -> 0.38


* * * * EVALUATION USING LOW_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e e r w r h r e b c p u e r e c u r f r f p f p g e r n e e n g e r c c u e f c r e n g h e e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.3
SVM Accuracy o



Length of the longest sample is: 634


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r m a n t e e n w h o m a d e s a s s e r w o r m h i r e d b y c o m p u t e r s e c u r i t y f i r m a f p a f p a g e r m a n t e e n a g e r a c c u s e d o f c r e a t i n g t h e s a s s e r w o r m t h a t i n f e c t e d m i l l i o n s o f c o m p u t e r s a r o u n d t h e w o r l d i s b e i n g t a u g h t t o b e c o m e a s e c u r i t y s o f t w a r e p r o g r a m m e r t h e c o m p a n y t h a t h i r e d h i m s a i d']], shape=(1, 1), dtype=string)
NB Accuracy on Test s



XGBoost Accuracy on Test set -> 0.34
RandomForest Accuracy on Test set -> 0.32
DecisionTree Accuracy on Test set -> 0.38


* * * * EVALUATION USING RSW_LOW AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:14:22.356990: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 346


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   g       e       r       n       e       e       n       w       h       e       s       e       r       w       r       h       r       e       b       c       p       u       e       r       e       c       u       r       f       r       a       f       p       a       f       p       a       g       e       r       n       e       e       n       g       e       r       c       c       u       e       f       c       r       e       n       g       h       e       s       e       r       w 



RandomForest Accuracy on Test set -> 0.44
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING RSW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 346


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e s e r w r h r e b c p u e r e c u r f r a f p a f p a g e r n e e n g e r c c u e f c r e n g h e s e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.34
SVM Accuracy on Test set -> 0.4
LR Accur



RandomForest Accuracy on Test set -> 0.44
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 634


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   g       e       r       m       a       n       t       e       e       n       w       h       o       m       a       d       e       s       a       s       s       e       r       w       o       r       m       h       i       r       e       d       b       y       c       o       m       p       u       t       e       r       s       e



XGBoost Accuracy on Test set -> 0.34
RandomForest Accuracy on Test set -> 0.32
DecisionTree Accuracy on Test set -> 0.38


* * * * EVALUATION USING STM_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e e r w r h r e b c p u e r e c u r f r f p f p g e r n e e n g e r c c u e f c r e n g h e e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.3
SVM Accuracy o



Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e e r w r h r e b c p u e r e c u r f r f p f p g e r n e e n g e r c c u e f c r e n g h e e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.3
SVM Accuracy on Test set -> 0.38
LR Accuracy on Test set -> 0.4
AdaBoost Accuracy on Test set -> 0.42
XGBoost Accuracy on Test set -> 0.36
RandomForest Accuracy on Test set -> 0.4
DecisionTree Accuracy on



Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e e r w r h r e b c p u e r e c u r f r f p f p g e r n e e n g e r c c u e f c r e n g h e e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.3
SVM Accuracy on Test set -> 0.38
LR Accuracy on Test set -> 0.4
AdaBoost Accuracy on Test set -> 0.42
XGBoost Accuracy on Test set -> 0.36
RandomForest Accuracy on Test set -> 0.4
DecisionTree Accuracy on



Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e e r w r h r e b c p u e r e c u r f r f p f p g e r n e e n g e r c c u e f c r e n g h e e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)


2025-05-05 19:15:08.625658: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


NB Accuracy on Test set -> 0.3
SVM Accuracy on Test set -> 0.38
LR Accuracy on Test set -> 0.4
AdaBoost Accuracy on Test set -> 0.42
XGBoost Accuracy on Test set -> 0.36




RandomForest Accuracy on Test set -> 0.4
DecisionTree Accuracy on Test set -> 0.22


* * * * EVALUATION USING STM_RSW_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 327


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   g       e       r       n       e       e       n       w       h       e       e       r       w       r       h       r       e       b       c       p       u       e       r       e       c       u       r       f       r       f       p       f       p       g       e       r       n       e       e       n       g       e       r     



Length of the longest sample is: 346


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'g e r n e e n w h e s e r w r h r e b c p u e r e c u r f r a f p a f p a g e r n e e n g e r c c u e f c r e n g h e s e r w r h n f e c e l l n f c p u e r r u n h e w r l b e n g u g h b e c e e c u r f w r e p r g r e r h e c p n h h r e h']], shape=(1, 1), dtype=string)
NB Accuracy on Test set -> 0.34
SVM Accuracy on Test set -> 0.4
LR Accuracy on Test set -> 0.44
AdaBoost Accuracy on Test set -> 0.44
XGBoost Accuracy on Test set -> 0.44




RandomForest Accuracy on Test set -> 0.44
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING RSW_STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 346


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b'German teen who made Sasser worm hired by computer security firm (AFP) AFP - A German teenager accused of creating the Sasser worm that infected millions of computers around the world is being taught to become a security software programmer, the company that hired him said.']], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   g       e       r       n       e       e       n       w       h       e       s       e       r       w       r       h       r       e       b       c       p       u       e       r       e       c       u       r       f       r       a       f       p       a       f       p       a       g       e       r       n       e       e    



# Результаты

In [11]:
best_result_per_model = {}

print("PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   ")
print("-" * 60)

for prepro_func in prepro_functions_dict_comb:
    for model_name, scores in model_results[prepro_func].items():
        acc = round(scores[-1], 4)
        if model_name not in best_result_per_model or acc > best_result_per_model[model_name][1]:
            best_result_per_model[model_name] = (prepro_func, acc)
        print(f"{prepro_func:23} | {model_name:17} | {acc:>6.4f}")
    print("-" * 60)

PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   
------------------------------------------------------------
DON                     | NB                | 0.4400
DON                     | SVM               | 0.4200
DON                     | LR                | 0.4800
DON                     | AdaBoost          | 0.3600
DON                     | XGBoost           | 0.3800
DON                     | RandomForest      | 0.3800
DON                     | DecisionTree      | 0.2600
------------------------------------------------------------
LOW                     | NB                | 0.3600
LOW                     | SVM               | 0.4400
LOW                     | LR                | 0.4600
LOW                     | AdaBoost          | 0.3200
LOW                     | XGBoost           | 0.3800
LOW                     | RandomForest      | 0.3600
LOW                     | DecisionTree      | 0.3200
------------------------------------------------------------
RSW             

In [12]:
print("MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY")
print("-" * 60)
for model_name, (best_func, acc) in best_result_per_model.items():
    print(f"{model_name:18} | {best_func:22} | {acc:.4f}")

MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY
------------------------------------------------------------
NB                 | DON                    | 0.4400
SVM                | LOW                    | 0.4400
LR                 | DON                    | 0.4800
AdaBoost           | RSW_LOW                | 0.4400
XGBoost            | RSW                    | 0.4600
RandomForest       | RSW                    | 0.4400
DecisionTree       | RSW                    | 0.3800


# Сохраним результаты в отдельную таблицу

In [13]:
prepo_path = "../reports/preprocessing_combinations/ag_news/"
os.makedirs(prepo_path, exist_ok=True)

In [14]:
rows = []

for prepo_func, model_scores in model_results.items():
    for model_name, acc in model_scores.items():
        rows.append({
            "prepo_func": prepo_func,
            "model": model_name,
            "accuracy": round(acc[-1], 4)
        })

df_full = pd.DataFrame(rows)
df_full.to_csv(f"{prepo_path}full.csv", index=False)

In [15]:
df_best = pd.DataFrame([
    {"model": model, "best_prepo_func": func, "max_accuracy": acc}
    for model, (func, acc) in best_result_per_model.items()
])
df_best.to_csv(f"{prepo_path}best.csv", index=False)