# **20newsgroups**

Данный код базируется на коде автора: https://github.com/marco-siino/text_preprocessing_impact/blob/main/20N_DS/LR_20N_TextPreProImpact_NB.ipynb. Научная статья разработчика указана в списке использованной литературы и специально дублируется здесь: https://www.sciencedirect.com/science/article/pii/S0306437923001783?ref=cra_js_challenge&fr=RR-1 

Здесь воспроизводится эксперимент с изменениями в процессе загрузки датасетов и некоторыми вытекающими правками в коде с целью получить оптимальные комбинации для моделей и датасетов, обозначенных в работе. 

# Импорт модулей

In [1]:
import matplotlib.pyplot as plt
import ast
import os
import random
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import nltk
import pandas as pd

from numpy.random import seed
import numpy as np
from pathlib import Path
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

from datasets import load_dataset
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import recall_score

from collections import defaultdict

nltk.download('stopwords')
nltk.download('punkt')

os.environ['TF_CUDNN_DETERMINISTIC']='true'
os.environ['TF_DETERMINISTIC_OPS']='true'

import textblob.download_corpora as dl
dl.download_all()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/stepan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/stepan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_

# Загружаем датасет

In [2]:
test_dir = '../data/raw/20newsgroups/test.csv'
train_dir = '../data/raw/20newsgroups/train.csv'

In [3]:
test_df = pd.read_csv(test_dir)
train_df = pd.read_csv(train_dir)

# Создаем обучающую выборку

In [4]:
X_train_text = train_df["text"].astype(str).values
y_train = train_df["label"].values
X_test_text = test_df["text"].astype(str).values
y_test = test_df["label"].values

In [5]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test))

# Функции предобработки текста

In [6]:
# Do-Nothing preprocessing function.
def DON(input_data):
  tag_open_CDATA_removed = tf.strings.regex_replace(input_data, r'<\!\[CDATA\[', ' ')
  tag_closed_CDATA_removed = tf.strings.regex_replace(tag_open_CDATA_removed, r'\]{1,}>', ' ')
  tag_author_lang_en_removed = tf.strings.regex_replace(tag_closed_CDATA_removed,'', ' ')
  tag_closed_author_removed = tf.strings.regex_replace(tag_author_lang_en_removed,'', ' ')
  tag_open_documents_removed = tf.strings.regex_replace(tag_closed_author_removed, r'\n(\t){0,2}', '')
  output_data = tf.strings.regex_replace(tag_open_documents_removed, r'\n(\t){0,2}', ' ')
  return output_data

# Lowercasing preprocessing function.
def LOW(input_data):
  return tf.strings.lower(DON(input_data))

# Removing Stop Words function.
def RSW(input_data):
  output_data = DON(input_data)

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

# Porter Stemmer preprocessing function.
def STM(input_data):
  output_data = DON(input_data)
  stemmer = PorterStemmer()

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

In [7]:
## SECTION WITH PAIRS OF PREPRO FUNCTIONS. APPLICATION ORDER MATTERS (...IN FOLLOWING SECTIONS TOO).
#...5
def LOW_RSW(input_data):
  return RSW(LOW(input_data))

# 6
def LOW_STM(input_data):
  return STM(LOW(input_data))

# 7
def RSW_LOW(input_data):
  return LOW(RSW(input_data))

# 8
def RSW_STM(input_data):
  return STM(RSW(input_data))

# 9
def STM_LOW(input_data):
  return LOW(STM(input_data))

# 10
def STM_RSW(input_data):
  return RSW(STM(input_data))

# 11
def LOW_STM_RSW(input_data):
  return RSW(STM(LOW(input_data)))

# 12
def LOW_RSW_STM(input_data):
  return STM(RSW(LOW(input_data)))

# 13
def STM_LOW_RSW(input_data):
  return RSW(LOW(STM(input_data)))

# 14
def STM_RSW_LOW(input_data):
  return LOW(RSW(STM(input_data)))

# 15
def RSW_LOW_STM(input_data):
  return STM(LOW(RSW(input_data)))

# 16
def RSW_STM_LOW(input_data):
  return LOW(STM(RSW(input_data)))

In [8]:
max_features = 0
def preprocess_and_adapt_ts(preprocessing_function,training_set):
  # Set a large sequence length to find the longest sample in the training set.
  sequence_length = 15000
  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)

  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  model.add(vectorize_layer)

  longest_sample_length=1

  for element in training_set:
    authorDocument=element[0]
    label=element[1]

    author_batch = tf.expand_dims(authorDocument, axis=0)
    out = model(author_batch)
    # Convert token list to numpy array.
    token_list = out.numpy()[0]
    token_list = np.trim_zeros(token_list,'b')
    if longest_sample_length < len(token_list):
      longest_sample_length = len(token_list)

  print("Length of the longest sample is:", longest_sample_length)

  # After tokenization longest_sample_length covers all the document lenghts in our dataset.
  sequence_length = longest_sample_length

  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  # Finally adapt the vectorize layer.
  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)
  global max_features
  max_features=len(vectorize_layer.get_vocabulary()) + 1
  return vectorize_layer

In [9]:
model_results = defaultdict(lambda: defaultdict(list))
prepro_functions_dict_base = {
    'DON':DON,
    'LOW':LOW,
    'RSW':RSW,
    'STM':STM
    }

# 3 prepro functions = 15 combs...+1 for do_nothing

prepro_functions_dict_comb = {
    # 1. Do nothing
    'DON': DON,
    # 2. Lowercasing
    'LOW':LOW,
    # 3. Removing Stopwords
    'RSW':RSW,
    # 4. Porter Stemming
    'STM':STM,
    # 5. LOW->RSW
    'LOW_RSW':LOW_RSW,
    # 6. LOW->STM
    'LOW_STM':LOW_STM,
    # 7. RSW->LOW
    'RSW_LOW':RSW_LOW,
    # 8. RSW->STM
    'RSW_STM':RSW_STM,
    # 9. STM->LOW
    'STM_LOW':STM_LOW,
    # 10. STM->RSW
    'STM_RSW':STM_RSW,
    # 11. LOW->STM->RSW
    'LOW_STM_RSW':LOW_STM_RSW,
    # 12. LOW->RSW->STM
    'LOW_RSW_STM':LOW_RSW_STM,
    # 13. STM->LOW->RSW
    'STM_LOW_RSW':STM_LOW_RSW,
    # 14. STM->RSW->LOW
    'STM_RSW_LOW':STM_RSW_LOW,
    # 15. RSW->LOW->STM
    'RSW_LOW_STM':RSW_LOW_STM,
    # 16. RSW->STM->LOW
    'RSW_STM_LOW':RSW_STM_LOW
}

In [10]:
for key in prepro_functions_dict_comb:
    print("\n\n* * * * EVALUATION USING", key, "AS PREPROCESSING FUNCTION * * * *")

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test)).shuffle(buffer_size=len(test_df), seed=1, reshuffle_each_iteration=False).batch(1).take(50)

    # Preprocess training set to build a dictionary.
    vectorize_layer = preprocess_and_adapt_ts(prepro_functions_dict_comb[key],train_ds)

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    print("\n\n***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******")
    # Print a raw and a preprocessed sample.
    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      print("Sample considered is: ", author_batch)
      print("Preprocessed: ", str(prepro_functions_dict_comb[key](author_batch.numpy())))
      break

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    training_labels=[]
    training_samples=[]

    max_features=len(vectorize_layer.get_vocabulary()) + 1

    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)
        
      text_vect_out = vectorize_layer(author_batch)

      training_labels.append(label.numpy()[0].decode("utf-8"))
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      training_samples.append(current_sample)

    training_labels=np.array(training_labels)
    training_samples=np.array(training_samples)

    test_labels=[]
    test_samples=[]

    for element in test_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      text_vect_out = vectorize_layer(author_batch)

      test_labels.append(label.numpy()[0].decode("utf-8"))
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      test_samples.append(current_sample)

    test_labels=np.array(test_labels)
    test_samples=np.array(test_samples)

    models = {
        "NB": MultinomialNB(),
        "SVM": LinearSVC(random_state=0, max_iter=5000),
        "LR": LogisticRegression(random_state=0, max_iter=5000),
        "AdaBoost": AdaBoostClassifier(random_state=0),
        "XGBoost": XGBClassifier(random_state=0),
        "RandomForest": RandomForestClassifier(random_state=0),
        "DecisionTree": DecisionTreeClassifier(random_state=0)
    }

    label_encoder = LabelEncoder()
    training_labels_enc = label_encoder.fit_transform(training_labels)
    test_labels_enc = label_encoder.transform(test_labels)

    for name, model in models.items():
        model.fit(training_samples, training_labels_enc)
        preds = model.predict(test_samples)
        
        acc = model.score(test_samples, test_labels_enc)
        recall = recall_score(test_labels_enc, preds, average='macro')
        
        model_results[key][name].append(acc)
        
        print(f"{name} Accuracy on Test set ->", acc)
        print(f"{name} Recall on Test set ->", recall)
    



* * * * EVALUATION USING DON AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:17:27.184055: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-05 19:17:28.675224: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 3038


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:incoming/geode08.gif\n:   ics.uci.edu:incoming/geode09.gif\n:   ics.uci.edu:incoming/geode10.gif\n:   ics.uci.edu:incoming/geode11.gif\n:   ics.uci.edu:incoming/geode12.gif\n:   ics.uci.edu:incoming/geode13.gif\n:   ics.uci.edu:incoming/geode14.gif\n:   ics.uci.edu:i

2025-05-05 19:17:29.109952: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.14
NB Recall on Test set -> 0.13666666666666666
SVM Accuracy on Test set -> 0.3
SVM Recall on Test set -> 0.2508333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.24
LR Recall on Test set -> 0.22456140350877193
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.07407407407407407


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy on Test set -> 0.22
XGBoost Recall on Test set -> 0.12083333333333335
RandomForest Accuracy on Test set -> 0.28
RandomForest Recall on Test set -> 0.2263157894736842
DecisionTree Accuracy on Test set -> 0.12
DecisionTree Recall on Test set -> 0.07999999999999999


* * * * EVALUATION USING LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 3038


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.ed

2025-05-05 19:17:33.564263: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.08
NB Recall on Test set -> 0.095
SVM Accuracy on Test set -> 0.22
SVM Recall on Test set -> 0.19649122807017544


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.14
LR Recall on Test set -> 0.135
AdaBoost Accuracy on Test set -> 0.14
AdaBoost Recall on Test set -> 0.14259259259259258


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.12333333333333334
RandomForest Accuracy on Test set -> 0.18
RandomForest Recall on Test set -> 0.1789473684210526
DecisionTree Accuracy on Test set -> 0.18
DecisionTree Recall on Test set -> 0.18833333333333332


* * * * EVALUATION USING RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1982


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.ed

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.2
NB Recall on Test set -> 0.21583333333333332
SVM Accuracy on Test set -> 0.2
SVM Recall on Test set -> 0.1475


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.2
LR Recall on Test set -> 0.1725
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.07407407407407407
XGBoost Accuracy on Test set -> 0.22
XGBoost Recall on Test set -> 0.1408333333333333


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.24
RandomForest Recall on Test set -> 0.18166666666666667
DecisionTree Accuracy on Test set -> 0.1
DecisionTree Recall on Test set -> 0.07368421052631578


* * * * EVALUATION USING STM AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:17:59.130167: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 2419


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:incoming/geode08.gif\n:   ics.uci.edu:incoming/geode09.gif\n:   ics.uci.edu:incoming/geode10.gif\n:   ics.uci.edu:incoming/geode11.gif\n:   ics.uci.edu:incoming/geode12.gif\n:   ics.uci.edu:incoming/geode13.gif\n:   ics.uci.edu:incoming/geode14.gif\n:   ics.uci.edu:i

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.08
NB Recall on Test set -> 0.09824561403508772
SVM Accuracy on Test set -> 0.14
SVM Recall on Test set -> 0.10166666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.12
AdaBoost Accuracy on Test set -> 0.08
AdaBoost Recall on Test set -> 0.05925925925925926
XGBoost Accuracy on Test set -> 0.16
XGBoost Recall on Test set -> 0.12


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.18
RandomForest Recall on Test set -> 0.12166666666666666
DecisionTree Accuracy on Test set -> 0.18
DecisionTree Recall on Test set -> 0.16333333333333333


* * * * EVALUATION USING LOW_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:inco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666
RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING LOW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 2419


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.u

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.08
NB Recall on Test set -> 0.09824561403508772
SVM Accuracy on Test set -> 0.14
SVM Recall on Test set -> 0.10166666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.12
AdaBoost Accuracy on Test set -> 0.08
AdaBoost Recall on Test set -> 0.05925925925925926
XGBoost Accuracy on Test set -> 0.16
XGBoost Recall on Test set -> 0.12


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.18
RandomForest Recall on Test set -> 0.12166666666666666
DecisionTree Accuracy on Test set -> 0.18
DecisionTree Recall on Test set -> 0.16333333333333333


* * * * EVALUATION USING RSW_LOW AS PREPROCESSING FUNCTION * * * *


2025-05-05 19:18:43.535671: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 1982


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:incoming/geode08.gif\n:   ics.uci.edu:incoming/geode09.gif\n:   ics.uci.edu:incoming/geode10.gif\n:   ics.uci.edu:incoming/geode11.gif\n:   ics.uci.edu:incoming/geode12.gif\n:   ics.uci.edu:incoming/geode13.gif\n:   ics.uci.edu:incoming/geode14.gif\n:   ics.uci.edu:i

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.14
NB Recall on Test set -> 0.095
SVM Accuracy on Test set -> 0.1
SVM Recall on Test set -> 0.09649122807017543


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.22
LR Recall on Test set -> 0.19912280701754384
AdaBoost Accuracy on Test set -> 0.1
AdaBoost Recall on Test set -> 0.0807017543859649
XGBoost Accuracy on Test set -> 0.12
XGBoost Recall on Test set -> 0.08684210526315789


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.1807017543859649
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.07999999999999999


* * * * EVALUATION USING RSW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1982


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:incom

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.14
NB Recall on Test set -> 0.095
SVM Accuracy on Test set -> 0.1
SVM Recall on Test set -> 0.09649122807017543


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.22
LR Recall on Test set -> 0.19912280701754384
AdaBoost Accuracy on Test set -> 0.1
AdaBoost Recall on Test set -> 0.0807017543859649
XGBoost Accuracy on Test set -> 0.12
XGBoost Recall on Test set -> 0.08684210526315789


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.1807017543859649
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.07999999999999999


* * * * EVALUATION USING STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 2419


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:incom

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.08
NB Recall on Test set -> 0.09824561403508772
SVM Accuracy on Test set -> 0.14
SVM Recall on Test set -> 0.10166666666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.12
AdaBoost Accuracy on Test set -> 0.08
AdaBoost Recall on Test set -> 0.05925925925925926
XGBoost Accuracy on Test set -> 0.16
XGBoost Recall on Test set -> 0.12


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.18
RandomForest Recall on Test set -> 0.12166666666666666
DecisionTree Accuracy on Test set -> 0.18
DecisionTree Recall on Test set -> 0.16333333333333333


* * * * EVALUATION USING STM_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:inco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666
RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING LOW_STM_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   i

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06
XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING LOW_RSW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06
XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING STM_LOW_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:

2025-05-05 19:21:00.483808: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06
XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING STM_RSW_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1731


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.18
NB Recall on Test set -> 0.15499999999999997
SVM Accuracy on Test set -> 0.18
SVM Recall on Test set -> 0.1280701754385965


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.18
LR Recall on Test set -> 0.13508771929824562
AdaBoost Accuracy on Test set -> 0.04
AdaBoost Recall on Test set -> 0.06
XGBoost Accuracy on Test set -> 0.14
XGBoost Recall on Test set -> 0.08666666666666666


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.16315789473684209
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.12833333333333333


* * * * EVALUATION USING RSW_LOW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1982


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ics.uci.edu:incoming/geode06.gif\n:   ics.uci.edu:incoming/geode07.gif\n:   ics.uci.edu:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.14
NB Recall on Test set -> 0.095
SVM Accuracy on Test set -> 0.1
SVM Recall on Test set -> 0.09649122807017543


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.22
LR Recall on Test set -> 0.19912280701754384
AdaBoost Accuracy on Test set -> 0.1
AdaBoost Recall on Test set -> 0.0807017543859649


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost Accuracy on Test set -> 0.12
XGBoost Recall on Test set -> 0.08684210526315789
RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.1807017543859649
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.07999999999999999


* * * * EVALUATION USING RSW_STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 1982


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b': : I have 19 (2 MB worth!) uuencode\'d GIF images contain charts outlining\n: : one of the many alternative Space Station designs being considered in\n: : Crystal City.  [...]\n\n: I just posted the GIF files out for anonymous FTP on server ics.uci.edu.\n: You can retrieve them from:\n:   ics.uci.edu:incoming/geode01.gif\n:   ics.uci.edu:incoming/geode02.gif\n:   ics.uci.edu:incoming/geode03.gif\n:   ics.uci.edu:incoming/geode04.gif\n:   ics.uci.edu:incoming/geode05.gif\n:   ic

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


NB Accuracy on Test set -> 0.14
NB Recall on Test set -> 0.095
SVM Accuracy on Test set -> 0.1
SVM Recall on Test set -> 0.09649122807017543


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LR Accuracy on Test set -> 0.22
LR Recall on Test set -> 0.19912280701754384
AdaBoost Accuracy on Test set -> 0.1
AdaBoost Recall on Test set -> 0.0807017543859649
XGBoost Accuracy on Test set -> 0.12
XGBoost Recall on Test set -> 0.08684210526315789
RandomForest Accuracy on Test set -> 0.22
RandomForest Recall on Test set -> 0.1807017543859649
DecisionTree Accuracy on Test set -> 0.14
DecisionTree Recall on Test set -> 0.07999999999999999


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Результаты

In [11]:
best_result_per_model = {}

print("PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   ")
print("-" * 60)

for prepro_func in prepro_functions_dict_comb:
    for model_name, scores in model_results[prepro_func].items():
        acc = round(scores[-1], 4)
        if model_name not in best_result_per_model or acc > best_result_per_model[model_name][1]:
            best_result_per_model[model_name] = (prepro_func, acc)
        print(f"{prepro_func:23} | {model_name:17} | {acc:>6.4f}")
    print("-" * 60)

PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   
------------------------------------------------------------
DON                     | NB                | 0.1400
DON                     | SVM               | 0.3000
DON                     | LR                | 0.2400
DON                     | AdaBoost          | 0.0400
DON                     | XGBoost           | 0.2200
DON                     | RandomForest      | 0.2800
DON                     | DecisionTree      | 0.1200
------------------------------------------------------------
LOW                     | NB                | 0.0800
LOW                     | SVM               | 0.2200
LOW                     | LR                | 0.1400
LOW                     | AdaBoost          | 0.1400
LOW                     | XGBoost           | 0.1400
LOW                     | RandomForest      | 0.1800
LOW                     | DecisionTree      | 0.1800
------------------------------------------------------------
RSW             

In [12]:
print("MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY")
print("-" * 60)
for model_name, (best_func, acc) in best_result_per_model.items():
    print(f"{model_name:18} | {best_func:22} | {acc:.4f}")

MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY
------------------------------------------------------------
NB                 | RSW                    | 0.2000
SVM                | DON                    | 0.3000
LR                 | DON                    | 0.2400
AdaBoost           | LOW                    | 0.1400
XGBoost            | DON                    | 0.2200
RandomForest       | DON                    | 0.2800
DecisionTree       | LOW                    | 0.1800


# Сохраним результаты в отдельную таблицу

In [13]:
prepo_path = "../reports/preprocessing_combinations/20newsgroups/"
os.makedirs(prepo_path, exist_ok=True)

In [14]:
rows = []

for prepo_func, model_scores in model_results.items():
    for model_name, acc in model_scores.items():
        rows.append({
            "prepo_func": prepo_func,
            "model": model_name,
            "accuracy": round(acc[-1], 4)
        })

df_full = pd.DataFrame(rows)
df_full.to_csv(f"{prepo_path}full.csv", index=False)

In [15]:
df_best = pd.DataFrame([
    {"model": model, "best_prepo_func": func, "max_accuracy": acc}
    for model, (func, acc) in best_result_per_model.items()
])
df_best.to_csv(f"{prepo_path}best.csv", index=False)