# **TweetEval: Sentiment**

Данный код базируется на коде автора: https://github.com/marco-siino/text_preprocessing_impact/blob/main/20N_DS/LR_20N_TextPreProImpact_NB.ipynb. Научная статья разработчика указана в списке использованной литературы и специально дублируется здесь: https://www.sciencedirect.com/science/article/pii/S0306437923001783?ref=cra_js_challenge&fr=RR-1 

В курсовой работе уже используются предложенные комбинации методов предобработки для Логистической регрессии, Наивного Байесовского классификатора и Метода опорных векторов. Для отсутствующих моделей: XGBoost, AdaBoost, RandomForest, DecisionTree - эксперимент воспроизводится с изменениями в процессе загрузки датасетов и некоторыми вытекающими правками в коде. Использование оригинального кода позволяет получить недостающие данные и сохранить полноту курсовой работы. 

# Импорт модулей

In [1]:
import matplotlib.pyplot as plt
import os
import random
import re
import shutil
import string
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
import nltk
import pandas as pd

from numpy.random import seed
import numpy as np
from pathlib import Path
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from textblob import TextBlob

from datasets import load_dataset
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from collections import defaultdict

nltk.download('stopwords')
nltk.download('punkt')

os.environ['TF_CUDNN_DETERMINISTIC']='true'
os.environ['TF_DETERMINISTIC_OPS']='true'

import textblob.download_corpora as dl
dl.download_all()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/stepan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/stepan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/stepan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/stepan/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_

# Загружаем датасет

In [4]:
test_dir = '../data/raw/tweet_eval_sentiment/test.csv'
train_dir = '../data/raw/tweet_eval_sentiment/train.csv'

In [5]:
test_df = pd.read_csv(test_dir)
train_df = pd.read_csv(train_dir)

# Создаем обучающую выборку

In [6]:
X_train_text = train_df["text"].astype(str).values
y_train = train_df["label"].values
X_test_text = test_df["text"].astype(str).values
y_test = test_df["label"].values

In [7]:
train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train))
test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test))

# Функции предобработки текста

In [9]:
# Do-Nothing preprocessing function.
def DON(input_data):
  tag_open_CDATA_removed = tf.strings.regex_replace(input_data, r'<\!\[CDATA\[', ' ')
  tag_closed_CDATA_removed = tf.strings.regex_replace(tag_open_CDATA_removed, r'\]{1,}>', ' ')
  tag_author_lang_en_removed = tf.strings.regex_replace(tag_closed_CDATA_removed,'', ' ')
  tag_closed_author_removed = tf.strings.regex_replace(tag_author_lang_en_removed,'', ' ')
  tag_open_documents_removed = tf.strings.regex_replace(tag_closed_author_removed, r'\n(\t){0,2}', '')
  output_data = tf.strings.regex_replace(tag_open_documents_removed, r'\n(\t){0,2}', ' ')
  return output_data

# Lowercasing preprocessing function.
def LOW(input_data):
  return tf.strings.lower(DON(input_data))

# Removing Stop Words function.
def RSW(input_data):
  output_data = DON(input_data)

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [word for word in blob if word not in stopwords.words('english')]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

# Porter Stemmer preprocessing function.
def STM(input_data):
  output_data = DON(input_data)
  stemmer = PorterStemmer()

  try:
    input_string=output_data[0]

  except:
    input_string=output_data

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant(output_string)

    return output_tensor

  else:

    try:
      input_string = input_string.numpy()

    except:
      return output_data

    else:
      input_string=(str(input_string))[2:-1]

    blob = TextBlob(str(input_string)).words
    outputlist = [stemmer.stem(word) for word in blob]
    output_string = (' '.join(word for word in outputlist))
    output_tensor=tf.constant([[output_string]])

    return output_tensor

  return output_data

In [10]:
## SECTION WITH PAIRS OF PREPRO FUNCTIONS. APPLICATION ORDER MATTERS (...IN FOLLOWING SECTIONS TOO).
#...5
def LOW_RSW(input_data):
  return RSW(LOW(input_data))

# 6
def LOW_STM(input_data):
  return STM(LOW(input_data))

# 7
def RSW_LOW(input_data):
  return LOW(RSW(input_data))

# 8
def RSW_STM(input_data):
  return STM(RSW(input_data))

# 9
def STM_LOW(input_data):
  return LOW(STM(input_data))

# 10
def STM_RSW(input_data):
  return RSW(STM(input_data))

# 11
def LOW_STM_RSW(input_data):
  return RSW(STM(LOW(input_data)))

# 12
def LOW_RSW_STM(input_data):
  return STM(RSW(LOW(input_data)))

# 13
def STM_LOW_RSW(input_data):
  return RSW(LOW(STM(input_data)))

# 14
def STM_RSW_LOW(input_data):
  return LOW(RSW(STM(input_data)))

# 15
def RSW_LOW_STM(input_data):
  return STM(LOW(RSW(input_data)))

# 16
def RSW_STM_LOW(input_data):
  return LOW(STM(RSW(input_data)))

In [11]:
max_features = 0
def preprocess_and_adapt_ts(preprocessing_function,training_set):
  # Set a large sequence length to find the longest sample in the training set.
  sequence_length = 15000
  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)

  model = tf.keras.models.Sequential()
  model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
  model.add(vectorize_layer)

  longest_sample_length=1

  for element in training_set:
    authorDocument=element[0]
    label=element[1]

    author_batch = tf.expand_dims(authorDocument, axis=0)
    out = model(author_batch)
    # Convert token list to numpy array.
    token_list = out.numpy()[0]
    token_list = np.trim_zeros(token_list,'b')
    if longest_sample_length < len(token_list):
      longest_sample_length = len(token_list)

  print("Length of the longest sample is:", longest_sample_length)

  # After tokenization longest_sample_length covers all the document lenghts in our dataset.
  sequence_length = longest_sample_length

  vectorize_layer = TextVectorization(
      standardize=preprocessing_function,
      output_mode='int',
      output_sequence_length=sequence_length,
      encoding='ISO-8859-1')

  # Finally adapt the vectorize layer.
  train_text = training_set.map(lambda x, y: x)
  vectorize_layer.adapt(train_text)
  global max_features
  max_features=len(vectorize_layer.get_vocabulary()) + 1
  return vectorize_layer

In [12]:
model_results = defaultdict(lambda: defaultdict(list))
prepro_functions_dict_base = {
    'DON':DON,
    'LOW':LOW,
    'RSW':RSW,
    'STM':STM
    }

# 3 prepro functions = 15 combs...+1 for do_nothing

prepro_functions_dict_comb = {
    # 1. Do nothing
    'DON': DON,
    # 2. Lowercasing
    'LOW':LOW,
    # 3. Removing Stopwords
    'RSW':RSW,
    # 4. Porter Stemming
    'STM':STM,
    # 5. LOW->RSW
    'LOW_RSW':LOW_RSW,
    # 6. LOW->STM
    'LOW_STM':LOW_STM,
    # 7. RSW->LOW
    'RSW_LOW':RSW_LOW,
    # 8. RSW->STM
    'RSW_STM':RSW_STM,
    # 9. STM->LOW
    'STM_LOW':STM_LOW,
    # 10. STM->RSW
    'STM_RSW':STM_RSW,
    # 11. LOW->STM->RSW
    'LOW_STM_RSW':LOW_STM_RSW,
    # 12. LOW->RSW->STM
    'LOW_RSW_STM':LOW_RSW_STM,
    # 13. STM->LOW->RSW
    'STM_LOW_RSW':STM_LOW_RSW,
    # 14. STM->RSW->LOW
    'STM_RSW_LOW':STM_RSW_LOW,
    # 15. RSW->LOW->STM
    'RSW_LOW_STM':RSW_LOW_STM,
    # 16. RSW->STM->LOW
    'RSW_STM_LOW':RSW_STM_LOW
}

In [13]:
for key in prepro_functions_dict_comb:
    print("\n\n* * * * EVALUATION USING", key, "AS PREPROCESSING FUNCTION * * * *")

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test_text, y_test)).shuffle(buffer_size=len(test_df), seed=1, reshuffle_each_iteration=False).batch(1).take(50)

    # Preprocess training set to build a dictionary.
    vectorize_layer = preprocess_and_adapt_ts(prepro_functions_dict_comb[key],train_ds)
    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)
    print("\n\n***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******")
    # Print a raw and a preprocessed sample.
    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      print("Sample considered is: ", author_batch)
      print("Preprocessed: ", str(prepro_functions_dict_comb[key](author_batch.numpy())))
      break

    train_ds = tf.data.Dataset.from_tensor_slices((X_train_text, y_train)).shuffle(buffer_size=len(train_df), seed=1, reshuffle_each_iteration=False).batch(1).take(200)

    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(1,), dtype=tf.string))
    model.add(vectorize_layer)

    training_labels=[]
    training_samples=[]

    max_features=len(vectorize_layer.get_vocabulary()) + 1

    for element in train_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)
        
      text_vect_out = vectorize_layer(author_batch)

      training_labels.append(label.numpy())
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      training_samples.append(current_sample)

    training_labels=np.array(training_labels)
    training_samples=np.array(training_samples)

    test_labels=[]
    test_samples=[]

    for element in test_ds:
      authorDocument=element[0]
      label=element[1]
      author_batch = tf.expand_dims(authorDocument, 0)

      text_vect_out = vectorize_layer(author_batch)

      test_labels.append(label.numpy())
      current_sample=np.zeros(max_features)
      for current_token in text_vect_out[0][:].numpy():
        current_sample[current_token]+=1
      test_samples.append(current_sample)

    test_labels=np.array(test_labels)
    test_samples=np.array(test_samples)

    models = {
        "AdaBoost": AdaBoostClassifier(random_state=0),
        "XGBoost": XGBClassifier(random_state=0),
        "RandomForest": RandomForestClassifier(random_state=0),
        "DecisionTree": DecisionTreeClassifier(random_state=0)
    }

    for name, model in models.items():
        model.fit(training_samples, training_labels.ravel())
        acc = model.score(test_samples, test_labels)
        model_results[key][name].append(acc)
        print(f"{name} Accuracy on Test set ->", acc)
    



* * * * EVALUATION USING DON AS PREPROCESSING FUNCTION * * * *


2025-05-02 16:05:51.728878: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-05-02 16:05:53.255188: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 146


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b"   I       t   h   o   u   g   h   t       t   h   i   s       w   a   s       t   h   e       a   n   n   o   u   n   c   e   m   e   n   t       o   f       a       J   a   n   e   t       J   a   c   k   s   o   n       r   e   m   i   x       a   t       f   i   r   s   t       a   n   d       I   '   m       k   i   n   d       o   f       d   i   s   a   p   p   o   i   n   t   e   d   .   "]], shape=(1, 1), dtype=string)


2025-05-02 16:05:53.602790: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


AdaBoost Accuracy on Test set -> 0.36
XGBoost Accuracy on Test set -> 0.32
RandomForest Accuracy on Test set -> 0.42
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 146


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b"   i       t   h   o   u   g   h   t       t   h   i   s       w   a   s       t   h   e       a   n   n   o   u   n   c   e   m   e   n   t       o   f       a       j   a   n   e   t       j   a   c   k   s   o   n       r   e   m   i   x       a   t       f   i   r   s   t       a   n   d       i   '   m       k   i   n   d       o   f       d   i   s   a   p   p   o   i   n   t   e   d   .   "]], shape=(1, 1), dtype=string)


2025-05-02 16:05:55.603567: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


AdaBoost Accuracy on Test set -> 0.32
XGBoost Accuracy on Test set -> 0.38
RandomForest Accuracy on Test set -> 0.38
DecisionTree Accuracy on Test set -> 0.32


* * * * EVALUATION USING RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 99


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'I h u g h h w h e n n u n c e e n f J n e J c k n r e x f r n I k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.34
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.36
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING STM AS PREPROCESSING FUNCTION * * * *


2025-05-02 16:06:00.539668: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 138


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'i t h o u g h t t h i s w a s t h e a n n o u n c e m e n t o f a j a n e t j a c k s o n r e m i x a t f i r s t a n d i m k i n d o f d i s a p p o i n t e d']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.4
XGBoost Accuracy on Test set -> 0.38
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING LOW_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.24
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.4
DecisionTree Accuracy on Test set -> 0.38


* * * * EVALUATION USING LOW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 138


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'i t h o u g h t t h i s w a s t h e a n n o u n c e m e n t o f a j a n e t j a c k s o n r e m i x a t f i r s t a n d i m k i n d o f d i s a p p o i n t e d']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.4
XGBoost Accuracy on Test set -> 0.38
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.26


* * * * EVALUATION USING RSW_LOW AS PREPROCESSING FUNCTION * * * *


2025-05-02 16:06:12.770255: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Length of the longest sample is: 99


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   i       h       u       g       h       h       w       h       e       n       n       u       n       c       e       e       n       f       j       n       e       j       c       k       n       r       e       x       f       r       n       i       k       n       f       p       p       n       e   ']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.44
XGBoost Accuracy on Test set -> 0.44
RandomForest Accuracy on Test set -> 0.36
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING RSW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 99


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'i h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n i k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.44
XGBoost Accuracy on Test set -> 0.44
RandomForest Accuracy on Test set -> 0.36
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 138


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   i       t       h       o       u       g       h       t       t       h       i       s       w       a       s       t       h       e       a       n       n       o       u       n       c       e       m       e       n       t       o       f       a       j       a       n       e       t       j       a       c       k       s       o       n       r       e       m       i       x       a       t       f       i       r       s    



AdaBoost Accuracy on Test set -> 0.4
XGBoost Accuracy on Test set -> 0.36
RandomForest Accuracy on Test set -> 0.32
DecisionTree Accuracy on Test set -> 0.28


* * * * EVALUATION USING STM_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.26
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING LOW_STM_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.26
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING LOW_RSW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.26
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING STM_LOW_RSW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n k n f p p n e']], shape=(1, 1), dtype=string)


2025-05-02 16:06:40.204529: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


AdaBoost Accuracy on Test set -> 0.26
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING STM_RSW_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 93


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   h       u       g       h       h       w       h       e       n       n       u       n       c       e       e       n       f       j       n       e       j       c       k       n       r       e       x       f       r       n       k       n       f       p       p       n       e   ']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.26
XGBoost Accuracy on Test set -> 0.42
RandomForest Accuracy on Test set -> 0.34
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING RSW_LOW_STM AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 99


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'i h u g h h w h e n n u n c e e n f j n e j c k n r e x f r n i k n f p p n e']], shape=(1, 1), dtype=string)




AdaBoost Accuracy on Test set -> 0.44
XGBoost Accuracy on Test set -> 0.44
RandomForest Accuracy on Test set -> 0.36
DecisionTree Accuracy on Test set -> 0.36


* * * * EVALUATION USING RSW_STM_LOW AS PREPROCESSING FUNCTION * * * *
Length of the longest sample is: 99


***** FINISHED PROCESSING AND ADAPTING THE TRAINING SET, THE SIMULATION BEGINS *******
Sample considered is:  tf.Tensor([[b"I thought this was the announcement of a Janet Jackson remix at first and I'm kind of disappointed."]], shape=(1, 1), dtype=string)
Preprocessed:  tf.Tensor([[b'   i       h       u       g       h       h       w       h       e       n       n       u       n       c       e       e       n       f       j       n       e       j       c       k       n       r       e       x       f       r       n       i       k       n       f       p       p       n       e   ']], shape=(1, 1), dtype=string)
AdaBoost Accuracy on Test set -> 0.44
XGBoost Accuracy on Test set -> 0.44
RandomForest Accuracy on T



# Результаты

In [14]:
best_result_per_model = {}

print("PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   ")
print("-" * 60)

for prepro_func in prepro_functions_dict_comb:
    for model_name, scores in model_results[prepro_func].items():
        acc = round(scores[-1], 4)
        if model_name not in best_result_per_model or acc > best_result_per_model[model_name][1]:
            best_result_per_model[model_name] = (prepro_func, acc)
        print(f"{prepro_func:23} | {model_name:17} | {acc:>6.4f}")
    print("-" * 60)

PREPRO FUNCTION         | MODEL NAME        | ACCURANCY   
------------------------------------------------------------
DON                     | AdaBoost          | 0.3600
DON                     | XGBoost           | 0.3200
DON                     | RandomForest      | 0.4200
DON                     | DecisionTree      | 0.2600
------------------------------------------------------------
LOW                     | AdaBoost          | 0.3200
LOW                     | XGBoost           | 0.3800
LOW                     | RandomForest      | 0.3800
LOW                     | DecisionTree      | 0.3200
------------------------------------------------------------
RSW                     | AdaBoost          | 0.3400
RSW                     | XGBoost           | 0.4200
RSW                     | RandomForest      | 0.3600
RSW                     | DecisionTree      | 0.3600
------------------------------------------------------------
STM                     | AdaBoost          | 0.4000
STM     

In [15]:
print("MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY")
print("-" * 60)
for model_name, (best_func, acc) in best_result_per_model.items():
    print(f"{model_name:18} | {best_func:22} | {acc:.4f}")

MODEL NAME         | BEST PREPRO FUNCTION   | MAX ACCURACY
------------------------------------------------------------
AdaBoost           | RSW_LOW                | 0.4400
XGBoost            | RSW_LOW                | 0.4400
RandomForest       | DON                    | 0.4200
DecisionTree       | LOW_RSW                | 0.3800


# Сохраним результаты в отдельную таблицу

In [16]:
prepo_path = "../reports/preprocessing_combinations/tweet_eval_sentiment/"
os.makedirs(prepo_path, exist_ok=True)

In [17]:
rows = []

for prepo_func, model_scores in model_results.items():
    for model_name, acc in model_scores.items():
        rows.append({
            "prepo_func": prepo_func,
            "model": model_name,
            "accuracy": round(acc[-1], 4)
        })

df_full = pd.DataFrame(rows)
df_full.to_csv(f"{prepo_path}full.csv", index=False)

In [18]:
df_best = pd.DataFrame([
    {"model": model, "best_prepo_func": func, "max_accuracy": acc}
    for model, (func, acc) in best_result_per_model.items()
])
df_best.to_csv(f"{prepo_path}best.csv", index=False)