In [6]:
import tensorflow_datasets as tfds
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import time
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from minisom import MiniSom
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Input, Dense, Lambda , RepeatVector, TimeDistributed, Embedding, LSTM
from keras.models import Model
from keras import backend as K
from joblib import Parallel, delayed
import tensorflow as tf
from tensorflow.keras import layers, losses, models
from keras.datasets import imdb
from keras.models import Sequential, Model
from sklearn.model_selection import train_test_split

In [5]:
#!pip install catboost

In [4]:
#!pip install minisom

In [7]:
def load_imdb_dataset():
    (train_data, test_data), ds_info = tfds.load(
        'imdb_reviews/plain_text',
        split=['train', 'test'],
        as_supervised=True,
        with_info=True
    )
    return train_data, test_data, ds_info

train_data, test_data, ds_info = load_imdb_dataset()

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteDA3IHZ/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteDA3IHZ/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteDA3IHZ/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [8]:
def prepare_text_data(train_data, test_data):
    train_reviews = [review.numpy().decode('utf8') for review, _ in train_data]
    train_labels = [label.numpy() for _, label in train_data]

    test_reviews = [review.numpy().decode('utf8') for review, _ in test_data]
    test_labels = [label.numpy() for _, label in test_data]

    vectorizer = CountVectorizer(stop_words='english', max_features=10000)
    X_train = vectorizer.fit_transform(train_reviews).toarray()
    X_test = vectorizer.transform(test_reviews).toarray()

    return X_train, np.array(train_labels), X_test, np.array(test_labels)

X_train, y_train, X_test, y_test = prepare_text_data(train_data, test_data)


# Classifiers on Original Dataset

# XGBoost

In [None]:
start_time = time.time()
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
execution_time = time.time() - start_time

xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print("Accuracy", xgb_accuracy)
print("Time",execution_time )

Accuracy 0.85276
Time 94.62994337081909


# Light GBM



In [None]:
start_time = time.time()
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)
lgbm_predictions = lgbm_model.predict(X_test)
execution_time_lgb = time.time() - start_time

lgbm_accuracy = accuracy_score(y_test, lgbm_predictions)

print("Accuracy", lgbm_accuracy)
print("Time",execution_time_lgb )

[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 4.481717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38421
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 9766
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy 0.85728
Time 30.994879722595215


# CatBoost

In [None]:
start_time = time.time()
catboost_model = CatBoostClassifier(verbose=0)

catboost_model.fit(X_train, y_train)
catboost_predictions = catboost_model.predict(X_test)

execution_time_cat= time.time() - start_time

catboost_accuracy = accuracy_score(y_test, catboost_predictions)

print("Accuracy", catboost_accuracy)
print("Time",execution_time_cat )

Accuracy 0.86348
Time 533.8533637523651


# Classifiers with SOM

In [10]:
num_words = 10000
maxlen = 100
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=num_words)

x_train_padded = pad_sequences(x_train, maxlen=maxlen)
x_test_padded = pad_sequences(x_test, maxlen=maxlen)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [11]:
x_train_flat = x_train_padded.reshape(len(x_train_padded), -1)
x_test_flat = x_test_padded.reshape(len(x_test_padded), -1)

In [12]:
correct_input_len = maxlen

som = MiniSom(5, 5, correct_input_len, sigma=0.3, learning_rate=0.5)

som.train_random(x_train_flat, 1000)

In [13]:
X_train_som, X_val_som, y_train_som, y_val_som = train_test_split(x_train_flat, y_train, test_size=0.2, random_state=42)

# XGBoost with SOM

In [14]:
start_time = time.time()

xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_classifier.fit(X_train_som, y_train_som)

execution_time = time.time() - start_time
y_pred = xgb_classifier.predict(X_val_som)

accuracy = accuracy_score(y_val_som, y_pred)
print(f'Validation accuracy: {accuracy:.2f}')
print("Time",execution_time)

Validation accuracy: 0.56
Time 5.166607141494751


# LightGBM with SOM


In [18]:
start_time = time.time()

lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train_som, y_train_som, eval_set=[(X_val_som, y_val_som)])
y_pred = lgbm_classifier.predict(X_val_som)
execution_time_lgb = time.time() - start_time

accuracy_lgbm = accuracy_score(y_val_som, y_pred)

print("Accuracy", accuracy_lgbm)
print("Time",execution_time_lgb )

[LightGBM] [Info] Number of positive: 9937, number of negative: 10063
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060873 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 20000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496850 -> initscore=-0.012600
[LightGBM] [Info] Start training from score -0.012600
Accuracy 0.5792
Time 7.266053199768066


# CAT Boost with SOM

In [20]:
start_time = time.time()
catboost_classifier = CatBoostClassifier(iterations=100, learning_rate=1, depth=2, loss_function='Logloss', verbose=False)

catboost_classifier.fit(X_train_som, y_train_som, eval_set=(X_val_som, y_val_som), use_best_model=True)
execution_time_cat= time.time() - start_time

y_pred = catboost_classifier.predict(X_val_som)
accuracy_cat = accuracy_score(y_val_som, y_pred)

print("Accuracy", accuracy_cat)
print("Time",execution_time_cat)

Accuracy 0.561
Time 3.52058744430542


# RBM

In [None]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
#rbm = BernoulliRBM(n_components=100, learning_rate=0.01, n_iter=5, verbose=True)
#rbm.fit(X_train_scaled)

In [None]:
X_train_transformed = rbm.transform(X_train_scaled)
X_test_transformed = rbm.transform(X_test_scaled)

## XGBoost with RBM

In [None]:
start_time = time.time()

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_transformed, y_train)
xgb_predictions = xgb_model.predict(X_test_transformed)
execution_time = time.time() - start_time

xgb_accuracy_rbm = accuracy_score(y_test, xgb_predictions)
print("Accuracy", xgb_accuracy_rbm)
print("Time",execution_time )

Accuracy 0.52332
Time 2.6180198192596436


## LightGBM with RBM

In [None]:
start_time = time.time()

lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train_transformed, y_train)
lgbm_predictions = lgbm_model.predict(X_test_transformed)
execution_time_lgb = time.time() - start_time

lgbm_accuracy_rbm = accuracy_score(y_test, lgbm_predictions)

print("Accuracy", lgbm_accuracy_rbm)
print("Time",execution_time_lgb )

[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy 0.54988
Time 2.78187894821167


## CATBoost with RBM

In [None]:
start_time = time.time()
catboost_model = CatBoostClassifier(verbose=0)  # To keep the output clean

catboost_model.fit(X_train_transformed, y_train)
catboost_predictions = catboost_model.predict(X_test_transformed)

execution_time_cat= time.time() - start_time

catboost_accuracy_rbm = accuracy_score(y_test, catboost_predictions)

print("Accuracy", catboost_accuracy_rbm)
print("Time",execution_time_cat )

Accuracy 0.51344
Time 20.745953798294067


# LSTM Based Autoencoder



In [None]:
max_features = 10000  # Number of words to consider as features
maxlen = 500  # Cut texts after this number of words
batch_size = 32

In [None]:
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train sequences
25000 test sequences


In [None]:
print('Pad sequences (samples x time)')
input_train = pad_sequences(input_train, maxlen=maxlen)
input_test = pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

Pad sequences (samples x time)
input_train shape: (25000, 500)
input_test shape: (25000, 500)


In [None]:
embedding_dim = 50

In [None]:
inputs = Input(shape=(None,))
x = Embedding(max_features, embedding_dim)(inputs)
encoded = LSTM(32)(x)


In [None]:
decoded = RepeatVector(maxlen)(encoded)
decoded = LSTM(embedding_dim, return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(max_features, activation='softmax'))(decoded)


In [None]:
autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
encoder = Model(inputs, encoded)

In [None]:
input_train_enc = np.expand_dims(input_train, -1)
input_test_enc = np.expand_dims(input_test, -1)

In [None]:
#autoencoder.fit(input_train, input_train_enc, epochs=1, batch_size=batch_size, validation_split=0.2)


In [None]:
encoded_train = encoder.predict(input_train)
encoded_test = encoder.predict(input_test)



## XGBoost with Autoencoder

In [None]:
start_time = time.time()

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(encoded_train, y_train)

execution_time = time.time() - start_time

y_pred = xgb_model.predict(encoded_test)

xgb_accuracy_enc = accuracy_score(y_test, y_pred)
print("Accuracy", xgb_accuracy_enc)
print("Time",execution_time )

Accuracy 0.61792
Time 1.0650792121887207


## LightGBM with Autoencoder

In [None]:
start_time = time.time()

lgbm_model = LGBMClassifier()
lgbm_model.fit(encoded_train, y_train)
execution_time_lgb = time.time() - start_time

lgbm_predictions = lgbm_model.predict(encoded_test)
lgbm_accuracy_enc = accuracy_score(y_test, lgbm_predictions)

print("Accuracy", lgbm_accuracy_enc)
print("Time",execution_time_lgb )

[LightGBM] [Info] Number of positive: 12500, number of negative: 12500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8160
[LightGBM] [Info] Number of data points in the train set: 25000, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Accuracy 0.61288
Time 0.9537703990936279


## CatBoost with Autoencoder

In [None]:
start_time = time.time()
catboost_model = CatBoostClassifier(verbose=0)  # To keep the output clean

catboost_model.fit(encoded_train, y_train)
catboost_predictions = catboost_model.predict(encoded_test)

execution_time_cat= time.time() - start_time

catboost_accuracy_enc = accuracy_score(y_test, catboost_predictions)

print("Accuracy", catboost_accuracy_enc)
print("Time",execution_time_cat )

Accuracy 0.61576
Time 18.81877374649048
