## Import Libraries

In [6]:
import re
import ast
import string
import sklearn

import numpy as np
import pandas as pd
import tensorflow as tf

import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff


from collections import Counter

from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

## Load Data

In [7]:
df = pd.read_csv('data.csv')
df

Unnamed: 0,product_id,title,vendor,tags,category
0,3937721221199,Fidele Super Premium Adult Large Breed Dog Food,Fidele,"['Adult', 'Bangalore', 'Chennai', 'Chicken', '...",Animals & Pet Supplies
1,7353058033889,Foldable Pet Toys Linen Storage,Cap Point,[],Animals & Pet Supplies
2,6594773549129,Bok Dok Diaper,Pets Home,"['Brand_Pet Arabia', 'Category_Pets Home', 'Ca...",Animals & Pet Supplies
3,4802008318014,Tastybone Toy Chicken,TastyBone,[],Animals & Pet Supplies
4,1779705151539,Leather Leash Tab - Short Dog Leash,Mighty Paw,"['Leash', 'Leash Tab', 'Training']",Animals & Pet Supplies
...,...,...,...,...,...
5265,4637089464407,Candylab MOO Milk Van,Candylab,"['3 Years +', 'candylab', 'Discount Products',...",Vehicles & Parts
5266,4996632444987,"Truck - Modern Era Vehicles -- Red, White - S...",Woodland Scenics,"['HO Scale', 'ho-scale-items', 'vehicles', 'wo...",Vehicles & Parts
5267,5528541003927,Car Sticker Flags Decal American Flag Sticker for,Cyan Selene,['Other'],Vehicles & Parts
5268,1395163889730,Lazer Helmets Bayamo Pit Bull - Full Face,OPEN BOX BARGAINS,"['65061090', 'Antiscratch Pinlock Ready Visor'...",Vehicles & Parts


In [8]:
# counting the number of each label
df['category'].value_counts()

category
Apparel & Accessories        1000
Animals & Pet Supplies        500
Food, Beverages & Tobacco     400
Sporting Goods                400
Luggage & Bags                400
Home & Garden                 400
Health & Beauty               400
Media                         300
Toys & Games                  300
Furniture                     200
Baby & Toddler                200
Arts & Entertainment          200
Electronics                   100
Business & Industrial         100
Office Supplies               100
Vehicles & Parts              100
Hardware                       50
Cameras & Optics               50
Software                       50
Religious & Ceremonial         20
Name: count, dtype: int64

In [9]:
# List of common stopwords
stopwords = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
    "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how",
    "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", 
    "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
    "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", 
    "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", 
    "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", 
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", 
    "you're", "you've", "your", "yours", "yourself", "yourselves"
])

# Tokenizing the titles and filtering stopwords
words = []
for title in df['title']:
    # Tokenize and convert to lowercase
    tokens = title.split()
    tokens = [word.lower().strip(string.punctuation) for word in tokens]
    words.extend(tokens)

# Filter out stopwords
filtered_words = [word for word in words if word not in stopwords and word]

# Count the occurrences of each word
word_counts = Counter(filtered_words)

word_counts.most_common(10)

[('backpack', 223),
 ('black', 201),
 ('dog', 168),
 ('toy', 162),
 ('bag', 139),
 ('dress', 128),
 ('blue', 94),
 ('set', 93),
 ('earrings', 90),
 ('white', 82)]

### Tokenizing and Standardizing
Tokenize and standardizing the sentences. Also filtering the words based on a predefined regex. 

In [10]:
# the function for extracting and standardizing the sentences
def text_extraction(dfi):
    # in this function, we concatenate text feature parts of the data as a sentence
    sentence = ' '.join([dfi['title'], str(dfi['vendor']), dfi['tags']])
    # Remove punctuations
    sentence = re.sub('[^a-zA-Z0-9$.]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    # Changint to lowercase
    sentence = sentence.lower()
    return sentence

# printing 10 sample sentences
for i in range(10):
    print(text_extraction(df.iloc[i]))

fidele super premium adult large breed dog food fidele adult bangalore chennai chicken doberman dog dry foods fidele german shepherd golden retriever great dane highpriority imported labrador less than 1000 less than 2000 less than 500 mastiff orange pet nutrition 
foldable pet toys linen storage cap point 
bok dok diaper pets home brand pet arabia category pets home category small pets supplies type pet home type pet supplies 
tastybone toy chicken tastybone 
leather leash tab short dog leash mighty paw leash leash tab training 
pridebites texas guitar dog toy pride bites brand pridebites toy type plush 
burns sensitive pork potato burns 10 25 25 50 50 75 adult burns coat dog food food delivery jansale18 natural nonsale19 sensitive size 12kg size 2kg size 6kg skin 
bully sticks dog toy adog.co bully sticks dog chew toys dog toys 
kazoo tough giraffe dog toy kazoo brand kazoo june2021 kazoo material plush plush 
orgo dog biscuits fresh milk petku brand orgo category dogs dogs lifestage

In [11]:
# creating the dataset
rows = [{'text': text_extraction(df.iloc[i]), 'label': df.iloc[i]['category']} for i in range(len(df))]
dataset = pd.DataFrame(rows)


# creating integer labels for multiclass training
dataset['label_int'] = pd.Categorical(dataset['label']).codes
dataset

Unnamed: 0,text,label,label_int
0,fidele super premium adult large breed dog foo...,Animals & Pet Supplies,0
1,foldable pet toys linen storage cap point,Animals & Pet Supplies,0
2,bok dok diaper pets home brand pet arabia cate...,Animals & Pet Supplies,0
3,tastybone toy chicken tastybone,Animals & Pet Supplies,0
4,leather leash tab short dog leash mighty paw l...,Animals & Pet Supplies,0
...,...,...,...
5265,candylab moo milk van candylab 3 years candyla...,Vehicles & Parts,19
5266,truck modern era vehicles red white scale ho w...,Vehicles & Parts,19
5267,car sticker flags decal american flag sticker ...,Vehicles & Parts,19
5268,lazer helmets bayamo pit bull full face open b...,Vehicles & Parts,19


In [12]:
# extracting the names of the labels
labels_names = list(Counter(dataset['label']).keys())
labels_names

['Animals & Pet Supplies',
 'Apparel & Accessories',
 'Arts & Entertainment',
 'Baby & Toddler',
 'Business & Industrial',
 'Cameras & Optics',
 'Electronics',
 'Food, Beverages & Tobacco',
 'Furniture',
 'Hardware',
 'Health & Beauty',
 'Home & Garden',
 'Luggage & Bags',
 'Media',
 'Office Supplies',
 'Religious & Ceremonial',
 'Software',
 'Sporting Goods',
 'Toys & Games',
 'Vehicles & Parts']

In [13]:
# printing each integer label and its corresponding name label
for i, label in enumerate(labels_names):
    print("Label", i, "corresponds to", label)

Label 0 corresponds to Animals & Pet Supplies
Label 1 corresponds to Apparel & Accessories
Label 2 corresponds to Arts & Entertainment
Label 3 corresponds to Baby & Toddler
Label 4 corresponds to Business & Industrial
Label 5 corresponds to Cameras & Optics
Label 6 corresponds to Electronics
Label 7 corresponds to Food, Beverages & Tobacco
Label 8 corresponds to Furniture
Label 9 corresponds to Hardware
Label 10 corresponds to Health & Beauty
Label 11 corresponds to Home & Garden
Label 12 corresponds to Luggage & Bags
Label 13 corresponds to Media
Label 14 corresponds to Office Supplies
Label 15 corresponds to Religious & Ceremonial
Label 16 corresponds to Software
Label 17 corresponds to Sporting Goods
Label 18 corresponds to Toys & Games
Label 19 corresponds to Vehicles & Parts


### Balance the Classes
Upsampling:

In [14]:
max_samples = dataset['label'].value_counts().max()
max_samples

1000

In [15]:
balanced_data_list = []

for class_name, group in dataset.groupby('label'):
    if len(group) < max_samples:
        upsampled_group = resample(group, 
                                   replace=True, 
                                   n_samples=max_samples, 
                                   random_state=42)
    else:
        upsampled_group = group

    balanced_data_list.append(upsampled_group)

balanced_data = pd.concat(balanced_data_list)

In [16]:
balanced_data

Unnamed: 0,text,label,label_int
102,premium boxer dog reflective harness ploocy bo...,Animals & Pet Supplies,0
435,midwest icrate fold and carry double door coll...,Animals & Pet Supplies,0
348,gourmet deli wfish herring rollover brand roll...,Animals & Pet Supplies,0
270,porkhide knotted bone rollover brand rollover ...,Animals & Pet Supplies,0
106,jbj protein skimmer jbj regular price,Animals & Pet Supplies,0
...,...,...,...
5179,extractor shampoo low foam 3d car care carpet ...,Vehicles & Parts,19
5236,motorcycle amber red turn signal light unbrand...,Vehicles & Parts,19
5187,bennett one box indication unit for bolt elect...,Vehicles & Parts,19
5269,deutz agrotron tractor siku $0 to $25 diecast ...,Vehicles & Parts,19


### Data sampling
After standardizing the dataset, It's time to **split** the dataset into train dataset and test datasets. The selected ratio of data is 80% of the dataset for training and 20% of the dataset is for testing and validation. 50% of the testing dataset is for validation and the rest is for testing our model.

In [17]:
# splitting dataset to train, validation, and test dataframes
train_df, test_df= train_test_split(balanced_data, test_size=0.2, random_state=42)
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

print(f"Number of samples in training set: {len(train_df)}")
print(f"Number of samples in validation set: {len(val_df)}")
print(f"Number of samples in test set: {len(test_df)}")

Number of samples in training set: 16000
Number of samples in validation set: 2000
Number of samples in test set: 1046


In [18]:
# extracting texts and labels from dataframes
train_texts = train_df['text']
train_labels = train_df['label_int']
val_texts = val_df['text']
val_labels = val_df['label_int']
test_texts = test_df['text']
test_labels = test_df['label_int']

In [19]:
val_texts.shape

(2000,)

### Batching
Batching in machine learning refers to the process of dividing a dataset into smaller groups or batches to be processed by a model. Rather than training or evaluating on the entire dataset at once, the data is split into smaller subsets that are processed in parallel or sequentially.

Batching is commonly used in training deep learning models, especially for tasks such as image recognition or natural language processing, where the datasets can be very large. By splitting the dataset into batches, it reduces the memory requirements of the model, making it possible to process larger datasets that might otherwise exceed the memory limitations of the hardware.

Batching also helps to speed up the training process by allowing the model to process the dataset in parallel. This can significantly reduce the time required to train the model and allows for more iterations of training and evaluation to be performed in a given amount of time.

We need to split our datasets into 32 sized batches.

In [20]:
# creating data generators with batch size 32
batch_size = 32
raw_train_batch = tf.data.Dataset.from_tensor_slices((train_texts, train_labels)).batch(batch_size)
raw_val_batch = tf.data.Dataset.from_tensor_slices((val_texts, val_labels)).batch(batch_size)
raw_test_batch = tf.data.Dataset.from_tensor_slices((test_texts, test_labels)).batch(batch_size)

# printing texts and labels of a batch of raw train
for text, label in raw_train_batch.take(1):
    print('Texts: {}'.format(text))
    print('labels: {}'.format(label))

Texts: [b'blackrapid curve breathe black rapid brand black rapid straps slings style camera straps '
 b'bubba bear cot sheet baby jalebi bedding bedding cot sheets bubba bear condition new '
 b'stein world brendan table stein world assembled birch birchwood black cream elk inv elkgroupinternational legs living room furniture sale mdf round rustic stein world stein world accent tables '
 b'quadrello di bufala cheese cut wrapped by igourmet category cheese cut cheeses milk type buffalo nutrition full set origin italy shipping perishable texture semi soft type stinky and washed rind wholesale cheese collection '
 b'frida kahlo standing next to an agave plant toni frissell 2019 agave black white black and white botanicals burrow desert fashion frida frida kahlo frissell kahlo large medium mexico pd artist photography plant plants small toni toni frissell vintage vintage edition vintage editions vogue '
 b'distaut orszaghaz peter zeglis city green landscape photography print water '
 b'fore

In [21]:
# counting how many words are there in the whole texts of the dataset
num_of_words = 0
for i in dataset['text']: num_of_words += len(i.split())

print(num_of_words)

112574


There are about 112000 words in the texts.



In [22]:
# counting max sequence length and how many non-repetitive words are there in the whole texts of the dataset
l = []
max_seq_lenght = 0
for i in dataset['text']:
    lenght = len(i.split())
    if lenght > max_seq_lenght: max_seq_lenght = lenght
    for j in i.split():
        if j not in l: l.append(j)

print(max_seq_lenght)
print(len(l))

309
18932


Maximum sequence length is 309 and There are about 19000 non-repetitive words in the whole dataset texts. So we set max word features to 10000 and sequence length to 350.

### Vectorizing the datasets
Each sentece has a word vector with a length of 350. Each unique word is mapped to a number between 0 and 10000

Text vectorization is the process of converting text data into numerical vectors that can be used as input to machine learning algorithms. The goal of text vectorization is to represent text data in a way that captures its meaning and allows algorithms to process it effectively.   
   
Overall, text vectorization is a critical component of many NLP applications and is essential for effectively processing and analyzing large volumes of textual data.

In [23]:
# setting the text vectorization layer with 20000 words and 320 sequence length
max_features = 20000
sequence_length = 320

vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# fitting the state of the preprocessing layer to the train set. This will cause the model to build an index of strings to integers.
vectorize_layer.adapt(train_texts)

# defining the vectorize text function
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

# retrieving a sample from a batch of texts and labels from the train set
text_batch, label_batch = next(iter(raw_train_batch))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", first_label)
print("Vectorized text", vectorize_text(first_review, first_label))

Review tf.Tensor(b'blackrapid curve breathe black rapid brand black rapid straps slings style camera straps ', shape=(), dtype=string)
Label tf.Tensor(5, shape=(), dtype=int8)
Vectorized text (<tf.Tensor: shape=(1, 320), dtype=int64, numpy=
array([[4596, 3244, 4590,   12, 2353,   20,   12, 2353, 1124, 4341,   63,
         216, 1124,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,

In [24]:
# getting corresponding word of each integer
print("1401 ---> ",vectorize_layer.get_vocabulary()[1401])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1401 --->  fagus
 313 --->  days
Vocabulary size: 17283


In [25]:
# creating train, val, and test vectorized dataset and prefetching them
train_ds = raw_train_batch.map(vectorize_text)
val_ds = raw_val_batch.map(vectorize_text)
test_ds = raw_test_batch.map(vectorize_text)

AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Building the Model
In this section we'll build our text classifier model which has a **text embedding layer**.

Text embedding is a technique used in natural language processing (NLP) to represent words, phrases, or entire documents as dense vectors of numerical values. The idea behind text embedding is to capture the meaning of textual data in a way that is computationally efficient and allows algorithms to make sense of the relationships between different pieces of text.

The resulting embedding vectors are typically of fixed dimensionality and can be used as input to machine learning algorithms for various tasks such as text classification, sentiment analysis, and information retrieval. By representing text as vectors, NLP models can operate on them as continuous mathematical objects and easily capture complex relationships between words and phrases.

In [26]:
# model configuration
embedding_dim = 32
num_of_labels = 20

model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(num_of_labels)])

model.summary()

### Compile Model

In [27]:
# model compilation
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

###  Train Model
Fitting the model on our database and getting the results.

In [28]:
# training the model
epochs = 70

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=5,
                                            verbose=1)

history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=epochs,
                    callbacks=[callback])

Epoch 1/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0644 - loss: 2.9924 - val_accuracy: 0.1645 - val_loss: 2.9594
Epoch 2/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1368 - loss: 2.9399 - val_accuracy: 0.1310 - val_loss: 2.8747
Epoch 3/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2067 - loss: 2.8273 - val_accuracy: 0.2870 - val_loss: 2.7449
Epoch 4/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3068 - loss: 2.6851 - val_accuracy: 0.4445 - val_loss: 2.5931
Epoch 5/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.3988 - loss: 2.5235 - val_accuracy: 0.6530 - val_loss: 2.4107
Epoch 6/70
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5139 - loss: 2.3346 - val_accuracy: 0.7375 - val_loss: 2.2117
Epoch 7/70
[1m500/500[0m 

In [29]:
# printing loss and accuracy of the model on the test set
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 380us/step - accuracy: 0.9741 - loss: 0.1136
Loss:  0.1366550475358963
Accuracy:  0.9722753167152405


#### Plot Loss and Accuracy of Model

In [30]:
# Extract training history
training_loss = history.history['loss']
training_accuracy = history.history['accuracy']
validation_loss = history.history['val_loss']
validation_accuracy = history.history['val_accuracy']
epochs = list(range(1, len(training_loss) + 1))  # Convert range to list

In [31]:
# Create subplots
fig = go.Figure()

# Add training loss plot
fig.add_trace(go.Scatter(x=epochs, y=training_loss, mode='lines+markers', name='Training Loss'))
# Add validation loss plot
fig.add_trace(go.Scatter(x=epochs, y=validation_loss, mode='lines+markers', name='Validation Loss'))

# Update layout
fig.update_layout(title='Training and Validation Metrics',
                  xaxis_title='Epochs',
                  yaxis_title='Value',
                  legend=dict(x=0.7, y=1),
                  template='plotly_dark')

# Show the plot
fig.show()

In [32]:
# Create subplots
fig = go.Figure()

# Add training accuracy plot
fig.add_trace(go.Scatter(x=epochs, y=training_accuracy, mode='lines+markers', name='Training Accuracy'))
# Add validation accuracy plot
fig.add_trace(go.Scatter(x=epochs, y=validation_accuracy, mode='lines+markers', name='Validation Accuracy'))

# Update layout
fig.update_layout(title='Training and Validation Metrics',
                  xaxis_title='Epochs',
                  yaxis_title='Value',
                  legend=dict(x=0.7, y=1),
                  template='plotly_dark')

# Show the plot
fig.show()

In [33]:
# creating the probability model for testing
probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])

# predicting test samples
predictions = probability_model.predict(raw_test_batch.map(vectorize_text))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 822us/step


In [34]:
predicted_labels = []
for i in range(len(predictions)):
    predicted_labels.append(np.argmax(predictions[i]))

#### Confusion Matrix

In [35]:
las = np.array(test_df['label'].unique())
las = las.tolist()

In [36]:
# Compute the confusion matrix
cm = confusion_matrix(np.array(test_df['label_int']), np.array(predicted_labels))

# Create a heatmap for the confusion matrix using Plotly
fig = ff.create_annotated_heatmap(
    z=cm,
    x=las,
    y=las,
    colorscale='Blackbody'
)

# Update the layout
fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Label",
    yaxis_title="True Label",
    yaxis_autorange="reversed",
)

fig.show()

#### Classification Report

In [37]:
print(classification_report(np.array(test_df['label_int']), np.array(predicted_labels)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        64
           1       0.99      0.92      0.95        83
           2       1.00      1.00      1.00        60
           3       1.00      1.00      1.00        58
           4       1.00      1.00      1.00        33
           5       1.00      1.00      1.00        14
           6       1.00      1.00      1.00        31
           7       0.96      1.00      0.98        66
           8       1.00      1.00      1.00        61
           9       0.88      1.00      0.93         7
          10       0.97      0.97      0.97        79
          11       0.99      0.97      0.98        80
          12       1.00      0.97      0.98        91
          13       1.00      0.97      0.99        71
          14       1.00      1.00      1.00        40
          16       1.00      1.00      1.00        18
          17       0.99      0.86      0.92        81
          18       0.78    

#### Check Model

In [38]:
# showing the first test sample result label
np.argmax(predictions[0])

8

In [39]:
# showing the true label of the first test sample
test_df.iloc[0]

text         eli ottoman safavieh color green material poly...
label                                                Furniture
label_int                                                    8
Name: 2727, dtype: object

In [40]:
import tensorflow as tf

# Assuming `model`, `vectorize_layer`, and `label_encoder` are already defined and trained

def predict_category(title):
    # Step 1: Preprocess the title
    title = tf.expand_dims(title, -1)  # Expand dims to match the model's expected input
    title_vectorized = vectorize_layer(title)  # Vectorize the input title
    
    # Step 2: Predict the category
    predictions = model.predict(title_vectorized)
    predicted_label = tf.argmax(predictions, axis=-1).numpy()[0]  # Get the predicted label index
    
    predicted_category = labels_names[predicted_label]
    
    return predicted_category

# Example usage:
custom_title = "stationery"
predicted_category = predict_category(custom_title)
print(f"Title: '{custom_title}' --> Predicted Category: {predicted_category}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Title: 'stationery' --> Predicted Category: Office Supplies
