## Load all files from the model folder

In [15]:
# Directory that holds the saved model, the tokenizer config and the pickled
# training metadata; every load path below is built from it.
model_folder = "model"

## Load model

In [16]:
from tensorflow import keras

# Restore the trained Keras model from the `.h5` file stored in the model folder.
model = keras.models.load_model(f'{model_folder}/model_v2.h5')

## Load tokenizer

In [17]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import json

# Read the serialized tokenizer configuration (a JSON document) as one string.
# NOTE(review): the file is opened with the platform default text encoding;
# confirm that it matches the encoding used when the config was written.
with open(f'{model_folder}/tokenizer_config.json', 'r') as json_file:
    tokenizer_config_str = json_file.read()

# Rebuild the tokenizer object from its JSON configuration string.
tokenizer = tokenizer_from_json(tokenizer_config_str)


## Load the training data

In [18]:
import pickle

# Path of the pickle file that stores the metadata saved alongside the model.
pickle_file_path = f'{model_folder}/training_data.pkl'

# Load the training_data dictionary from the pickle file.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only open a training_data.pkl that comes from a trusted source.
with open(pickle_file_path, 'rb') as pickle_file:
    loaded_training_data = pickle.load(pickle_file)

# Unpack the individual entries of the loaded dictionary.
# presumably the vocabulary size used by the tokenizer — TODO confirm
max_words = loaded_training_data['max_words']
# presumably the padded sequence length used at training time — TODO confirm
max_sequence = loaded_training_data['max_sequence']
# NOTE(review): the structure of `legend` is not visible in this notebook.
legend = loaded_training_data['legend']
# Mapping indexed by the stringified class index to obtain the label name
# (this is how predict() uses it).
labels_legend_inverted = loaded_training_data['labels_legend_inverted']

## define the stem function

In [19]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Download the punkt tokenizer if you haven't already

def stem_text(text):
    """Return ``text`` with every token replaced by its Porter stem.

    The text is split into tokens with ``word_tokenize``, each token is
    stemmed with a ``PorterStemmer``, and the stems are joined back together
    with single spaces.

    Args:
        text: The string to stem.

    Returns:
        A string made of the stemmed tokens separated by one space each.
    """
    porter = PorterStemmer()
    return ' '.join(map(porter.stem, word_tokenize(text)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Define the prediction function

In [20]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict(text_str, max_sequence=30, tokenizer=None, model=None, labels_legend_inverted=None):
    """Classify one piece of text and report its most probable label.

    The text is stemmed with ``stem_text``, turned into an integer sequence by
    ``tokenizer``, padded to ``max_sequence`` entries with ``pad_sequences``
    and scored by ``model``.

    Args:
        text_str: Raw text to classify.
        max_sequence: Value passed as ``maxlen`` when padding the sequence.
        tokenizer: Object providing ``texts_to_sequences``.
        model: Object providing ``predict(x, verbose=...)``.
        labels_legend_inverted: Mapping from the stringified class index
            (e.g. ``"0"``) to the label name.

    Returns:
        ``None`` when ``tokenizer``, ``model`` or ``labels_legend_inverted``
        is missing or falsy. Otherwise a tuple ``(label, {label: score})``
        where ``score`` is the model output for the winning class as a float.
    """
    # Nothing can be predicted unless all three collaborators are supplied.
    if not (tokenizer and model and labels_legend_inverted):
        return None

    # Stem, tokenize and pad the single input text in one pipeline.
    stemmed = stem_text(text_str)
    padded = pad_sequences(tokenizer.texts_to_sequences([stemmed]), maxlen=max_sequence)

    # Score the (single-row) batch without progress output.
    scores = model.predict(padded, verbose=0)

    # Pick the class with the highest score for the first (only) row.
    best_index = np.argmax(scores, axis=-1)[0]
    best_score = float(scores[0][best_index])
    best_label = labels_legend_inverted[str(best_index)]

    return best_label, {best_label: best_score}




## Example usage

In [37]:
# Example usage: classify a few sample product names with the loaded artifacts.
example_messages = ["best drugs 5ml", "gun to kill your enemy", "pc gamer"]

for msg in example_messages:
    # Pad to the sequence length stored with the training data instead of
    # silently falling back to predict()'s hard-coded default.
    y_pred, prediction = predict(
        msg,
        max_sequence=max_sequence,
        tokenizer=tokenizer,
        model=model,
        labels_legend_inverted=labels_legend_inverted,
    )
    print(msg, '-------------->', y_pred, prediction)


best drugs 5ml --------------> No {'No': 0.9997128844261169}
gun to kill your enemy --------------> No {'No': 0.9370914101600647}
pc gamer --------------> Yes {'Yes': 0.9720265865325928}


## test the model

In [38]:
import pandas as pd
# Read the labelled products, drop rows whose label cell merely repeats the
# column header text, and keep only the first row for each product name.
data = (
    pd.read_csv('data.txt')
    .loc[lambda frame: frame["Is Accepted Policy"] != "Is Accepted Policy"]
    .drop_duplicates(subset='Name of Product', keep='first')
)

data

Unnamed: 0,Name of Product,Is Accepted Policy
0,Ultra-Thin Gaming Laptop with RGB Backlit Keyb...,Yes
1,Deluxe Espresso Machine with Integrated Milk F...,Yes
2,Advanced Running Shoes with Responsive Cushion...,Yes
3,Latest iPhone Pro Max with Triple-Camera Syste...,Yes
4,Next-Gen Gaming Console with 4K Graphics and E...,Yes
...,...,...
3085,LiBa Shower Curtain Liners,Yes
3086,Dremel rotary. pet. oscillating tools and more,Yes
3087,KURONO Stationary Exercise Bike for Home Worko...,Yes
3088,Bekind Apex 2-in-1 Hair Straightener Flat Iron...,Yes


In [39]:
from sklearn.model_selection import train_test_split

# Product names are the model inputs; the "Is Accepted Policy" column holds the labels.
X = data["Name of Product"].tolist()
y = data["Is Accepted Policy"].tolist()
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
# NOTE(review): whether the model was trained on the matching 67% is not visible here — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [40]:
from sklearn.metrics import classification_report

def convert_labels(label):
    """Map a policy label to a binary class.

    Args:
        label: The label to convert.

    Returns:
        ``1`` when ``label`` equals ``'Yes'``, ``0`` for any other value.
    """
    return int(label == 'Yes')

# Convert 'Is Accepted Policy' to binary labels for y_test
y_test_binary = [convert_labels(label) for label in y_test]

# Predict a label for every held-out product name and store it as 0/1.
y_pred_binary = []
for product_name in X_test:
    # Pad to the sequence length stored with the training data instead of
    # silently falling back to predict()'s hard-coded default.
    y_pred, _ = predict(
        product_name,
        max_sequence=max_sequence,
        tokenizer=tokenizer,
        model=model,
        labels_legend_inverted=labels_legend_inverted,
    )
    y_pred_binary.append(convert_labels(y_pred))

# Calculate and print classification report
print("Classification Report:")
print(classification_report(y_test_binary, y_pred_binary))


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.45      0.60       343
           1       0.69      0.96      0.80       430

    accuracy                           0.73       773
   macro avg       0.79      0.70      0.70       773
weighted avg       0.78      0.73      0.71       773

