# manipulation of data

## read and show data

In [130]:
import pandas as pd
data = pd.read_csv('data.txt')
data.head(10)

Unnamed: 0,Name of Product,Is Accepted Policy
0,Ultra-Thin Gaming Laptop with RGB Backlit Keyb...,Yes
1,Deluxe Espresso Machine with Integrated Milk F...,Yes
2,Advanced Running Shoes with Responsive Cushion...,Yes
3,Latest iPhone Pro Max with Triple-Camera Syste...,Yes
4,Next-Gen Gaming Console with 4K Graphics and E...,Yes
5,Captivating Romantic Novel Set in the Enchanti...,No
6,Realistic Toy Sniper Rifle for Outdoor Shootin...,No
7,Breathtaking Action Movie Marathon in 4K Ultra...,Yes
8,Organic Herbal Infusion Collection for Relaxat...,No
9,Smart Fitness Tracker with ECG and Blood Oxyge...,Yes


## describe data

In [131]:
data.describe()

Unnamed: 0,Name of Product,Is Accepted Policy
count,3150,3150
unique,2342,3
top,Dearjoyee sexy dolls for men silicon xxx-18-ww...,No
freq,78,1667


In [132]:
print("unique values in Is Accepted Policy",data["Is Accepted Policy"].unique())

unique values in Is Accepted Policy ['Yes' 'No' 'Is Accepted Policy']


In [133]:
print('the data with No in Is Accepted Policy',len(data[data["Is Accepted Policy"]=="No"]))
print('the data with Yes in Is Accepted Policy',len(data[data["Is Accepted Policy"]=="Yes"]))

the data with No in Is Accepted Policy 1667
the data with Yes in Is Accepted Policy 1474




## delete all data with Accepted Policy = "Is Accepted Policy"





In [134]:
data = data[data["Is Accepted Policy"]!="Is Accepted Policy"]

In [135]:
print("unique values in Is Accepted Policy",data["Is Accepted Policy"].unique())

unique values in Is Accepted Policy ['Yes' 'No']


## add new column describe the prodact policy state numericly

In [136]:
data.loc[:,"agree value"] = data['Is Accepted Policy'].apply(lambda x: 1 if str(x) == "Yes" else 0)

In [137]:
data.head(10)

Unnamed: 0,Name of Product,Is Accepted Policy,agree value
0,Ultra-Thin Gaming Laptop with RGB Backlit Keyb...,Yes,1
1,Deluxe Espresso Machine with Integrated Milk F...,Yes,1
2,Advanced Running Shoes with Responsive Cushion...,Yes,1
3,Latest iPhone Pro Max with Triple-Camera Syste...,Yes,1
4,Next-Gen Gaming Console with 4K Graphics and E...,Yes,1
5,Captivating Romantic Novel Set in the Enchanti...,No,0
6,Realistic Toy Sniper Rifle for Outdoor Shootin...,No,0
7,Breathtaking Action Movie Marathon in 4K Ultra...,Yes,1
8,Organic Herbal Infusion Collection for Relaxat...,No,0
9,Smart Fitness Tracker with ECG and Blood Oxyge...,Yes,1


## show deplicated values

In [138]:
print("all titles of products",len(data['Name of Product']))

all titles of products 3141


In [139]:
print('unique titles of product',len(data['Name of Product'].unique()))

unique titles of product 2341


In [140]:
print("i have" ,len(data['Name of Product']) - len(data['Name of Product'].unique()), 'repeted value')

i have 800 repeted value


In [141]:
print(list(data['Name of Product'])[355])
print(list(data['Name of Product'])[27])

Comprehensive Home Gym Equipment Set for Total Body Fitness
Comprehensive Home Gym Equipment Set for Total Body Fitness


## delete all repeted values

In [142]:
data_no_duplicates = data.drop_duplicates(subset='Name of Product', keep='first')
# Print the length of the new DataFrame to confirm the removal of duplicates
print("Length after removing duplicates:", len(data_no_duplicates))

Length after removing duplicates: 2341


In [143]:
data = data_no_duplicates

In [144]:
print('the data with No in Is Accepted Policy',len(data[data["Is Accepted Policy"]=="No"]))
print('the data with Yes in Is Accepted Policy',len(data[data["Is Accepted Policy"]=="Yes"]))

the data with No in Is Accepted Policy 1042
the data with Yes in Is Accepted Policy 1299


# Text Preprocessing

## create stemmer function

In [145]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Download the punkt tokenizer if you haven't already

def stem_text(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    stemmed_text = ' '.join(stemmed_tokens)
    return stemmed_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## stemme all texts

In [146]:
data['StemmedText'] = data['Name of Product'].apply(stem_text)
data

Unnamed: 0,Name of Product,Is Accepted Policy,agree value,StemmedText
0,Ultra-Thin Gaming Laptop with RGB Backlit Keyb...,Yes,1,ultra-thin game laptop with rgb backlit keyboa...
1,Deluxe Espresso Machine with Integrated Milk F...,Yes,1,delux espresso machin with integr milk frother...
2,Advanced Running Shoes with Responsive Cushion...,Yes,1,advanc run shoe with respons cushion for optim...
3,Latest iPhone Pro Max with Triple-Camera Syste...,Yes,1,latest iphon pro max with triple-camera system...
4,Next-Gen Gaming Console with 4K Graphics and E...,Yes,1,next-gen game consol with 4k graphic and exclu...
...,...,...,...,...
3085,LiBa Shower Curtain Liners,Yes,1,liba shower curtain liner
3086,Dremel rotary. pet. oscillating tools and more,Yes,1,dremel rotari . pet . oscil tool and more
3087,KURONO Stationary Exercise Bike for Home Worko...,Yes,1,kurono stationari exercis bike for home workou...
3088,Bekind Apex 2-in-1 Hair Straightener Flat Iron...,Yes,1,bekind apex 2-in-1 hair straighten flat iron ....


## Tokenize Texts

In [150]:
from tensorflow.keras.preprocessing.text import Tokenizer

texts = data['StemmedText'].tolist()
labels = data['agree value'].tolist()
MAX_NUM_WORDS=280

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6101 unique tokens.


## select the fixed-length input sequences

In [151]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 30

X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
X

array([[  0,   0,   0, ...,   4,  26, 235],
       [  0,   0,   0, ...,  83,   2,   4],
       [  0,   0,   0, ..., 159,   2,   1],
       ...,
       [  0,   0,   0, ...,  39, 263,   1],
       [  0,   0,   0, ...,   1, 177,   9],
       [  0,   0,   0, ..., 105,   4,  98]])

## categorize the output

In [152]:
import numpy as np
from tensorflow.keras.utils import to_categorical

y = to_categorical(np.asarray(labels))

In [153]:
y[0:8]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

## split data to train & test

In [154]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create our LSTM Model

In [155]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Model, Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [156]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, embed_dim, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 128)           35840     
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 30, 128)          0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 196)               254800    
                                                                 
 dense_2 (Dense)             (None, 2)                 394       
                                                                 
Total params: 291,034
Trainable params: 291,034
Non-trainable params: 0
_________________________________________________________________
None


In [157]:
batch_size = 32
epochs = 5
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch_size, verbose=1, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x270aaa7f730>

## Transform Extracted Dataset

In [158]:
labels_legend = {'No': 0, 'Yes': 1}
labels_legend_inverted = {f"{v}":k for k,v in labels_legend.items()}

training_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'max_words':MAX_SEQUENCE_LENGTH,# MAX_NUM_WORDS,
    'max_sequence': MAX_SEQUENCE_LENGTH,
    'legend': labels_legend,
    'labels_legend_inverted': labels_legend_inverted,
    "tokenizer": tokenizer,
}
X_test = training_data['X_test']
X_train = training_data['X_train']
y_test = training_data['y_test']
y_train = training_data['y_train']
labels_legend_inverted = training_data['labels_legend_inverted']
legend = training_data['legend']
max_sequence = training_data['max_sequence']
max_words = training_data['max_words']
tokenizer = training_data['tokenizer']

## Predict new data

In [159]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict(text_str, max_sequence=280, tokenizer=None, model=None, labels_legend_inverted=None):
    if not tokenizer or not model or not labels_legend_inverted:
        return None
    
    #stemming the input text
    text_str = stem_text(text_str)
    
    # Tokenize the input text
    sequences = tokenizer.texts_to_sequences([text_str])
    
    # Pad the sequence
    x_input = pad_sequences(sequences, maxlen=max_sequence)
    
    # Predict using the model
    y_output = model.predict(x_input)
    
    # Assuming you want to get the label with the highest probability
    top_y_index = np.argmax(y_output, axis=-1)[0]
    preds = y_output[0][top_y_index]
    
    labeled_preds = {labels_legend_inverted[str(top_y_index)]: float(preds)}
    
    return labels_legend_inverted[str(top_y_index)],labeled_preds




In [166]:
# Example usage
msg="best drugs 5ml"
y_pred,prediction = predict(msg ,max_sequence=max_sequence, tokenizer=tokenizer, model=model, labels_legend_inverted=labels_legend_inverted)
print(msg,'-------------->',y_pred,prediction)


msg="gun to kill your enemy"
y_pred,prediction = predict(msg ,max_sequence=max_sequence, tokenizer=tokenizer, model=model, labels_legend_inverted=labels_legend_inverted)
print(msg,'-------------->',y_pred,prediction)

msg="pc gamer"
y_pred,prediction = predict(msg ,max_sequence=max_sequence, tokenizer=tokenizer, model=model, labels_legend_inverted=labels_legend_inverted)
print(msg,'-------------->',y_pred,prediction)


best drugs 5ml --------------> No {'No': 0.974354088306427}
gun to kill your enemy --------------> No {'No': 0.6810073256492615}
pc gamer --------------> Yes {'Yes': 0.8236507177352905}


## test the model

In [167]:
X = data["Name of Product"].tolist()
y = data["Is Accepted Policy"].tolist()

In [168]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [104]:
true = 0
false = 0
for i in range(len(X_test)):
    y_pred,prediction = predict(X_test[i], tokenizer=tokenizer, model=model, labels_legend_inverted=labels_legend_inverted)

    if y_test[i] == y_pred:
        true += 1
    else:
        false += 1













In [105]:
print("true : ",true)
print("false : ",false)

true :  690
false :  83


In [106]:
print("True :",f'{100*true/(true+false):.2f}%')
print("False :",f'{100*false/(true+false):.2f}%')

True : 89.26%
False : 10.74%
