In [56]:
import pandas as pd

## The training data for 'Is the Ad a Good Match to the Search Term' is given. It consists of 15 training instances. That means there are 15 rows. This dataset is small for a machine learning task. However, another dataset will be added during the qualification.

In [57]:
df= pd.read_csv('taac_assistant_taac_7.csv')

In [58]:
df.head()

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,Other
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,Other
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,Other
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,Other
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,Other


In [59]:
#Converting the Relevance to numerical values to enable analysis
#Convert Relevance to numerical
mapping = {
    'Good' : 1,
    'Other' : 0,
}
df['Relevance'] = df['Relevance'].replace(mapping)
df.head(10)

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,0
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,0
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,0
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,0
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,0
5,6,nike,Shop Womens Shops: Amazon - Amazon.com Officia...,www.amazon.com/apparel/womens-shops,1
6,7,cfl fixture,Flashlight Accessories,www.Grainfer.com/Flashlights,0
7,8,nationwide pet insurance,2021's Top 10 Pet Insurance - Buyer's Guide (N...,buyersguide.org/Pet-Insurance,1
8,9,nike,Nike Official Site - Just Do It - Shop The Lat...,www.nike.com,1
9,10,used cars,CarMax Used Cars - Visit carmax.com - Large Na...,www.carmax.com/cars,1


In [60]:
#df.isna().value_counts()
# Check for missing values
df.isnull().sum().any()

False

In [61]:
# Identify rows with missing values
rows_with_missing_values = df[df.isnull().any(axis=1)]

# Print the rows with missing values
print("Rows with missing values:")
print(rows_with_missing_values)

Rows with missing values:
Empty DataFrame
Columns: [TaskId, User_Search_Term, Ad, Website, Relevance]
Index: []


In [62]:
## Get the Independent Features

X=df.drop(columns =['TaskId', 'Relevance'])

In [63]:
## Get the Dependent features
y=df['Relevance']

In [64]:
y.value_counts()

1    581
0    395
Name: Relevance, dtype: int64

In [65]:
#It can be seen that the data is generally balanced

In [66]:
X.shape

(976, 3)

In [67]:
y.shape

(976,)

In [68]:
pip install --upgrade tensorflow



In [69]:
import tensorflow as tf

In [70]:
tf.__version__

'2.15.0'

In [71]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [72]:
### Vocabulary size
voc_size=1000

Onehot Representation

In [73]:
messages=X.copy()

In [74]:
messages['User_Search_Term'][1]

'peloton plano tx'

In [75]:
messages['Ad'][1]

'Studio Cycle Comparison - Find The Best Exercise Bike - NordicTrack Official Site'

In [76]:
messages.reset_index(inplace=True)

In [77]:
import nltk
import re
from nltk.corpus import stopwords

In [78]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [79]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['User_Search_Term'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [80]:
corpus

['wwww ncquickpass com',
 'peloton plano tx',
 'antelop canyon',
 'get vaccin covid',
 'ah com accountlogin',
 'nike',
 'cfl fixtur',
 'nationwid pet insur',
 'nike',
 'use car',
 'car rental lubbock tx',
 'augusta tech adn',
 'hampton inn guntersvil al',
 'white strip',
 'florist laguna beach',
 'rachel ray nylon spoon',
 'dental mouthwash fix teeth',
 'june florist brooklyn',
 'buy marbl floor tile',
 'safeway com',
 'enterpris rent car',
 'flight atlanta fort sill ok',
 'virgin',
 'adob clean',
 'dialysi clinic hire near',
 'best place sell old playboy magazin',
 'comput viru',
 'hepat b',
 'social secur administr',
 'goldman sach platinum',
 'homegood com offici websit',
 'madam pompadour wallpap mural',
 'southwest airlin',
 'cowboy schedul',
 'realmushroom',
 'abbotsford hear center',
 'amazon',
 'aliana health',
 'bilater maxilari sinu diseas',
 'first time homebuy',
 'norton vpn',
 'world educ servic',
 'new ford f regular cab sale near',
 'creat weekli payrol calendar',
 'arch

In [81]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[928, 335, 831],
 [384, 253, 203],
 [919, 737],
 [476, 193, 3],
 [356, 831, 186],
 [762],
 [204, 480],
 [310, 374, 281],
 [762],
 [386, 99],
 [99, 673, 193, 203],
 [392, 369, 913],
 [611, 344, 920, 934],
 [211, 502],
 [622, 46, 475],
 [522, 767, 111, 380],
 [409, 136, 232, 848],
 [619, 622, 352],
 [275, 381, 248, 994],
 [547, 831],
 [569, 432, 99],
 [584, 608, 632, 652, 919],
 [860],
 [805, 699],
 [917, 540, 735, 561],
 [379, 847, 665, 498, 577, 49],
 [485, 328],
 [843, 45],
 [825, 771, 364],
 [163, 979, 280],
 [377, 831, 708, 5],
 [15, 640, 835, 98],
 [924, 164],
 [578, 749],
 [339],
 [929, 881, 864],
 [724],
 [557, 207],
 [222, 781, 732, 558],
 [277, 455, 945],
 [787, 243],
 [510, 210, 471],
 [303, 532, 502, 209, 269, 350, 561],
 [500, 939, 342, 803],
 [522, 752],
 [367, 993, 418, 471, 275],
 [612, 846, 994],
 [960, 947, 841],
 [21, 482, 185, 498],
 [86, 728, 105, 324],
 [902, 89, 280, 626],
 [576, 492, 696, 683, 174],
 [412, 989, 261, 794, 40],
 [793, 699, 931, 248],
 [847],
 [224,

Embedding Representation

In [82]:
sent_length=10
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[  0   0   0 ... 928 335 831]
 [  0   0   0 ... 384 253 203]
 [  0   0   0 ...   0 919 737]
 ...
 [  0   0   0 ... 231 294 293]
 [  0   0   0 ...   0 530 164]
 [  0   0   0 ...   0 131 654]]


In [83]:
embedded_docs[0]

array([  0,   0,   0,   0,   0,   0,   0, 928, 335, 831], dtype=int32)

In [84]:
## Creating model
embedding_vector_features=10
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(5))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 10)            10000     
                                                                 
 lstm_2 (LSTM)               (None, 5)                 320       
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 10326 (40.34 KB)
Trainable params: 10326 (40.34 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [85]:
## Creating model
embedding_vector_features=10
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(5)))
model1.add(Dropout(0.7))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 10, 10)            10000     
                                                                 
 bidirectional_1 (Bidirecti  (None, 10)                640       
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 10651 (41.61 KB)
Trainable params: 10651 (41.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [86]:
len(embedded_docs),y.shape

(976, (976,))

In [87]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [88]:
X_final.shape,y_final.shape

((976, 10), (976,))

In [89]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.30, random_state=42)

Model Training

In [90]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e410a614df0>

Performance Metrics And Accuracy

In [91]:
y_pred1 = np.argmax(model1.predict(X_test),axis=1)



In [92]:
from sklearn.metrics import f1_score
best_threshold = 0.5
best_f1_score = 0

for threshold in np.arange(0.1, 1, 0.05):
    y_pred1 = (model1.predict(X_test) > threshold).astype(int)
    f1 = f1_score(y_test, y_pred1)
    if f1 > best_f1_score:
        best_threshold = threshold
        best_f1_score = f1

print("Best Threshold:", best_threshold)


Best Threshold: 0.5000000000000001


In [93]:
threshold = 0.5  # Adjust the threshold as needed
y_pred1 = (model1.predict(X_test) > threshold).astype(int)



In [94]:
y_pred1

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
    

In [95]:
from sklearn.metrics import confusion_matrix

In [96]:
confusion_matrix(y_test,y_pred1)

array([[ 18, 106],
       [  3, 166]])

In [97]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)

0.6279863481228669

In [98]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.86      0.15      0.25       124
           1       0.61      0.98      0.75       169

    accuracy                           0.63       293
   macro avg       0.73      0.56      0.50       293
weighted avg       0.71      0.63      0.54       293



In [99]:
def predict_rel(predict_relevance):
    concatenated_text = ' '.join(predict_relevance)
    onehot_reprr = [one_hot(concatenated_text.lower(), voc_size)]
    padded = pad_sequences(onehot_reprr, maxlen=sent_length, padding='pre')
    return model1.predict(padded)

predict_relevance = ["nike", "Shop Womens Shops: Amazon - Amazon.com Officia...", "www.amazon.com/apparel/womens-shops"]
predicted_score = predict_rel(predict_relevance)




In [100]:
predicted_score

array([[0.41775435]], dtype=float32)

In [101]:
pip install streamlit



In [102]:
import streamlit as st

Saving The Trained Model

In [103]:
# Assuming 'model' is your trained Keras model
model.save('trained_model_1.h5')


  saving_api.save_model(


In [104]:
pip install keras




In [105]:
from keras.models import load_model

# Provide the correct file path to your saved model
model_filepath = 'trained_model_1.h5'

# Load the saved model
loaded_model = load_model(model_filepath)


In [106]:
# Load the saved model
#with open('trained_model.h5', 'rb') as model_file:
    #loaded_model = load_model(model_file)

# Define the text data you want to predict
predict_relevance = ['jaguar usa', 'Visit Jaguar Dealership - Jaguar Dealer - West Houston Jaguar', 'www.jaguarwesthouston.com']

# Call the predict_rel function using the loaded model
predicted_score = predict_rel(predict_relevance)
print(predicted_score)


[[0.646376]]
