In [1]:
import pandas as pd

## The training data for 'Is the Ad a Good Match to the Search Term' is given. It consists of 15 training instances. That means there are 15 rows. This dataset is small for a machine learning task. However, another dataset will be added during the qualification.

In [13]:
df= pd.read_csv('taac_assistant_taac_1.csv')

In [None]:
df.head()

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,Other
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,Other
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,Other
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,Other
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,Other


In [14]:
#Converting the Relevance to numerical values to enable analysis
#Convert Relevance to numerical
mapping = {
    'Good' : 1,
    'Other' : 0,
}
df['Relevance'] = df['Relevance'].replace(mapping)
df.head(10)

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,0
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,0
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,0
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,0
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,0
5,6,nike,Shop Womens Shops: Amazon - Amazon.com Officia...,www.amazon.com/apparel/womens-shops,1
6,7,cfl fixture,Flashlight Accessories,www.Grainfer.com/Flashlights,0
7,8,nationwide pet insurance,2021's Top 10 Pet Insurance - Buyer's Guide (N...,buyersguide.org/Pet-Insurance,1
8,9,nike,Nike Official Site - Just Do It - Shop The Lat...,www.nike.com,1
9,10,used cars,CarMax Used Cars - Visit carmax.com - Large Na...,www.carmax.com/cars,1


In [15]:
#df.isna().value_counts()
# Check for missing values
df.isnull().sum().any()

False

In [16]:
# Identify rows with missing values
rows_with_missing_values = df[df.isnull().any(axis=1)]

# Print the rows with missing values
print("Rows with missing values:")
print(rows_with_missing_values)

Rows with missing values:
Empty DataFrame
Columns: [TaskId, User_Search_Term, Ad, Website, Relevance]
Index: []


In [17]:
## Get the Independent Features

X=df.drop(columns =['TaskId', 'Relevance'])

In [18]:
## Get the Dependent features
y=df['Relevance']

In [19]:
y.value_counts()

1    74
0    54
Name: Relevance, dtype: int64

In [20]:
#It can be seen that the data is generally balanced

In [21]:
X.shape

(128, 3)

In [22]:
y.shape

(128,)

In [23]:
import tensorflow as tf

In [24]:
tf.__version__

'2.13.0'

In [25]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [26]:
### Vocabulary size
voc_size=10

Onehot Representation

In [27]:
messages=X.copy()

In [28]:
messages['User_Search_Term'][1]

'peloton plano tx'

In [29]:
messages['Ad'][1]

'Studio Cycle Comparison - Find The Best Exercise Bike - NordicTrack Official Site'

In [30]:
messages.reset_index(inplace=True)

In [31]:
import nltk
import re
from nltk.corpus import stopwords

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [33]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['User_Search_Term'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


In [34]:
corpus

['wwww ncquickpass com',
 'peloton plano tx',
 'antelop canyon',
 'get vaccin covid',
 'ah com accountlogin',
 'nike',
 'cfl fixtur',
 'nationwid pet insur',
 'nike',
 'use car',
 'car rental lubbock tx',
 'augusta tech adn',
 'hampton inn guntersvil al',
 'white strip',
 'florist laguna beach',
 'rachel ray nylon spoon',
 'dental mouthwash fix teeth',
 'june florist brooklyn',
 'buy marbl floor tile',
 'safeway com',
 'enterpris rent car',
 'flight atlanta fort sill ok',
 'virgin',
 'adob clean',
 'dialysi clinic hire near',
 'best place sell old playboy magazin',
 'comput viru',
 'hepat b',
 'social secur administr',
 'goldman sach platinum',
 'homegood com offici websit',
 'madam pompadour wallpap mural',
 'southwest airlin',
 'cowboy schedul',
 'realmushroom',
 'abbotsford hear center',
 'amazon',
 'aliana health',
 'bilater maxilari sinu diseas',
 'first time homebuy',
 'norton vpn',
 'world educ servic',
 'new ford f regular cab sale near',
 'creat weekli payrol calendar',
 'arch

In [35]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[1, 2, 8],
 [7, 1, 4],
 [8, 3],
 [2, 3, 5],
 [4, 8, 8],
 [6],
 [7, 8],
 [5, 4, 5],
 [6],
 [9, 7],
 [7, 4, 7, 4],
 [3, 8, 4],
 [4, 4, 9, 7],
 [2, 2],
 [3, 9, 9],
 [1, 6, 3, 2],
 [1, 9, 2, 3],
 [3, 3, 4],
 [9, 6, 8, 7],
 [1, 8],
 [7, 6, 7],
 [7, 5, 2, 6, 8],
 [6],
 [2, 6],
 [1, 4, 3, 6],
 [7, 7, 5, 9, 2, 9],
 [3, 1],
 [4, 7],
 [5, 4, 7],
 [3, 5, 3],
 [7, 8, 1, 2],
 [4, 1, 1, 3],
 [2, 6],
 [5, 6],
 [4],
 [2, 5, 3],
 [5],
 [6, 1],
 [9, 7, 7, 5],
 [5, 2, 7],
 [3, 1],
 [4, 3, 6],
 [5, 7, 4, 9, 6, 8, 6],
 [4, 2, 3, 3],
 [4, 7],
 [9, 8, 2, 6, 8],
 [8, 5, 7],
 [3, 7, 7],
 [4, 4, 5, 1],
 [6, 2, 9, 5],
 [2, 7, 1, 7],
 [7, 5, 4, 1, 9],
 [3, 3, 8, 6, 4],
 [1, 6, 8, 8],
 [7],
 [5, 3, 7],
 [5, 7, 7, 2],
 [7, 1, 4],
 [7, 4, 8, 3],
 [7, 8, 3],
 [5, 5],
 [9, 5],
 [5, 6, 4, 9, 1],
 [7, 5, 5, 5, 9, 2],
 [3, 8, 8, 1],
 [6, 7, 4],
 [7, 9, 6],
 [3, 5, 4],
 [9, 3],
 [7, 3],
 [3, 7, 6, 1],
 [7, 4, 1],
 [2, 3, 9, 6, 8],
 [8, 9, 3],
 [9, 5, 4],
 [5, 3, 7, 8, 9],
 [8, 1, 7, 7],
 [5, 9],
 [7, 6, 1, 3],
 [2, 7],
 

Embedding Representation

In [36]:
sent_length=10
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[0 0 0 ... 1 2 8]
 [0 0 0 ... 7 1 4]
 [0 0 0 ... 0 8 3]
 ...
 [0 0 0 ... 0 6 5]
 [0 0 0 ... 2 1 8]
 [0 0 0 ... 7 7 6]]


In [37]:
embedded_docs[0]

array([0, 0, 0, 0, 0, 0, 0, 1, 2, 8], dtype=int32)

In [38]:
## Creating model
embedding_vector_features=10
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(25))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 10)            100       
                                                                 
 lstm (LSTM)                 (None, 25)                3600      
                                                                 
 dense (Dense)               (None, 1)                 26        
                                                                 
Total params: 3726 (14.55 KB)
Trainable params: 3726 (14.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [39]:
## Creating model
embedding_vector_features=10
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(25)))
model1.add(Dropout(0.7))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 10, 10)            100       
                                                                 
 bidirectional (Bidirection  (None, 50)                7200      
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 7351 (28.71 KB)
Trainable params: 7351 (28.71 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [40]:
len(embedded_docs),y.shape

(128, (128,))

In [41]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [42]:
X_final.shape,y_final.shape

((128, 10), (128,))

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

Model Training

In [44]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x781cbc9584f0>

Performance Metrics And Accuracy

In [45]:
y_pred1 = np.argmax(model1.predict(X_test),axis=1)



In [46]:
y_pred1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [47]:
from sklearn.metrics import confusion_matrix

In [48]:
confusion_matrix(y_test,y_pred1)

array([[15,  0],
       [28,  0]])

In [49]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)

0.3488372093023256

In [50]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.35      1.00      0.52        15
           1       0.00      0.00      0.00        28

    accuracy                           0.35        43
   macro avg       0.17      0.50      0.26        43
weighted avg       0.12      0.35      0.18        43



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
def predict_rel(predict_relevance):
    concatenated_text = ' '.join(predict_relevance)
    onehot_reprr = [one_hot(concatenated_text.lower(), voc_size)]
    padded = pad_sequences(onehot_reprr, maxlen=sent_length, padding='pre')
    return model1.predict(padded)

predict_relevance = ["nike", "Shop Womens Shops: Amazon - Amazon.com Officia...", "www.amazon.com/apparel/womens-shops"]
predicted_score = predict_rel(predict_relevance)




In [52]:
predicted_score

array([[0.52938217]], dtype=float32)