In [42]:
import pandas as pd

## The training data for 'Is the Ad a Good Match to the Search Term' is given. It consists of 15 training instances. That means there are 15 rows. This dataset is small for a machine learning task. However, another dataset will be added during the qualification.

In [43]:
df= pd.read_csv('taac_assistant_taac.csv')

In [44]:
df.head()

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,Other
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,Other
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,Other
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,Other
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,Other


In [87]:
#Converting the Relevance to numerical values to enable analysis
#Convert Relevance to numerical
mapping = {
    'Good' : 1,
    'Other' : 0,
}
df['Relevance'] = df['Relevance'].replace(mapping)
df.head(10)

Unnamed: 0,TaskId,User_Search_Term,Ad,Website,Relevance
0,1,wwww ncquickpass com,Nc Quick Pass - Pay Your Bill Online,www.doxo.com/pay/nc-quick-pass,0
1,2,peloton plano tx,Studio Cycle Comparison - Find The Best Exerci...,www.nordictrack.com/Studio-Cycles/S22i,0
2,3,antelope canyon,Hotels near Antelope Canyon - 100% Real Custom...,www.booking.com/Antelope-Canyon/Hotels,0
3,4,get vaccine after covid,Janssen COVID-19 Vaccine - Authorized For Emer...,www.janssencovid19vaccine.com,0
4,5,ahs.com/my-accountlogin,Find First american home warranty login - Chec...,www.searchandshopping.org/Your Search/Results,0
5,6,nike,Shop Womens Shops: Amazon - Amazon.com Officia...,www.amazon.com/apparel/womens-shops,1
6,7,cfl fixture,Flashlight Accessories,www.Grainfer.com/Flashlights,0
7,8,nationwide pet insurance,2021's Top 10 Pet Insurance - Buyer's Guide (N...,buyersguide.org/Pet-Insurance,1
8,9,nike,Nike Official Site - Just Do It - Shop The Lat...,www.nike.com,1
9,10,used cars,CarMax Used Cars - Visit carmax.com - Large Na...,www.carmax.com/cars,1


In [50]:
## Get the Independent Features

X=df.drop(columns =['TaskId', 'Relevance'])

In [51]:
## Get the Dependent features
y=df['Relevance']

In [52]:
y.value_counts()

1    8
0    7
Name: Relevance, dtype: int64

In [53]:
#It can be seen that the data is generally balanced

In [54]:
X.shape

(15, 3)

In [55]:
y.shape

(15,)

In [56]:
import tensorflow as tf

In [57]:
tf.__version__

'2.12.0'

In [58]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [59]:
### Vocabulary size
voc_size=5000

Onehot Representation

In [60]:
messages=X.copy()

In [61]:
messages['User_Search_Term'][1]

'peloton plano tx'

In [62]:
messages['Ad'][1]

'Studio Cycle Comparison - Find The Best Exercise Bike - NordicTrack Official Site'

In [63]:
messages.reset_index(inplace=True)

In [64]:
import nltk
import re
from nltk.corpus import stopwords

In [65]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    print(i)
    review = re.sub('[^a-zA-Z]', ' ', messages['User_Search_Term'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [67]:
corpus

['wwww ncquickpass com',
 'peloton plano tx',
 'antelop canyon',
 'get vaccin covid',
 'ah com accountlogin',
 'nike',
 'cfl fixtur',
 'nationwid pet insur',
 'nike',
 'use car',
 'car rental lubbock tx',
 'augusta tech adn',
 'hampton inn guntersvil al',
 'white strip',
 'florist laguna beach']

In [68]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[674, 3828, 2488],
 [3379, 3396, 3488],
 [2545, 3248],
 [4730, 3218, 3767],
 [4947, 2488, 780],
 [3093],
 [1528, 594],
 [280, 3432, 930],
 [3093],
 [1089, 553],
 [553, 2134, 2526, 3488],
 [4032, 3301, 4290],
 [3969, 1763, 4425, 4972],
 [4974, 506],
 [845, 2395, 1406]]

Embedding Representation

In [69]:
sent_length=10
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0    0    0    0    0  674 3828 2488]
 [   0    0    0    0    0    0    0 3379 3396 3488]
 [   0    0    0    0    0    0    0    0 2545 3248]
 [   0    0    0    0    0    0    0 4730 3218 3767]
 [   0    0    0    0    0    0    0 4947 2488  780]
 [   0    0    0    0    0    0    0    0    0 3093]
 [   0    0    0    0    0    0    0    0 1528  594]
 [   0    0    0    0    0    0    0  280 3432  930]
 [   0    0    0    0    0    0    0    0    0 3093]
 [   0    0    0    0    0    0    0    0 1089  553]
 [   0    0    0    0    0    0  553 2134 2526 3488]
 [   0    0    0    0    0    0    0 4032 3301 4290]
 [   0    0    0    0    0    0 3969 1763 4425 4972]
 [   0    0    0    0    0    0    0    0 4974  506]
 [   0    0    0    0    0    0    0  845 2395 1406]]


In [70]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,  674, 3828, 2488],
      dtype=int32)

In [71]:
## Creating model
embedding_vector_features=10
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(25))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 10)            50000     
                                                                 
 lstm_2 (LSTM)               (None, 25)                3600      
                                                                 
 dense_2 (Dense)             (None, 1)                 26        
                                                                 
Total params: 53,626
Trainable params: 53,626
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
## Creating model
embedding_vector_features=10
model1=Sequential()
model1.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model1.add(Bidirectional(LSTM(25)))
model1.add(Dropout(0.7))
model1.add(Dense(1,activation='sigmoid'))
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model1.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 10, 10)            50000     
                                                                 
 bidirectional_1 (Bidirectio  (None, 50)               7200      
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 57,251
Trainable params: 57,251
Non-trainable params: 0
_________________________________________________________________
None


In [73]:
len(embedded_docs),y.shape

(15, (15,))

In [74]:
import numpy as np
X_final=np.array(embedded_docs)
y_final=np.array(y)

In [75]:
X_final.shape,y_final.shape

((15, 10), (15,))

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

Model Training

In [77]:
### Finally Training
model1.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=8,batch_size=16)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7bcaa223cbe0>

Performance Metrics And Accuracy

In [78]:
y_pred1 = np.argmax(model1.predict(X_test),axis=1)



In [79]:
y_pred1

array([0, 0, 0, 0, 0])

In [80]:
from sklearn.metrics import confusion_matrix

In [82]:
confusion_matrix(y_test,y_pred1)

array([[2, 0],
       [3, 0]])

In [83]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred1)

0.4

In [84]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.40      1.00      0.57         2
           1       0.00      0.00      0.00         3

    accuracy                           0.40         5
   macro avg       0.20      0.50      0.29         5
weighted avg       0.16      0.40      0.23         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
def predict_rel(predict_relevance):
    concatenated_text = ' '.join(predict_relevance)
    onehot_reprr = [one_hot(concatenated_text.lower(), voc_size)]
    padded = pad_sequences(onehot_reprr, maxlen=sent_length, padding='pre')
    return model1.predict(padded)

predict_relevance = ["nike", "Shop Womens Shops: Amazon - Amazon.com Officia...", "www.amazon.com/apparel/womens-shops"]
predicted_score = predict_rel(predict_relevance)




In [91]:
predicted_score

array([[0.5039908]], dtype=float32)