In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import json
import gc
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from tensorflow.keras.utils import Sequence
import tensorflow as tf
from tensorflow.keras import callbacks
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input , Dense , Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
def read_json_lines(path, n_lines=None):
    """Lazily read a JSON Lines file, yielding one decoded record per line.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.

    Args:
        path: Path of the json lines file to read.
        n_lines: Maximum number of physical lines to consume from the start
            of the file. ``None`` (the default) reads the whole file.

    Yields:
        The object decoded from each non-blank line (a dictionary when the
        line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            # Blank lines (e.g. a stray newline at the end) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)


In [None]:

aggregated_search_data_path = '/content/drive/MyDrive/torob/output_data/aggregated_search_data.jsonl'
preprocessed_products_path = '/content/drive/MyDrive/torob/output_data/preprocessed_products.jsonl'
preprocessed_test_queries_path = '/content/drive/MyDrive/torob/output_data/preprocessed_test_queries.jsonl'

In [None]:
aggregated_searches_df = pd.DataFrame(read_json_lines(aggregated_search_data_path, n_lines=None))

In [None]:
products_data_df = pd.DataFrame(read_json_lines(preprocessed_products_path))

In [None]:
# Map every product id to its positional row index in products_data_df.
products_id_to_idx = {
    product_id: row_idx
    for row_idx, product_id in enumerate(products_data_df['id'])
}

In [None]:
products_data_df.head(2)

Unnamed: 0,id,title_normalized
0,1867826,میکروسکوپ مدل cgl 44121 سلسترون اپتیکی
1,419611,gt مدل mustang فورد مایستو ماشین موستانگ ford ...


In [None]:
def getDatasetSize(aggregated_searches_df , n_candidates = None):
  """Count the (query, candidate) pairs contained in the aggregated searches.

  Args:
    aggregated_searches_df: DataFrame with a ``results`` column holding, per
      row, the sequence of candidate product ids shown for that search.
    n_candidates: If given, only the first ``n_candidates`` results of each
      search are considered. ``None`` (the default) considers all of them.

  Returns:
    The total number of considered candidates that are not ``None``.
  """
  total = 0
  for agg_search in aggregated_searches_df.itertuples(index=False):
    results = agg_search.results
    if n_candidates is not None:
      results = results[:n_candidates]
    # Only the candidates matter for the count; click data is not needed here.
    total += sum(1 for candidate_product_id in results if candidate_product_id is not None)
  return total

In [None]:
class DualChannelDataGenerator(Sequence):
    """Keras ``Sequence`` serving (query, product) pairs with click-based labels.

    Every non-``None`` entry of a search's ``results`` becomes one sample: the
    token row of the query, the token row of the candidate product, and the
    label ``log2(click_count + 1)`` (0 clicks when the candidate does not
    appear in the search's ``clicks``).

    The flat sample index is built once in ``__init__`` so that ``__getitem__``
    depends only on ``idx``: batches can be requested in any order and are
    identical on every epoch.
    """

    def __init__(self, dataset_size , query, doc , aggregated_searches_df , vectorsize , batch_size ,
                 products_id_to_idx = None , n_candidates = None):
        """Index all training pairs.

        Args:
            dataset_size: Accepted for backward compatibility. The number of
                samples is recomputed from the data and stored in
                ``self.dataset_size``.
            query: 2-D array-like of token rows, one row per row of
                ``aggregated_searches_df`` (same order).
            doc: 2-D array-like of token rows, indexed by the values of
                ``products_id_to_idx``.
            aggregated_searches_df: DataFrame with ``results``, ``clicks`` and
                ``clicks_count`` columns.
            vectorsize: Width of a token row; stored as ``self.vectorsize``.
            batch_size: Number of samples per batch (positive).
            products_id_to_idx: Mapping from product id to row index in
                ``doc``. When omitted, the module-level ``products_id_to_idx``
                is used.
            n_candidates: If given, only the first ``n_candidates`` results of
                every search are used.

        Raises:
            ValueError: If ``batch_size`` is not positive or no product
                mapping is available.
            KeyError: If a candidate id is missing from the product mapping.
        """
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if products_id_to_idx is None:
            # Earlier versions read this mapping from the notebook namespace;
            # keep that as the fallback so existing constructor calls work.
            products_id_to_idx = globals().get('products_id_to_idx')
            if products_id_to_idx is None:
                raise ValueError("products_id_to_idx was not given and is not defined at module level")

        self.query = np.asarray(query)
        self.doc = np.asarray(doc)
        self.batch_size = int(batch_size)
        self.aggregated_searches_df = aggregated_searches_df
        self.vectorsize = vectorsize
        self.n_candidates = n_candidates
        self.products_id_to_idx = products_id_to_idx

        # One small array per search keeps memory far below three Python lists
        # holding one boxed number per sample.
        query_chunks = []
        product_chunks = []
        click_chunks = []
        for q_idx, agg_search in enumerate(aggregated_searches_df.itertuples(index=False)):
            results = agg_search.results
            if n_candidates is not None:
                results = results[:n_candidates]
            candidates = [p_id for p_id in results if p_id is not None]
            if not candidates:
                continue
            clicks = dict(zip(agg_search.clicks, agg_search.clicks_count))
            # q_idx is the positional row of the search, which is also the row
            # of its query in `query`.
            query_chunks.append(np.full(len(candidates), q_idx, dtype=np.int64))
            product_chunks.append(np.array([products_id_to_idx[p_id] for p_id in candidates], dtype=np.int64))
            click_chunks.append(np.array([clicks.get(p_id, 0) for p_id in candidates], dtype=np.float32))

        if query_chunks:
            self._query_rows = np.concatenate(query_chunks)
            self._product_rows = np.concatenate(product_chunks)
            self._labels = np.log2(np.concatenate(click_chunks) + 1)
        else:
            self._query_rows = np.zeros(0, dtype=np.int64)
            self._product_rows = np.zeros(0, dtype=np.int64)
            self._labels = np.zeros(0, dtype=np.float32)
        self.dataset_size = len(self._labels)

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return int(np.ceil(self.dataset_size / float(self.batch_size)))

    def __getitem__(self , idx):
        """Return batch number ``idx`` as ``([query_rows, doc_rows], labels)``.

        The final batch holds only the remaining samples; it is not padded.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
        """
        n_batches = len(self)
        if idx < 0:
            idx += n_batches
        if not 0 <= idx < n_batches:
            raise IndexError("batch index out of range")

        start = idx * self.batch_size
        stop = min(start + self.batch_size, self.dataset_size)
        a = self.query[self._query_rows[start:stop]]
        b = self.doc[self._product_rows[start:stop]]
        batch_y = self._labels[start:stop]
        return [a,b] , batch_y


In [None]:
MAX_FEATURES = 12000
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(products_data_df['title_normalized'])

In [None]:
doc = tokenizer.texts_to_sequences(products_data_df['title_normalized'])
query = tokenizer.texts_to_sequences(aggregated_searches_df['raw_query_normalized'])

In [None]:
del products_data_df

In [None]:
MAX_LENGTH = max(len(train_ex) for train_ex in doc)
doc = pad_sequences(doc, maxlen=MAX_LENGTH)
query = pad_sequences(query, maxlen=MAX_LENGTH)

In [None]:
# Persist the fitted tokenizer: the inference stage further down reloads it
# from this path after `%reset -f` has cleared the namespace.
with open('/content/drive/MyDrive/torob/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
MAX_LENGTH

301

In [None]:
# First input channel (the generator feeds it the query rows):
# token ids -> 32-d embedding -> Conv1D/BatchNorm/MaxPool -> Conv1D -> global max -> 64-d dense.
sequences1 = Input(shape=(MAX_LENGTH,))
embedded1 = layers.Embedding(MAX_FEATURES, 32)(sequences1)
x1 = layers.Conv1D(64, 3, activation='relu')(embedded1)
x1 = layers.BatchNormalization()(x1)
x1 = layers.MaxPool1D(3)(x1)
x1 = layers.Conv1D(16, 5, activation='relu')(x1)
x1 = layers.GlobalMaxPool1D()(x1)
# NOTE(review): GlobalMaxPool1D already reduces over the sequence axis, so this
# Flatten looks redundant — confirm before removing.
x1 = layers.Flatten()(x1)
x1 = layers.Dense(64, activation='relu')(x1)

In [None]:
# Second input channel (the generator feeds it the product-title rows).
# Same layer stack as the first channel, with its own separately created layers.
sequences2 = Input(shape=(MAX_LENGTH,))
embedded2 = layers.Embedding(MAX_FEATURES, 32)(sequences2)
x2 = layers.Conv1D(64, 3, activation='relu')(embedded2)
x2 = layers.BatchNormalization()(x2)
x2 = layers.MaxPool1D(3)(x2)
x2 = layers.Conv1D(16, 5, activation='relu')(x2)
x2 = layers.GlobalMaxPool1D()(x2)
# NOTE(review): GlobalMaxPool1D already reduces over the sequence axis, so this
# Flatten looks redundant — confirm before removing.
x2 = layers.Flatten()(x2)
x2 = layers.Dense(64, activation='relu')(x2)

In [None]:
# Concatenate both channel feature vectors and score the pair with a small MLP.
merged = layers.concatenate([x1, x2])
dense1 = Dense(32, activation='relu')(merged)
dense1 = Dropout(0.1)(dense1)
dense1 = Dense(16, activation='relu')(dense1)
# NOTE(review): the sigmoid bounds the score to (0, 1), while the training
# generator emits log2(click_count + 1) labels, which exceed 1 as soon as a
# candidate has more than one click — TODO confirm this is intended for the
# loss the model is compiled with.
outputs = Dense(1, activation='sigmoid')(dense1)

In [None]:
model = Model(inputs=[sequences1, sequences2], outputs=outputs)

In [None]:
checkpoint = ModelCheckpoint(filepath='/content/drive/MyDrive/model_{epoch}.h5')

In [None]:
!pip install tensorflow-ranking

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-ranking
  Downloading tensorflow_ranking-0.5.2-py2.py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.4/150.4 KB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Collecting numpy==1.23.2
  Downloading numpy-1.23.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-serving-api<3.0.0,>=2.0.0
  Downloading tensorflow_serving_api-2.11.1-py2.py3-none-any.whl (37 kB)
Collecting tensorflow<3,>=2.11.1
  Downloading tensorflow-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (585.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting wrapt<1.15,>=1.11.0
  Downloading wrapt-1.14.1-cp

In [None]:
import tensorflow_ranking as tfr

In [None]:
# Train with the Adam optimizer and the ApproxNDCG loss from tensorflow-ranking.
# NOTE(review): ApproxNDCGLoss is a listwise ranking loss, while the training
# generator yields flat batches of independent (query, product) pairs rather
# than one list of candidates per query — TODO confirm the loss receives the
# [batch, list_size] layout it expects.
model.compile(optimizer='adam', loss=tfr.keras.losses.ApproxNDCGLoss())

In [None]:
dataset_size = getDatasetSize(aggregated_searches_df)
vectorsize = doc.shape[1]
batch_size = 8192

In [None]:
train_generator = DualChannelDataGenerator( 
      dataset_size ,
      query,
      doc, 
      aggregated_searches_df , 
      vectorsize ,
      batch_size
      )

In [None]:
model.fit(train_generator, steps_per_epoch=len(train_generator), epochs=20,callbacks=[checkpoint])

Epoch 1/20


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Epoch 2/20
Epoch 3/20

KeyboardInterrupt: ignored

In [None]:
model.save('/content/drive/MyDrive/models/model.h5')

In [None]:
%reset -f

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import json
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def read_json_lines(path, n_lines=None):
    """Lazily read a JSON Lines file, yielding one decoded record per line.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.

    Args:
        path: Path of the json lines file to read.
        n_lines: Maximum number of physical lines to consume from the start
            of the file. ``None`` (the default) reads the whole file.

    Yields:
        The object decoded from each non-blank line (a dictionary when the
        line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            # Blank lines (e.g. a stray newline at the end) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)
aggregated_search_data_path = '/content/drive/MyDrive/torob/output_data/aggregated_search_data.jsonl'
preprocessed_products_path = '/content/drive/MyDrive/torob/output_data/preprocessed_products.jsonl'
preprocessed_test_queries_path = '/content/drive/MyDrive/torob/output_data/preprocessed_test_queries.jsonl'

In [None]:
query_normal = pd.DataFrame(read_json_lines('/content/drive/MyDrive/torob/data/test-offline-data_v1.jsonl'))

In [None]:
# Reload the tokenizer fitted in the training stage. The context manager closes
# the file handle deterministically.
# NOTE: pickle.load can execute arbitrary code — only load files this project wrote.
with open('/content/drive/MyDrive/torob/tokenizer.pkl' , 'rb') as f:
    tokenizer = pickle.load(f)

In [None]:
products_data_df = pd.DataFrame(read_json_lines(preprocessed_products_path))

In [None]:
# Map every product id to its positional row index in products_data_df.
products_id_to_idx = {
    product_id: row_idx
    for row_idx, product_id in enumerate(products_data_df['id'])
}

In [None]:
# Convert product titles and test queries to sequences of token ids.
# NOTE(review): training tokenized the 'raw_query_normalized' column, whereas
# this tokenizes 'raw_query' from the raw test file — presumably the test
# queries need the same normalization (preprocessed_test_queries_path is
# defined but not read in this notebook); TODO confirm.
doc = tokenizer.texts_to_sequences(products_data_df['title_normalized'])
query = tokenizer.texts_to_sequences(query_normal['raw_query'])

In [None]:
# Must equal the MAX_LENGTH computed in the training stage (301 in the recorded
# run), because the model's Input layers were built with that length.
MAX_LENGTH = 301
# Pad both channels to the same fixed width.
doc = pad_sequences(doc, maxlen=MAX_LENGTH)
query = pad_sequences(query, maxlen=MAX_LENGTH)

In [None]:
def batch_generator(query_results , batch_size , query ,doc , products_id_to_idx):
  """Yield (query, product) input batches for every candidate of every query.

  Args:
    query_results: DataFrame with a ``result_not_ranked`` column holding, per
      row, the candidate product ids of that query.
    batch_size: Maximum number of pairs per yielded batch.
    query: Sequence of query rows, indexed by the positional row of
      ``query_results``.
    doc: Sequence of product rows, indexed by the values of
      ``products_id_to_idx``.
    products_id_to_idx: Mapping from product id to index in ``doc``.

  Yields:
    ``[a, b]`` where ``a`` is the list of query rows and ``b`` the list of
    product rows of one batch. ``None`` candidates are skipped. Every batch
    holds ``batch_size`` pairs except possibly the last one, which holds the
    remaining pairs.

  Raises:
    KeyError: If a candidate id is missing from ``products_id_to_idx``.
  """
  a = []
  b = []
  for qid, agg_search in enumerate(query_results.itertuples(index=False)):
    for product_id in agg_search.result_not_ranked:
      if product_id is None:
        continue
      a.append(query[qid])
      b.append(doc[products_id_to_idx[product_id]])
      if len(a) >= batch_size:
        yield [a,b]
        a = []
        b = []
  # Emit the leftover pairs too; otherwise the tail of the data is silently lost.
  if a:
    yield [a,b]

In [None]:
batch_size = 8196

In [None]:
counter = 0
for i in batch_generator(query_normal , batch_size ,query ,doc , products_id_to_idx):
  counter = counter + len(i[0])

In [None]:
counter

1344144

In [None]:
import numpy as np
from tensorflow.keras.models import load_model

In [None]:
model = load_model('/content/drive/MyDrive/models/model.h5')

In [None]:
# Score the data batch by batch and store the predictions contiguously in `arr`.
arr = np.ones((counter))
batch = 0
for coun, i in enumerate(batch_generator(query_normal , batch_size ,query ,doc , products_id_to_idx)):
  batch_pred = model.predict([np.array(i[0]) ,  np.array(i[1])]).flatten()
  # Slice by the number of predictions actually returned, so a batch shorter
  # than batch_size is written to exactly its own positions.
  arr[batch:batch+len(batch_pred)] = batch_pred
  batch = batch + len(batch_pred)
  print(coun)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163


In [None]:
a1 = []
b1 = []

In [None]:
count = 0

In [None]:
# Collect the samples that come after the last complete batch of the batched
# prediction pass, so they can be predicted separately.
# The lists and the counter are (re)initialised here so re-running this cell
# gives the same result. The prediction cell that follows expects at least one
# collected sample.
a1 = []
b1 = []
count = 0
# Number of samples covered by complete batches (replaces the hard-coded total).
already_predicted = (counter // batch_size) * batch_size
for qid, agg_search in enumerate(query_normal.itertuples(index=False)):
  for product_id in agg_search.result_not_ranked:
    # None candidates were never scored, so they must not advance the counter.
    if product_id is None:
      continue
    count = count + 1
    # `<=` rather than `<`: sample number `already_predicted` is the last one
    # of the final complete batch and must not be predicted a second time.
    if count <= already_predicted:
      continue
    p_idx = products_id_to_idx[product_id]
    a1.append(query[qid])
    b1.append(doc[p_idx])


In [None]:
part1_pred = model.predict([np.array(a1) ,  np.array(b1)]).flatten()



In [None]:
# Note: despite the .h5 extension this file is a pickle, not HDF5.
# The context manager guarantees the data is flushed and the handle closed.
with open('/content/drive/MyDrive/part1_pred.h5' , 'wb') as f:
  pickle.dump(part1_pred , f)

In [None]:
# Note: despite the .h5 extension this file is a pickle, not HDF5.
# The context manager guarantees the data is flushed and the handle closed.
with open('/content/drive/MyDrive/predict.h5' , 'wb') as f:
  pickle.dump(arr , f)

In [None]:
%reset -f

In [None]:
import pandas as pd
import numpy as np
import pickle
import json

In [None]:
def read_json_lines(path, n_lines=None):
    """Lazily read a JSON Lines file, yielding one decoded record per line.

    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.

    Args:
        path: Path of the json lines file to read.
        n_lines: Maximum number of physical lines to consume from the start
            of the file. ``None`` (the default) reads the whole file.

    Yields:
        The object decoded from each non-blank line (a dictionary when the
        line holds a JSON object).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            # Blank lines (e.g. a stray newline at the end) carry no record.
            if not line.strip():
                continue
            yield json.loads(line)
aggregated_search_data_path = '/content/drive/MyDrive/torob/output_data/aggregated_search_data.jsonl'
preprocessed_products_path = '/content/drive/MyDrive/torob/output_data/preprocessed_products.jsonl'
preprocessed_test_queries_path = '/content/drive/MyDrive/torob/output_data/preprocessed_test_queries.jsonl'

In [None]:
test_data_df = query_normal = pd.DataFrame(read_json_lines('/content/drive/MyDrive/torob/data/test-offline-data_v1.jsonl'))

In [None]:
# Load the two prediction arrays written by the inference stage (pickle files,
# despite the .h5 extension). Context managers close the handles deterministically.
# NOTE: pickle.load can execute arbitrary code — only load files this project wrote.
with open('/content/drive/MyDrive/predict.h5' , 'rb') as f:
  pred = pickle.load(f)
with open('/content/drive/MyDrive/part1_pred.h5' , 'rb') as f:
  part1_pred = pickle.load(f)

In [None]:
pred = np.concatenate((pred , part1_pred))

In [None]:
test_predictions = []

In [None]:
# Rank every query's candidates by descending predicted score.
# The result list is (re)initialised here so re-running this cell does not
# append a second copy of the rankings.
test_predictions = []
start_idx = 0
for test_candidates in  test_data_df['result_not_ranked']:
  # Predictions were produced only for non-None candidates, so None entries
  # must not consume a prediction slot or the offsets drift out of alignment.
  scored_candidates = [candidate for candidate in test_candidates if candidate is not None]
  preds_sample = pred[start_idx:start_idx + len(scored_candidates)]
  if len(preds_sample) != len(scored_candidates):
    # A short slice would silently drop candidates from the ranking.
    raise ValueError("ran out of predictions at offset %d" % start_idx)
  sorted_idx = np.argsort(preds_sample)[::-1]
  sorted_candidates = [scored_candidates[i] for i in sorted_idx]
  test_predictions.append(sorted_candidates)
  start_idx += len(scored_candidates)

In [None]:
def write_test_predictions(predictions_path, predictions):
    """Write ranked ids to a text file, one entry of `predictions` per line.

    Each entry is rendered as its items converted with ``str`` and joined by
    commas. Lines are separated by a newline; no trailing newline is written.

    Args:
        predictions_path: Path of the file to (over)write.
        predictions: Iterable of iterables of ids.
    """
    # Render everything before opening, so the file is only touched once the
    # full content has been built.
    content = "\n".join(
        ",".join(map(str, ranked_ids)) for ranked_ids in predictions
    )
    with open(predictions_path, 'w') as out_file:
        out_file.write(content)

In [None]:
write_test_predictions('/content/predictions.txt', test_predictions)