<a href="https://colab.research.google.com/github/younesabdolmalaky/LTR-on-torob-data/blob/main/notebooks/RankNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import json
import gc
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from tensorflow.keras.utils import Sequence
import tensorflow as tf
from tensorflow.keras import callbacks
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input , Dense , Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.callbacks import ModelCheckpoint


In [3]:
def read_json_lines(path, n_lines=None):
    """Creates a generator which reads and returns lines of
    a json lines file, one line at a time, each as a dictionary.
    
    This could be used as a memory-efficient alternative of `pandas.read_json`
    for reading a json lines file.
    """
    with open(path, 'r') as f:
        for i, line in enumerate(f):
            if n_lines == i:
                break
            yield json.loads(line)


In [4]:
aggregated_search_data_path = '/content/drive/MyDrive/torob/output_data/aggregated_search_data.jsonl'
preprocessed_products_path = '/content/drive/MyDrive/torob/output_data/preprocessed_products.jsonl'
preprocessed_test_queries_path = '/content/drive/MyDrive/torob/output_data/preprocessed_test_queries.jsonl'

In [5]:
aggregated_searches_df = pd.DataFrame(read_json_lines(aggregated_search_data_path, n_lines=None))
products_data_df = pd.DataFrame(read_json_lines(preprocessed_products_path))
test_offline_queries_df = pd.DataFrame(read_json_lines(preprocessed_test_queries_path))

In [6]:
products_id_to_idx = dict(
    (p_id, idx)
    for idx, p_id in enumerate(products_data_df['id'])
)

In [7]:
def getDatasetSize(aggregated_searches_df , n_candidates = None):
  counter = 0
  for qid, agg_search in(enumerate(aggregated_searches_df.itertuples(index=False))):

    if n_candidates is None:
        limit = len(agg_search.results)
    else:
        limit = min(n_candidates, len(agg_search.results))
    clicks = dict(zip(agg_search.clicks, agg_search.clicks_count))

    for i, candidate_product_id in enumerate(agg_search.results[:limit]):
        if candidate_product_id is None:
            continue

        counter = counter + 1
  return counter

In [38]:
class DualChannelDataGenerator(Sequence):
    def __init__(self, dataset_size , query, doc , aggregated_searches_df , vectorsize , batch_size):
      self.dataset_size = dataset_size
      self.query = query
      self.doc = doc
      self.batch_size = batch_size
      self.aggregated_searches_df = aggregated_searches_df
      self.loop1 = 0
      self.loop2 = 0
      self.vectorsize = vectorsize
      self.n_candidates = None
      self.random_projection_mat = np.random.rand(8196, 256)
      np.save('/content/drive/MyDrive/torob/Features/rand.npy', self.random_projection_mat , allow_pickle=True)



    def __len__(self):
        return int(np.ceil(self.dataset_size/ float(self.batch_size)))

    def __getitem__(self , idx):
      a=np.zeros((self.batch_size, self.vectorsize) , dtype = float)
      b=np.zeros((self.batch_size, self.vectorsize) , dtype = float)
      batch_y = np.ones((self.batch_size))

      counter = 0
      loop = False
      loop11 = self.loop1
      loop22 = self.loop2
      for qid, agg_search in (enumerate(aggregated_searches_df[self.loop1:].itertuples(index=False))):
        if loop == True:
          break

        if self.n_candidates is None:
            limit = len(agg_search.results)
        else:
            limit = min(self.n_candidates, len(agg_search.results))
        clicks = dict(zip(agg_search.clicks, agg_search.clicks_count))

        for candidate_product_id in agg_search.results[:limit][self.loop2:]:

          if candidate_product_id is None:
                continue

          candidate_score = clicks.get(candidate_product_id, 0)
          candidate_score = np.log2(candidate_score + 1)

          loop22 = loop22 + 1
          counter = counter + 1
          if counter >= self.batch_size:
            loop = True
            self.loop2 = loop22
            self.loop1 = loop11
            break

          p_idx = products_id_to_idx[candidate_product_id]
          a[counter]=(query[qid]).toarray()
          b[counter]=(doc[p_idx]).toarray()
          batch_y[counter] = candidate_score 


        if loop == False:
          loop22 = 0
          loop11 = loop11 + 1
      return np.hstack((a.dot(self.random_projection_mat),b.dot(self.random_projection_mat))) , batch_y


In [39]:
input = tf.keras.layers.Input(shape=(512,), name='input')
x1 = tf.keras.layers.Dense(1024, activation='relu')(input)
x1 = tf.keras.layers.Dense(2048, activation='relu')(x1)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x1)

In [40]:
model = tf.keras.Model(inputs=input, outputs=output)

In [41]:
checkpoint = ModelCheckpoint(filepath='/content/drive/MyDrive/model_{epoch}.h5')

In [42]:
def ranknet_loss(y_true, y_pred):
    y_true = 2 * y_true - 1

    pairwise_diff = tf.expand_dims(y_true, axis=1) - tf.expand_dims(y_true, axis=0)

    pairwise_logits = tf.expand_dims(y_pred, axis=1) - tf.expand_dims(y_pred, axis=0)

    mask = tf.cast(tf.not_equal(pairwise_diff, 0), tf.float32)
    pairwise_logits = pairwise_logits * mask

    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.sigmoid(pairwise_diff), logits=pairwise_logits))

    return loss

In [43]:
model.compile(optimizer='adam', loss=ranknet_loss)

In [25]:
with open('/content/drive/MyDrive/torob/Features/doc_tfidf.h5', 'rb') as f:
    doc = pickle.load(f)

with open('/content/drive/MyDrive/torob/Features/train_query_tfidf.h5', 'rb') as f:
    query = pickle.load(f)


In [44]:
dataset_size = getDatasetSize(aggregated_searches_df)
vectorsize = doc.shape[1]
batch_size = 8192

In [45]:
train_generator = DualChannelDataGenerator( 
      dataset_size ,
      query,
      doc, 
      aggregated_searches_df , 
      vectorsize ,
      batch_size
      )

In [None]:
model.fit(train_generator, steps_per_epoch=len(train_generator), epochs=5,callbacks=[checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
  34/2930 [..............................] - ETA: 3:24:25 - loss: 0.6931