In [1]:
import tensorflow as tf
import numpy as np
dataset = tf.data.TextLineDataset(
    filenames='/home/fkovalev/reviews_filtered.txt', compression_type=None, buffer_size=None, num_parallel_reads=None
)
dataset = dataset.take(1000000)

In [2]:
def splitter(line):
    segs = tf.strings.split(line)
    return {'review_id':segs[0], 'text':segs[-1]}

In [3]:
splitted_dataset = dataset.map(
    lambda line: splitter(line),
    num_parallel_calls=tf.data.experimental.AUTOTUNE
)

In [37]:
batch_size = 2048
batches = splitted_dataset.batch(batch_size)

In [5]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [20]:
def inference(batch):
    review_ids = batch['review_id']
    embeddings = embed(batch['text'])
    original_text = batch['text']
    return {'review_id': review_ids, 'text': original_text,'use_vector': embeddings}

In [38]:
results = batches.map(
    lambda batch: inference(batch),
    num_parallel_calls=48,
    deterministic=False
)
results

<ParallelMapDataset shapes: {review_id: (None,), text: (None,), use_vector: (None, 512)}, types: {review_id: tf.string, text: tf.string, use_vector: tf.float32}>

In [23]:
for element in dataset.take(6):
    print(element)

tf.Tensor(b'3872002725\tpositive\tPerfect plan B for our beach wedding that we couldn\xe2\x80\x99t have.', shape=(), dtype=string)
tf.Tensor(b'3872002725\tpositive\tStaff were AMAZING - so friendly and accommodating.', shape=(), dtype=string)
tf.Tensor(b'3872006069\tpositive\tAll apartments with a pool view Ten minute walk to the centre of town, very friendly, helpful and flexible staff and good clean rooms and comfy beds', shape=(), dtype=string)
tf.Tensor(b'3872009990\tpositive\tWe had a 2 bedroom suite with a hot tub.', shape=(), dtype=string)
tf.Tensor(b'3872009990\tpositive\tVery private setting and great atmosphere.', shape=(), dtype=string)
tf.Tensor(b'3872013970\tpositive\tOverall the room was very clean, as were the bathrooms.', shape=(), dtype=string)


In [16]:
query = ["Staff were AMAZING - so friendly and accommodating."]
query_vec = embed(query) #encodes the string in the list
query_vec.shape

TensorShape([1, 512])

In [35]:
results.take(1)

<TakeDataset shapes: {review_id: (None,), text: (None,), use_vector: (None, 512)}, types: {review_id: tf.string, text: tf.string, use_vector: tf.float32}>

In [36]:
for r in results.take(1):
    #vectors = r['use_vector'].numpy()
    print('review_ids: ',r['review_id'].numpy()[...,None])
    print('text: ', r['text'].numpy()[...,None])

review_ids:  [[b'3872002725']
 [b'3872002725']
 [b'3872006069']
 [b'3872009990']
 [b'3872009990']]
text:  [[b'have.']
 [b'accommodating.']
 [b'beds']
 [b'tub.']
 [b'atmosphere.']]


In [39]:
correlation_table = []
top_k = 5
for result in results:
    review_ids = result['review_id'].numpy()[...,None]
    vectors = result['use_vector'].numpy()
    review = result['text'].numpy()
    correlation = np.transpose(np.inner(query_vec,vectors))
    correlation_table.append(correlation)
score_table = zip(review_ids, correlation_table)
top_results = sorted(score_table, key= itemgetter(1), reverse = True)[0:top_k]

NameError: name 'itemgetter' is not defined

In [None]:
import csv
import numpy as np
import time

with open('/yueyang/reviews_filtered_use.emb', 'a', newline='') as f:
    writer = csv.writer(f)
    step = 0
    st_time = time.time()
    for result in results:
        if (step+1)%100 == 0:
            ed_time = time.time()
            print('{} s/step'.format((ed_time-st_time)/100))
            st_time = ed_time
        bytes_2_str = lambda x:x.decode("utf-8") 
        review_ids = result['review_id'].numpy()[...,None]
        vectors = result['use_vector'].numpy()
        concat = np.concatenate((review_ids, vectors), axis=1)
        writer.writerows(concat)
        step += 1
        if step == 200:
            break