In [1]:
#Import dependancies
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') #Encoder text to tensor
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id) #load the model, using the smallest model for this prototype


  from .autonotebook import tqdm as notebook_tqdm
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [2]:
#Get Yelp dataset and standardise

reviews = pd.read_csv('reviews.csv')
reviews = shuffle(reviews)

#Standardise and tokenize
for column in reviews:
    reviews['text'] = reviews['text'].str.lower()   #Covert the text to lower case
    reviews['text'].str.replace('[^\w\s]','') #Remove punctuation
    reviews['text'].str.strip() #Remove whitespace
    reviews['text'].str.replace("\n", " ") #Remove escape characters
reviews = reviews['text']

#Use only a slice of the data 25k reviews
reviews = reviews[:25000]

print(reviews)

  reviews['text'].str.replace('[^\w\s]','') #Remove punctuation


47739    excellent property if you are in the oldsmar, ...
44739    so glad we found this place. it has everything...
14980    so i've tried this place a couple times before...
71427    luke's is awesome! if your looking for some pl...
66821    i love the grilled chicken parmesan salad at t...
                               ...                        
72868    i recently moved here from santa monica, where...
4104     really, more like 3.5 stars. very clean, light...
9021     there's honestly nothing wrong with famous dav...
3112     i can't wait to go back. it's unbelievably ful...
60970    our first stay wound up to be a debacle, mainl...
Name: text, Length: 25000, dtype: object


  reviews = reviews[:25000]


In [3]:
#turn the entire 25000 reviews into a long string to be segmented in next sequence
single_string = ''
for row in reviews:
  x = row
  single_string += x 
string_tokenized = tokenizer.encode(single_string)

Token indices sequence length is longer than the specified maximum sequence length for this model (3230940 > 1024). Running this sequence through the model will result in indexing errors


In [4]:
# Define variables to store the data
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Split the string_tokenized list into blocks of size block_size
# and store each block in the examples list
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])

# Initialize empty lists for inputs and labels
inputs, labels = [], []

# For each example in the examples list,
# store the input as the example without the last element
# and store the label as the example without the first element
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])

# Create a dataset from the inputs and labels tensors
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))

# Shuffle the dataset and batch it
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [5]:
# defining optimizer
#optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer='adam', loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

Running model with 25000 reviews, 100 block size, takes around 8:30 min per epoch.

In [6]:
history = model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

In [14]:
text = "For what it is"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.8,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

For dinner, my wife and i split an appetizer and two entrees. we split the crab cakes and macaroni and cheese. both were very good. they were fresh and had a nice balance of flavor. the macarons were also very tasty. i would definitely come back for more. 
the service was great. our waiter was very attentive and took the time to check on us and make sure we were enjoying the food we ordered. it was a very pleasant experience and we look forward to trying everything else on their menu!i love this place! i've been going here for years and it's one of my favorite places in town! the staff is always friendly and the prices are very reasonable! they have a wide


In [12]:
text = "The restaurant"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.85,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

The best part was the service. we were seated quickly and the place was clean. the staff was friendly and attentive. i would highly recommend this restaurant.i love this place! i've been coming here for years and it's always a good experience. they have a great variety of sandwiches, pastas, and pastries. my favorite is the sweet potato pastrami and my all time favorite the chicken and waffles. 
the only downside i can think of is that they don't always have happy hour so make sure you go during the week in order to save a few bucks. but i will definitely come back for sure!i'm a big fan of la colombe burrito and this is by far the best bur


In [15]:
text = "I think they're"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 5,
  temperature = 0.85,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

Every time we go, we are greeted with a warm smile and a knowing nod from our waiter.  it is a great way to give back to our community and show that we care about our food and are not afraid to try something new.i love this place. i've been going here for years and i can't say enough good things about the staff. they are always friendly and the food is always great. the only downside is the price. it's a little expensive for what you get


In [16]:
text = "I give up on this place"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 125,
  num_beams = 5,
  temperature = 0.92,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

Their menu changes all the time, so it's hard to know what to expect.  i'm not sure if we'll be back next year, but we will definitely stop in for the oysters and the crawfish bisque.my husband and i have been going to this place since we were little. we love it. the staff is always friendly and helpful. they have a wide variety of specialty pizzas to choose from. i always get the calabacitas and they are delicious. it is a little pricey for what you get but the portions are very generous. if you are looking for something different then this


In [None]:
text = "Unfortunately,"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.8,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))