In [1]:
#Import dependancies
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') #Encoder text to tensor
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id) #load the model, using the smallest model for this prototype


  from .autonotebook import tqdm as notebook_tqdm
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [2]:
#Get Yelp dataset and standardise

reviews = pd.read_csv('reviews.csv')
reviews = shuffle(reviews)

#Standardise and tokenize
for column in reviews:
    reviews['text'] = reviews['text'].str.lower()   #Covert the text to lower case
    reviews['text'].str.replace('[^\w\s]','') #Remove punctuation
    reviews['text'].str.strip() #Remove whitespace
    reviews['text'].str.replace("\n", " ") #Remove escape characters
reviews = reviews['text']

#Use only a slice of the data 25k reviews
reviews = reviews[:25000]

print(reviews)

  reviews['text'].str.replace('[^\w\s]','') #Remove punctuation


47739    excellent property if you are in the oldsmar, ...
44739    so glad we found this place. it has everything...
14980    so i've tried this place a couple times before...
71427    luke's is awesome! if your looking for some pl...
66821    i love the grilled chicken parmesan salad at t...
                               ...                        
72868    i recently moved here from santa monica, where...
4104     really, more like 3.5 stars. very clean, light...
9021     there's honestly nothing wrong with famous dav...
3112     i can't wait to go back. it's unbelievably ful...
60970    our first stay wound up to be a debacle, mainl...
Name: text, Length: 25000, dtype: object


  reviews = reviews[:25000]


In [3]:
#turn the entire 25000 reviews into a long string to be segmented in next sequence
single_string = ''
for row in reviews:
  x = row
  single_string += x 
string_tokenized = tokenizer.encode(single_string)

Token indices sequence length is longer than the specified maximum sequence length for this model (3230940 > 1024). Running this sequence through the model will result in indexing errors


In [4]:
# Define variables to store the data
examples = []
block_size = 100
BATCH_SIZE = 12
BUFFER_SIZE = 1000

# Split the string_tokenized list into blocks of size block_size
# and store each block in the examples list
for i in range(0, len(string_tokenized) - block_size + 1, block_size):
  examples.append(string_tokenized[i:i + block_size])

# Initialize empty lists for inputs and labels
inputs, labels = [], []

# For each example in the examples list,
# store the input as the example without the last element
# and store the label as the example without the first element
for ex in examples:
  inputs.append(ex[:-1])
  labels.append(ex[1:])

# Create a dataset from the inputs and labels tensors
dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))

# Shuffle the dataset and batch it
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [5]:
# defining optimizer
#optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
# definining loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# defining our metric which we want to observe
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# compiling the model
model.compile(optimizer='adam', loss=[loss], metrics=[metric])

Running model with 25000 reviews, 100 block size, takes around 8:30 min per epoch.

In [6]:
history = model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
model.save('saved_model_gpt2')



INFO:tensorflow:Assets written to: saved_model_gpt2\assets


INFO:tensorflow:Assets written to: saved_model_gpt2\assets


In [7]:
text = "For what it is"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.8,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

For what it is; worth driving an hour out of my way for.i'm a local renter in the area and saw this place through a yelp article ranking the top indian restaurants in pinnellas. it wasn't bad but it was nothing special. i had the roti (which is a rotisserie but with a tinge of spiny melon ice cream and a good amount of spice), and the peanut pancake which was served on a bed of naan bread was the most amazingly savory and had a delicious breakfast-like syrup and peanut buttery egg white chocolatey and honey sauce. my daughter enjoyed the fried rice.  and banana pudding even though i could not a touch of which she could


In [18]:
text = "The restaurant"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 70,
  num_beams = 5,
  temperature = 0.7,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

The restaurant. the good: both times service was great and food is always delicious. they even have a new item (came in for chicken) which was a really nice addition. 
the bad: tiny portions, very watery atmosphere, expensive for what it is. will not return.best chicken and waffles i have ever eaten


In [15]:
text = "I think they're"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 5,
  temperature = 0.75,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

I think they're trying too hard to be authentic and not trying to sell me stuff i don't need. i'm just glad they didn't say so (as it's probably pretty likely to stop being sold out that anything i bought is probably not worth what it is.)people of the nwc skyline that most people would like to see.  they had a great selection of beer from the vodkas (though i've never been there), and there were a few food items as well


In [14]:
text = "I give up on this place"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 100,
  num_beams = 5,
  temperature = 0.75,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

I give up on this place (the other day i went to pick up the phone and spent a few hours there) and was told i would need to make an appointment for the next day.  as soon as i got there (a little later than i thought it would be) the young lady was unapologetic, rude, pushy, impatient, and rolled her eyes on more than one occassion that it was impossible for both of us and she didn't tell me about it either.


In [13]:
text = "Unfortunately,"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.75,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

Unfortunately, but it was made with dry aged ground beef with tendons and tenders in a garlic broth. i'm used to good korean food, and this was no exception. if you like lots of choices, the staff will make sure you have what you need and the menu's ready for you. the food came out in 10 minutes and was hot and delicious! i ordered the miso soup to start and it had just the right amount of kick to it! very hearty and hearty with a hearty sized bowl of chicken and veggies throughout the broth served with very deliciously spicy butternut of the best beji de-healthy flavor toasted rice and teriyaki sauce they've had (compably spicy edmonton, not even more


In [19]:
text = "i found the"
# encoding the input text
input_ids = tokenizer.encode(text, return_tensors='tf')
# getting out output
output = model.generate(
  input_ids,
  max_length = 150,
  num_beams = 5,
  temperature = 0.75,
  no_repeat_ngram_size=2,
  num_return_sequences=5
)

print(tokenizer.decode(output[0]))

i found the ladies and gents to be very unfriendly and unhelpful. i have been going to other department stores for years and this one was by far the best.this review is for the lunch buffet:
i have no complaints about this restaurant and as far as the food, it certainly did not disappoint.  i ordered the mexican chicken dinner platter and while the chicken was very tasty, the portions were small and the service was absolutely terrible. when i first saw the billowing out of the $45 for dinner, i asked about the waitress, she asked if i could tell her that it was a complimentary soft drinks included gratuity was between us, which completely made no touch of like a sorry but not
