# Sample Steam Reviews with GPT-2
Code inspired by https://github.com/woctezuma/sample-steam-reviews-with-gpt-2

## Setting the GPT-2 model

Install the Python package

Reference: https://github.com/minimaxir/gpt-2-simple

In [0]:
!pip install gpt_2_simple

Import the required modules

In [0]:
import gpt_2_simple as gpt2
from datetime import datetime
from google.colab import files

## Downloading GPT-2

Choose between `117M` and `345M` models

In [0]:
# Name of the pre-trained GPT-2 checkpoint that is downloaded and fine-tuned.
# Swap the commented line to use the smaller model instead.
# NOTE(review): newer gpt-2-simple releases appear to call these checkpoints
# '124M' and '355M' — confirm against the installed package version.
# model_name = '117M'
model_name = '345M'

Download

In [0]:
# Fetch the pre-trained GPT-2 checkpoint selected by `model_name`
# (presumably saved under a local `models/` folder — confirm in the
# gpt-2-simple documentation).
gpt2.download_gpt2(model_name=model_name)

## Uploading a Text File to be Trained to Colaboratory

### Either get the data by yourself

In [0]:
!curl -O https://raw.githubusercontent.com/woctezuma/sample-steam-reviews-with-gpt-2/master/export_review_data.py

In [0]:
!curl -O https://raw.githubusercontent.com/woctezuma/sample-steam-reviews-with-gpt-2/master/requirements.txt

In [0]:
!pip install -r requirements.txt

In [0]:
# Steam appID of the game whose reviews are used (583950 is Artifact).
app_id = 583950

# Review window in days: 12 weeks, i.e. slightly less than 3 months.
# Set a negative value (e.g. -1) to remove the time limit.
num_days = 12 * 7

In [0]:
from export_review_data import apply_workflow_for_app_id

# Run the export script downloaded above for the chosen game and time window.
# Presumably this writes data/<app_id>.txt, the file read by the fine-tuning
# step — confirm in export_review_data.py.
apply_workflow_for_app_id(app_id, num_days=num_days)

### Or get a data snapshot from me

Snapshots are currently only available for a couple of example games (Artifact and Crusader Kings II), because the recommended way is to run the code above for the game of your choice instead.

In [0]:
!mkdir -p data/

## Either Artifact (only the recent English reviews):
# !curl -O https://raw.githubusercontent.com/woctezuma/sample-steam-reviews-with-gpt-2/master/data/with_delimiters/583950.txt
# !mv 583950.txt data/

## Or Crusader Kings II (all the English reviews); if you use it, also set app_id = 203770 above:
# !curl -O https://raw.githubusercontent.com/wiki/woctezuma/sample-steam-reviews-with-gpt-2/data/with_delimiters/203770.txt
# !mv 203770.txt data/

## Finetune GPT-2

In [0]:
# Training corpus for the selected game.
file_name = 'data/{}.txt'.format(app_id)

# Name of this fine-tuning run; it encodes the base model and the game.
run_name = '{}_reviews_{}'.format(model_name, app_id)

In [0]:
# Open the TensorFlow session through gpt-2-simple; the generation cells
# further down reuse this same `sess`.
sess = gpt2.start_tf_sess()

# Fine-tune the pre-trained model on the review corpus.
gpt2.finetune(
    sess,
    dataset=file_name,
    model_name=model_name,
    run_name=run_name,
    restore_from='fresh',  # change to 'latest' to resume training
    steps=1000,
    print_every=10,        # how many steps between printing progress
    sample_every=200,      # how many steps to print a demo sample
    save_every=500,        # how many steps between saving checkpoint
)

## Save a Trained Model Checkpoint

In [0]:
# gpt2.mount_gdrive()

In [0]:
# gpt2.copy_checkpoint_to_gdrive(run_name=run_name)

## Load a Trained Model Checkpoint

In [0]:
# gpt2.mount_gdrive()

In [0]:
# gpt2.copy_checkpoint_from_gdrive(run_name=run_name)

## Generate Text From The Trained Model

In [0]:
# Sampling settings passed to every gpt2.generate() call in this notebook.

# Default is 0.7, but you may want to increase the temperature, especially
# if your dataset is small, to avoid copying text.
temperature = 1.0

# Default: 0.0 ; Recommended: 0.9 ; no need for top_k if top_p > 0.0
top_p = 0.9

# Default: 0 ; Recommended: 40 ; useless parameter if top_p > 0.0
top_k = 40

In [0]:
# `num_batches` is passed to gpt2.generate() as `batch_size`: the number of
# samples generated in parallel, which gives a massive speedup.
# NOTE(review): gpt-2-simple appears to require nsamples to be a multiple of
# batch_size — keep the two values compatible when changing them.
num_batches = 3

# Total number of texts requested from each gpt2.generate() call.
num_samples = 3

In [0]:
# Unconditional samples: neither a prefix nor a truncation marker is passed.
gen_texts_A = gpt2.generate(
    sess,
    run_name=run_name,
    return_as_list=True,
    nsamples=num_samples,
    batch_size=num_batches,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)

# Show the samples, separated by a visible marker.
print(*gen_texts_A, sep='\n\n--- SEPARATOR ---\n\n')

In [0]:
# Samples conditioned on a review that begins with "I love"; generation is
# cut at the end-of-text delimiter.
gen_texts_B = gpt2.generate(
    sess,
    run_name=run_name,
    return_as_list=True,
    prefix='<|startoftext|>I love',
    truncate='<|endoftext|>',
    nsamples=num_samples,
    batch_size=num_batches,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)

# Show the samples, separated by a visible marker.
print(*gen_texts_B, sep='\n\n--- SEPARATOR ---\n\n')

In [0]:
# Samples conditioned on a review that begins with "I hate"; generation is
# cut at the end-of-text delimiter.
gen_texts_C = gpt2.generate(
    sess,
    run_name=run_name,
    return_as_list=True,
    prefix='<|startoftext|>I hate',
    truncate='<|endoftext|>',
    nsamples=num_samples,
    batch_size=num_batches,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)

# Show the samples, separated by a visible marker.
print(*gen_texts_C, sep='\n\n--- SEPARATOR ---\n\n')

In [0]:
# Samples conditioned on a review that begins with "Please"; generation is
# cut at the end-of-text delimiter.
gen_texts_D = gpt2.generate(
    sess,
    run_name=run_name,
    return_as_list=True,
    prefix='<|startoftext|>Please',
    truncate='<|endoftext|>',
    nsamples=num_samples,
    batch_size=num_batches,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)

# Show the samples, separated by a visible marker.
print(*gen_texts_D, sep='\n\n--- SEPARATOR ---\n\n')

In [0]:
# Samples conditioned on a longer opening sentence; generation is cut at the
# end-of-text delimiter.
gen_texts_E = gpt2.generate(
    sess,
    run_name=run_name,
    return_as_list=True,
    prefix='<|startoftext|>This game has near infinite replay value',
    truncate='<|endoftext|>',
    nsamples=num_samples,
    batch_size=num_batches,
    temperature=temperature,
    top_p=top_p,
    top_k=top_k,
)

# Show the samples, separated by a visible marker.
print(*gen_texts_E, sep='\n\n--- SEPARATOR ---\n\n')

## Copy the Generated Text to Google Drive

In [0]:
# File-name fragment recording the sampling temperature.
temperature_suffixe = '_temperature_{}'.format(temperature)

In [0]:
# Append the sampling strategy actually in effect to the temperature
# fragment: top_p when it is enabled, otherwise top_k when that is enabled,
# otherwise nothing.
file_name_suffixe = temperature_suffixe + (
    '_top_p_' + str(top_p) if top_p > 0.0
    else '_top_k_' + str(top_k) if top_k > 0
    else ''
)

In [0]:
# Markdown report name: game id followed by the sampling-settings suffix.
output_file_name = 'output_{}{}.md'.format(app_id, file_name_suffixe)
print(output_file_name)

In [0]:
# Write the Markdown report: a clickable header image of the game, followed
# by one numbered list of block-quoted generated reviews per prompt.

# (section heading, generated texts) pairs, in the order they appear in the
# report. Driving the output from this table replaces five copy-pasted loops.
report_sections = [
    ('## Reviews generated unconditionally', gen_texts_A),
    ('## Reviews starting with I love', gen_texts_B),
    ('## Reviews starting with I hate', gen_texts_C),
    ('## Reviews starting with Please', gen_texts_D),
    ('## Reviews starting with This game has near infinite replay value', gen_texts_E),
]

# Explicit UTF-8: generated reviews may contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be able to encode them.
with open(output_file_name, 'w', encoding='utf-8') as f:
    f.write('## Game\n\n')
    f.write('[<img alt="game name" src="https://steamcdn-a.akamaihd.net/steam/apps/{}/header.jpg" width="150">](https://store.steampowered.com/app/{})\n\n'.format(app_id, app_id))

    for heading, gen_texts in report_sections:
        f.write(heading + '\n\n')
        for i, gen_text in enumerate(gen_texts, start=1):
            f.write('{}.\n\n'.format(i))
            # Prefix every line of the review with '> ': a blank line ends a
            # Markdown block quote, so quoting only the first line would let
            # later paragraphs of a multi-line review fall outside the quote.
            f.write('> {}\n\n'.format(gen_text.replace('\n', '\n> ')))
   

In [0]:
# Mount Google Drive in the Colab runtime so the report can be copied to it.
# Presumably the mount point is /content/drive, the path used by the copy
# step below — confirm in the gpt-2-simple documentation.
gpt2.mount_gdrive()

In [0]:
import shutil

# Copy the Markdown report to the top level of the mounted Google Drive,
# keeping the same file name.
shutil.copyfile(
    output_file_name,
    '/content/drive/My Drive/{}'.format(output_file_name),
)