In [0]:
!git clone https://github.com/huggingface/transformers.git
!cd transformers; pip install .

fatal: destination path 'transformers' already exists and is not an empty directory.
Processing /content/transformers
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-2.3.0-cp36-none-any.whl size=467316 sha256=92c1483cfc2a8c4d868cd375489ca5e6b7f8f8c5f281efccaf64a243702325e4
  Stored in directory: /tmp/pip-ephem-wheel-cache-2q1771oe/wheels/23/19/dd/2561a4e47240cf6b307729d58e56f8077dd0c698f5992216cf
Successfully built transformers
Installing collected packages: transformers
  Found existing installation: transformers 2.3.0
    Uninstalling transformers-2.3.0:
      Successfully uninstalled transformers-2.3.0
Successfully installed transformers-2.3.0


In [0]:
# Show which transformers release the kernel actually imported.
import transformers
transformers.__version__

'2.3.0'

In [0]:
import logging
# Raise the root logger threshold so that only CRITICAL records are emitted.
logging.getLogger().setLevel(logging.CRITICAL)

from typing import List
from tqdm import tqdm_notebook

import torch
import numpy as np
from transformers import CTRLTokenizer, CTRLLMHeadModel, CTRLConfig

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [0]:
# Load the pretrained 'ctrl' tokenizer and LM-head model; the model is moved
# onto `device` so generation runs there.
tokenizer = CTRLTokenizer.from_pretrained('ctrl')
model = CTRLLMHeadModel.from_pretrained('ctrl').to(device)

In [0]:
def clean_text(text: str, review_star:str = '5.0') -> List[str]:
  """Split one decoded generation into individual review strings.

  The leading prompt 'Reviews Rating: <review_star> \\n ' is removed and the
  remainder is split wherever ' \\n Rating: <review_star> \\n ' occurs. The final
  piece is cut back to its last '.', and is dropped altogether when it
  contains no '.' (or nothing precedes its last '.').

  Args:
    text: decoded model output.
    review_star: rating string that was used in the prompt.

  Returns:
    List of review strings, possibly empty.
  """
  rating_tag = f'Rating: {review_star}'
  prompt_prefix = f'Reviews {rating_tag} \n '
  review_separator = f' \n {rating_tag} \n '

  # str.split always yields at least one piece, so the unpacking is safe.
  *reviews, tail = text.replace(prompt_prefix, '').split(review_separator)

  # Everything before the last '.'; empty when the tail has no '.' at all.
  finished_part = tail.rpartition('.')[0]
  if finished_part:
    reviews.append(finished_part + '.')
  return reviews

def generate_text(review_star: str = '5.0', num_return_sequences = 50) -> List[str]:
  """Sample reviews for a star rating and return them cleaned.

  Encodes the prompt 'Reviews Rating: <review_star>' with the module-level
  `tokenizer`, samples continuations with the module-level `model` on
  `device`, decodes every returned sequence and passes each decoded text
  through `clean_text`.

  Args:
    review_star: rating string placed in the prompt, e.g. '1.0'.
    num_return_sequences: number of sequences requested from one
      `model.generate` call.

  Returns:
    Flat list of cleaned review strings. Its length need not equal
    `num_return_sequences`, because `clean_text` may split one generation
    into several reviews or discard it entirely.
  """
  input_context = f'Reviews Rating: {review_star}'
  input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0).to(device)  # encode input context
  outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, num_beams=1, temperature=1.,
                           top_k=50, top_p=1., repetition_penalty=1.2, length_penalty=1.,
                           eos_token_ids = tokenizer.encode('|'), num_return_sequences=num_return_sequences)
  # Collapse every leading dimension so that each row is exactly one sequence.
  # The earlier `outputs.squeeze()` also removed the sequence dimension when a
  # single sequence came back, so tokens were then decoded one at a time.
  sequences = outputs.reshape(-1, outputs.shape[-1])
  texts = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in sequences]
  cleaned_texts = []
  for text in texts:
    cleaned_texts.extend(clean_text(text, review_star))
  return cleaned_texts

In [0]:
%%time
# Time a single sampling call (60 sequences, rating '1.0') to gauge the cost
# of one iteration of the bulk generation loop.
x = generate_text('1.0', 60)

CPU times: user 29.1 s, sys: 2.09 s, total: 31.2 s
Wall time: 29.7 s


In [0]:
# Collect cleaned reviews for one star rating: 1000 sampling rounds,
# each requesting 60 sequences from the model.
review_star = '1.0'
results = []
for round_idx in tqdm_notebook(range(1000)):
  round_reviews = generate_text(review_star, 60)
  results.extend(round_reviews)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [0]:
# Number of cleaned reviews collected by the generation loop.
len(results)

811

In [0]:
import pandas as pd

fname = f'fake_reviews_{review_star}.csv'
results_df = pd.DataFrame(results)
results_df.columns = ['text']
results_df.to_csv(fname, index=False)
print(results_df.shape)
!ls

(811, 1)
fake_reviews_1.0.csv  sample_data  transformers


In [0]:
# Preview the first rows of the saved frame as a sanity check.
results_df.head()

Unnamed: 0,text
0,The item was in very poor condition and has a ...
1,My husband bought this gift for me back when w...
2,I hate to diss this camera because it takes th...
3,I ordered the item in July and it did not work...
4,This toy really disappointed my son and me. I ...


In [0]:
# Colab-only helper: hand the CSV named by `fname` to the user as a download.
from google.colab import files
files.download(fname)