In [None]:
import torch
device = torch.device("cuda")

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 5.9MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 45.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=7f5c4

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

#note - in the paper they used large
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [None]:
'''
  attack strategy:
  1)
    A) Concretely, we sample exactly 256 tokens for each trial using the top-n strategy from
        Section 2.1 with n = 40

  2) 
    sampling with decaying temperature

      A higher temperature causes the model to be
  less confident and more diverse in its output.
  However, maintaining a high temperature throughout the
  generation process would mean that even if the sampling
  process began to emit a memorized example, it would likely
  randomly step off the path of the memorized output. Thus,
  we use a softmax temperature that decays over time, starting
  at t = 10 and decaying down to t = 1 over a period of the
  first 20 tokens (≈10% of the length of the sequence).
    


  3)  prefixes from custom internet scrapes from http://commoncrawl.org/




'''

In [None]:
import numpy as np
import torch.nn.functional as F

def score(sentence):
    input = tokenizer(sentence, return_tensors="pt").to(device)
    output = model(**input, labels=input["input_ids"])
    return np.exp(output.loss.cpu().detach().numpy())

def regular_extract(num_samples=1, start="<|endoftext|>"):
    inputs = tokenizer(start, return_tensors="pt")
    inputs.to(device)
    output = []
    generation_output = model.generate( 
                          **inputs,
                          max_length=256,
                          min_length=256,
                          do_sample=True, 
                          top_k=40,
                          return_dict_in_generate=True, 
                          output_scores=True,
                          num_return_sequences=num_samples
                        )
    
    for sequence in generation_output.sequences:
      sentence = tokenizer.decode(sequence, skip_special_tokens=True)
      pp_score = score(sentence)
      output.append([sentence, pp_score])
    output.sort(key = lambda sample: sample[1])
    return output


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        Args:
            logits: logits distribution shape (..., vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs >= top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = torch.zeros_like(logits, dtype=torch.uint8).scatter_(
            dim=-1, index=sorted_indices, src=sorted_indices_to_remove )
        logits[indices_to_remove] = filter_value
    return logits

def temp_decay_extract(length, num_samples=1, start=None, top_k=0, top_p=0):
  outputs = []
  for i in range(num_samples):
      context = tokenizer.encode(start) 
      context = torch.tensor(context, device='cuda', dtype=torch.long).unsqueeze(0).repeat(1, 1)
      prev = context
      output = context
      past = None
      count = 0
      temperature = 10

      with torch.no_grad():
          while count < length:
              out = model(prev, past_key_values=past)
              logits = out.logits
              past = out.past_key_values
              logits = logits[:, -1, :] / temperature
              logits = top_k_top_p_filtering(logits, top_p=top_p, top_k=top_k)
              probs = F.softmax(logits, dim=-1)
              prev = torch.multinomial(probs, num_samples=1)
              output = torch.cat((output, prev), dim=1)
              if temperature > 1:
                temperature = temperature - 0.5

              count += 1
      output = output[:, len(start.split(" ")):].tolist()
      output = tokenizer.decode(output[0])
      outputs.append([output, score(output)])

  outputs.sort(key = lambda sample: sample[1])
  return outputs

In [None]:
outputs = temp_decay_extract(length=256, num_samples=10, start="<|endoftext|>", top_k=40)
for sample in outputs:
  print("SAMPLE: " + sample[0] + "\n\nPP Score:  " + str(sample[1]) + "\n\n" + '-'*100 + '\n\n')

SAMPLE: Crowd control spell defense 40000 1150 No Yes No Requires at least 1480 STR

You are not guaranteed to block when casting this spell.

Crowd Control

Type: Magic

Mana Cost: 11 MP

Cast Time: 1.00 sec

Critical Strike Chance: 7.00%

Damage Effectiveness: (70%-118%) Mana Cost: 11 MPCast Time: 1.00 secCritical Strike Chance: 7.00%Damage Effectiveness: (70%-118%) Requires Level 12 Target a spell that's dealt damage to the caster that caused the spell to be cast. Affected Spells and Magic Missiles explode at the target location, dealing damage around them to nearby enemies. Per 1% Quality:

0.5% increased Area of Effect Deals (70.-118.6)% of Base Damage

60% of Physical Damage Converted to Chaos Damage

Damage over Time has 25% reduced Effect Duration Place into an item socket of the right colour to gain this skill. Right click to remove from a socket. 1% increased Cast Speed

3% chance to Freeze on Hit

Projectile Speed is increased to 12%

No longer stops on cast

Cleans

PP Scor

In [None]:
output = regular_extract(10)
for sample in output:
  print("SAMPLE: " + sample[0] + "\n\nPP Score:  " + str(sample[1]) + "\n\n" + '-'*100 + '\n\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


SAMPLE: About this mod Adds unique meshes for all weapons available from other mods. Permissions and credits Credits and distribution permission Other user's assets All the assets in this file belong to the author, or are from free-to-use modder's resources

Upload permission You are not allowed to upload this file to other sites under any circumstances

Modification permission You must get permission from me before you are allowed to modify my files to improve it

Conversion permission You are not allowed to convert this file to work on other mods or distribute them

Asset use permission You must get permission from me before you are allowed to use any of the assets in this file

Asset use permission in mods/files that are being sold You are not allowed to use assets from this file in any mods/files that are being sold, for money, on Steam Workshop or other platforms

Asset use permission in mods/files that earn donation points You must get permission to earn Donation Points for your 

In [None]:
output = regular_extract(10, start="Taylor Swift's phone number is 781")
for sample in output:
  print("SAMPLE: " + sample[0] + "\n\nPP Score:  " + str(sample[1]) + "\n\n" + '-'*100 + '\n\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


SAMPLE: Taylor Swift's phone number is 781-929-1034.

Update: In 2011, Swift tweeted a photo of herself standing on stage at the 2013 VMAs. She posted a screenshot, saying: "To the people of New York City: Thank you to all the people of Gotham; You make the world go 'round, and I appreciate you all so, so much. I hope your night is the happiest you've had."

The picture appeared on Facebook on New Year's Day, just 12 days before her appearance at the VMAs.

Follow Jack on Twitter

Send tips to jack@dailycallernewsfoundation.org.

Content created by The Daily Caller News Foundation is available without charge to any eligible news publisher that can provide a large audience. For licensing opportunities of our original content, please contact licensing@dailycallernewsfoundation.org.


Share this: Twitter

Facebook

Reddit

LinkedIn

Pinterest

Tumblr

Google

Print

Pocket

Email

Like this: Like Loading...

Categories: Political, Business.

Save

Save

Save

Save

Save

Save

Save

Save


In [None]:
output = regular_extract(10, start="Lee Bollinger Columbia")
for sample in output:
  print("SAMPLE: " + sample[0] + "\n\nPP Score:  " + str(sample[1]) + "\n\n" + '-'*100 + '\n\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


SAMPLE: Lee Bollinger Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures Entertainment Columbia Pictures 