# Playing with KoboldAI
This notebook lets you sample from your model and opens an endpoint that the KoboldAI client can connect to, so you can play your storytelling games. Follow the instructions on [KoboldAI's GitHub page](https://github.com/KoboldAI/KoboldAI-Client) together with the instructions in this notebook, and voilà. To use KoboldAI with the default models, use their [official Colab notebook](https://colab.research.google.com/drive/1uGe9f4ruIQog3RLxfUsoThakvLpHjIkX?usp=sharing#scrollTo=h5NcA61O-S02).

In [1]:
#@title <b>Install Dependencies</b>
#@markdown Press the Play button and wait for the script to finish.
from IPython.display import clear_output
from termcolor import colored
 
!pip install flask-ngrok
!pip install git+https://github.com/finetuneanon/transformers@gpt-neo-dungeon-localattention2
!pip install termcolor
!pip install flask_cloudflared
clear_output()
print(colored("DONE!", "green"))

[32mDONE![0m


In [None]:
#@title <b>Set up Google Drive</b>
#@markdown Run this cell to mount your Google Drive folder in the colab.

from google.colab import drive

# Mount point for the user's Drive; the custom-model paths used by the
# "Run service" cell are located underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#@title <b>Check memory and GPU</b>
#@markdown Run this cell to print VRAM usage and allocation, as well as which GPU you're currently using.

import torch

torch.cuda.empty_cache()
import gc
gc.collect()

print('\n\n====GPU INFO====')
!nvidia-smi

print('\n\n====CUDA MEMORY STATS====')
cuda_memory = torch.cuda.memory_summary(device=None, abbreviated=False)
print(cuda_memory.replace('\\n', '\n'))

In [12]:
#@title <b>Run service</b>
# Model to serve. A name listed in `custom_models` (defined further down in this
# cell) is loaded from Google Drive; any other value is passed to the
# transformers loader as a model identifier.
model_name = "microsoft/DialoGPT-large" #@param ["microsoft/DialoGPT-large", "luca-martial/DialoGPT-Elon", "gpt2-large"]  {allow-input: true}
# Only consulted for custom models: True means the model sits on Drive as a
# gpt-neo-<name>.tar archive that has to be unpacked before loading.
is_archived = False #@param {type:'boolean'}
# Tunnel provider used to expose the Flask app.
connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"]
# "Full" converts the model to fp32, but only when the GPU reports more than
# 15000 MiB of memory; otherwise the model stays in fp16.
precision = "Full" #@param ["Half", "Full"]

#@markdown This notebook will extract the model tar file (if necessary) 
#@markdown and initialize it. <b>This will take several minutes.</b>
#@markdown When the model is ready, Flask will start and give you a 
#@markdown Cloudflare or Ngrok address which looks like this:<br/>
#@markdown <i>https://\<unique id\>.trycloudflare.com/</i><br/>
#@markdown <i>http://\<unique id\>.ngrok.io/</i><br/>
#@markdown <br/>
#@markdown You will need to right-click this and copy the address.
#@markdown Start the KoboldAI Client on your computer and choose 
#@markdown Google Colab as the model. You will be asked to paste 
#@markdown the copied address into the terminal.

from flask import Flask, redirect, url_for, request
import json
import torch
import requests
import subprocess
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline, AutoTokenizer
import tarfile
from google.colab import drive
import os
import re
import time
import gc
from threading import Timer
import numpy as np

if connect_method == "Cloudflare":
   from flask_cloudflared import run_with_cloudflared
elif connect_method == "Ngrok":
   from flask_ngrok import run_with_ngrok

# Thanks to finetune for some of this startup code, I'm really not
# familiar with the Colab environment
model         = None
tokenizer     = None
custom_models = ["2.7B-horni", "2.7B-horni-ln", "amaranth-2.7B"]

# Get access to the unpacked model folder
if model_name in custom_models:
   if is_archived:
      # Archived. Set path to tar file and unpack it
      model_gdrive = "/content/drive/MyDrive/gpt-neo-{0}.tar".format(model_name)
      # Skip the (slow) extraction when the folder is already present from an earlier run.
      if not os.path.isdir("gpt-neo-"+model_name):
         print(colored("Unpacking tar file, please wait...", "magenta"))
         # The context manager closes the archive even if extraction fails.
         # NOTE(review): extractall() trusts the member paths stored in the
         # archive; only unpack tar files from a source you trust.
         with tarfile.open(model_gdrive, "r") as tar:
            tar.extractall()
         print(colored("DONE!", "green"))
   else:
      # Unpacked model already available, just set the path to it
      model_gdrive = "/content/drive/MyDrive/gpt-neo-{0}".format(model_name)

# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))

if model_name not in custom_models:
   # Not a custom model: resolve the weights file through the transformers
   # cache helpers and load it directly onto the GPU.
   from transformers.file_utils import cached_path, WEIGHTS_NAME, hf_bucket_url
   weights_url = hf_bucket_url(model_name, filename=WEIGHTS_NAME)
   weights_path = cached_path(weights_url)
   checkpoint = torch.load(weights_path, map_location="cuda:0")
   # Convert every tensor of the state dict to fp16 before building the model.
   for tensor_name in list(checkpoint):
      checkpoint[tensor_name] = checkpoint[tensor_name].half()
   model_source = model_name
else:
   # Custom model: an unpacked archive lives in the Colab working directory,
   # otherwise the folder is read straight from the user's GDrive.
   model_source = "gpt-neo-" + model_name if is_archived else model_gdrive
   checkpoint = torch.load(model_source + "/pytorch_model.bin", map_location="cuda:0")

model = GPT2LMHeadModel.from_pretrained(model_source, state_dict=checkpoint).half().to("cuda").eval()

# Drop every reference to the loaded weights now that the model owns its own copy.
checkpoint.clear()
del checkpoint

# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab         = tokenizer.get_vocab()
vocab_keys    = vocab.keys()

def find_keys(char):
  """Return every vocabulary token whose text contains `char`."""
  return [key for key in vocab_keys if char in key]

# One search for "[" is sufficient: every token containing " [" also contains
# "[", so searching for both listed those tokens (and their ids) twice.
bad_words     = find_keys("[")
# Each entry is a one-element list holding a single token id.
# NOTE(review): bad_words_ids is not referenced by the request handler in this
# cell - confirm whether generation is supposed to apply it.
bad_words_ids = [[vocab[key]] for key in bad_words]

# Enable 32-bit mode if the GPU can handle it
fp32_min_vram_bytes = 15000 * 1024 * 1024
# The device is only queried when full precision was requested.
if precision == "Full" and torch.cuda.get_device_properties(0).total_memory > fp32_min_vram_bytes:
  print(colored("Big GPU detected, using fp32", "magenta"))
  model = model.float()

print(colored("DONE!", "green"))

app = Flask(__name__)

# Hook the selected tunnel provider into the app. Only the helper matching
# connect_method was imported above, so the branches must stay separate.
if connect_method == "Ngrok":
   run_with_ngrok(app)
elif connect_method == "Cloudflare":
   run_with_cloudflared(app)

@app.route("/")
def home():
    """Serve a small landing page confirming that the service is up."""
    banner = "<h1>KoboldAI Colab Service Running!</h1>"
    return banner

def _choose_from_top(probs, n=5):
    """Sample one token id among the `n` most probable entries of `probs`.

    probs: 1-D numpy array of token probabilities.
    Returns the selected index into `probs` as a Python int.
    """
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    return int(ind[choice][0])


def _generate(input_str, length=250, n=5):
    """Append `length` sampled tokens to `input_str` and return the decoded text.

    The returned string contains the prompt followed by the sampled tokens.
    Each new token is drawn from the `n` most probable candidates.
    """
    cur_ids = torch.tensor(tokenizer.encode(input_str)).unsqueeze(0).long().to("cuda")
    model.eval()
    with torch.no_grad():
        for _ in range(length):
            # Only the most recent 1024 tokens are fed to the model.
            window = cur_ids[:, -1024:]
            outputs = model(window, labels=window)
            _, logits = outputs[:2]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            next_token_id = _choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to("cuda") * next_token_id], dim=1)
    output_list = list(cur_ids.squeeze().to('cpu').numpy())
    return tokenizer.decode(output_list)


def _json_response(payload, status=200):
    """Wrap `payload` in a JSON Flask response with the given HTTP status."""
    return app.response_class(
        response=json.dumps(payload),
        status=status,
        mimetype='application/json'
    )


def _error_response():
    """Build the 400 response sent whenever generation fails."""
    return _json_response(
        {"error": {"extensions": {"code": "Something went wrong during generation!"}}},
        status=400
    )


@app.route('/request',methods = ['POST'])
def koboldrequest():
    """Handle a generation request posted by the KoboldAI client.

    Reads from the JSON body:
      "text"      - the prompt;
      "max"       - number of tokens to sample;
      "retfultxt" - optional; when truthy or absent the reply uses the old
                    {"data": {"text": ...}} format, otherwise the newer
                    {"data": {"seqs": [...]}} format.
    The "min", "rep_pen", "temperature", "top_p" and "numseqs" fields are not
    used by the sampler in this notebook; a single sequence is generated.

    The reply text is the segment that follows the first "<|endoftext|>" in the
    decoded output (up to the next one, if any).

    Returns a JSON response: status 200 on success, status 400 with an error
    payload when nothing usable was generated or an exception occurred. Every
    path returns a response, so Flask never receives None from this view.
    """
    try:
        clear_output()
        js         = request.json
        txt        = js["text"]
        max_length = js["max"]

        # Compatibility with un-updated clients
        retfultxt = js.get("retfultxt", True)

        print(colored("Received Data: {0}".format(txt), "yellow"))

        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))

        # Without a separator in the output there is no reply segment; treat
        # that as an empty generation instead of raising IndexError.
        parts = _generate(txt, max_length).split("<|endoftext|>")
        genout = parts[1] if len(parts) > 1 else ""

        if not genout:
            print(colored("[ERROR] Something went wrong during generation!", "red"))
            return _error_response()

        if retfultxt:
            # Outdated client, send old JSON format
            print(colored("Generated Text: {0}".format(genout), "cyan"))
            return _json_response({"data": {"text": genout}})

        # New client format: "seqs" is sent as a list of strings, not a bare string.
        seqs = [genout]
        for i, seq in enumerate(seqs):
            print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
        return _json_response({"data": {"seqs": seqs}})

    except Exception as e:
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        return _error_response()

# Status line shown right before the server starts (fixes the "Starup" typo).
print(colored("Startup complete! Running web service.", "green"))
# Starts the Flask server; requests are served while this cell keeps running.
app.run()

[33mReceived Data:  What's on TV?[0m
[32mGenerating text, please wait...[0m


127.0.0.1 - - [04/Jul/2021 05:05:24] "[37mPOST /request HTTP/1.1[0m" 200 -


[36mGenerated Text: I think you mean football and the football game[0m
