In [None]:
#@title pip-installs & stuff
#@markdown ## Install the required packages

!pip install sanic
# We use Sanic to create the http server

!pip install pyngrok
ngrok_token = "2yzWKXfe8GNcHdJWUxige_2YBbXvueHEYW17jM2E7hY" #@param {type:"string"}
#@markdown We need [ngrok](https://ngrok.com) to make the server accessible from the outside world via a public URL. If you don’t have an ngrok token, you can get one [here](https://dashboard.ngrok.com/tunnels/authtokens). Ngrok’s free tier is enough for [most testing purposes](https://ngrok.com/pricing).

# The rest of the packages are optional and depend on the model you are using
!pip install transformers
!pip install accelerate

from google.colab import drive
# drive.mount('/content/drive')
# (Uncomment the above line if you want to use Google Drive to store the model weights)

#@markdown ### You can run the notebook now. Click `Runtime` -> `Run all` in the menu above.

In [None]:
#@title Download the model
#@markdown ## 👈 Copy the contents of `download.py` here
#@markdown **Note:** Ideally, you want to rewrite the models so that they are downloaded and later reused from Google Drive. This way, you won't need to download the model every time you start a runtime. However, the code will be specific to the model you are using, so we will leave it as an exercise for the notebook user.

# In this file, we define download_model_weights
# It runs during container build time to get model weights built into the container

# In this example: A Huggingface BERT model

from transformers import pipeline

def download_model_weights():
    """Download the Hugging Face model weights once per runtime.

    A module-level flag, `weights_downloaded`, records that the download has
    already happened so that re-running the cell does not trigger it again.
    """
    # The flag must be declared global: it is assigned below, so without this
    # declaration Python would treat it as a local name, the probe would always
    # fail, and the flag would be discarded when the function returns.
    global weights_downloaded

    try:
        # Probe the flag; a NameError means it was never set in this runtime.
        weights_downloaded
    except NameError:
        # do a dry run of loading the huggingface model, which will download weights
        pipeline('fill-mask', model='bert-base-uncased')

        weights_downloaded = True

# Fetch the weights right away when this code is executed as the main program
if __name__ == "__main__": download_model_weights()

In [None]:
#@title Define model init/inference functions
#@markdown ## 👈 Copy the contents of `app.py` here
 
from transformers import pipeline
import torch

# Init is run on server startup
# Load your model to GPU as a global variable here using the variable name "model"
def init():
    """Create the global `model` pipeline unless one already exists.

    The pipeline is placed on GPU 0 when CUDA is available, otherwise on the
    CPU (device -1). Calling this again after a successful load only prints a
    notice and leaves the existing `model` untouched.
    """
    global model

    # An existing global means a previous call already built the pipeline
    if 'model' in globals():
        print("Model already loaded")
        return

    if torch.cuda.is_available():
        device = 0
    else:
        device = -1

    model = pipeline('fill-mask', model='bert-base-uncased', device=device)
    print("Model loaded")

# Inference is run for every server call
# Reference your preloaded global model variable here.
def inference(model_inputs:dict) -> dict:
    """Run the preloaded global `model` on the prompt found in `model_inputs`.

    Args:
        model_inputs: request payload; the text to process is read from its
            'prompt' key.

    Returns:
        Whatever the `model` callable returns for the prompt, or
        {'message': "No prompt provided"} when the payload is not a dict or
        carries no prompt.
    """
    # A missing or malformed body (e.g. None) is treated like a missing prompt
    # instead of crashing on `.get`.
    if not isinstance(model_inputs, dict):
        return {'message': "No prompt provided"}

    # Parse out your arguments
    prompt = model_inputs.get('prompt')
    if prompt is None:
        return {'message': "No prompt provided"}

    # Run the model and return its results
    return model(prompt)

# If testing with Colab, define a user_src object which has attributes for init and inference
# Check if imported modules include google.colab
# `sys` must be imported here: this cell runs before the server cell, which is the
# only other place that imports it.
import sys

if 'google.colab' in sys.modules:
  class UserSrc:
    """Stand-in for the `app` module, exposing `init` and `inference` as attributes."""

    def __init__(self):
      # Capture the functions as they are defined right now, so the references
      # survive even if a later cell rebinds these global names.
      self.init = init
      self.inference = inference

  user_src = UserSrc()

In [None]:
#@title Start the server
#@markdown ## 👈 Copy the contents of `server.py` here

import subprocess
import sys
from sanic import Sanic, response

# Pick the port for this run. Every re-run of the cell moves to the next port
# (a hack to avoid "address already in use" errors when running the cell multiple times).
try:
  port += 1
except NameError:
  port = 8000

if 'google.colab' in sys.modules:

  from pyngrok import ngrok

  Sanic._app_registry = {}
  # (We need this to remove the already created app if running the cell multiple times)

  # Authenticate with ngrok using the token from the install cell, if one was provided
  ngrok_auth_token = globals().get('ngrok_token')
  if ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
  else:
    print("Warning: no ngrok token set; the tunnel may fail to start")

  # A tunnel left over from a previous run still forwards to the previous port,
  # which the new server will not listen on, so replace it.
  # Note that the public URL changes whenever the tunnel is replaced.
  stale_tunnel = globals().get('ngrok_tunnel')
  if stale_tunnel is not None:
    print("Replacing the tunnel from the previous run")
    ngrok.disconnect(stale_tunnel.public_url)

  # Start the ngrok tunnel and show its public URL. It will look like this:
  # <NgrokTunnel: "http://<some_id>.ngrok.io" -> "http://localhost:8000">
  ngrok_tunnel = ngrok.connect(port)
  print(ngrok_tunnel)

else:

  import app as user_src
  # (If testing with colab, the app interface is defined in the cell above)

# We do the model load-to-GPU step on server startup
# so the model object is available globally for reuse
user_src.init()

# Create the http server app.


server = Sanic("my_app")

# Healthchecks verify that the environment is correct on Banana Serverless
@server.route('/healthcheck', methods=["GET"])
def healthcheck(request):
  """Report that the service is up and whether `nvidia-smi` ran successfully."""
  # Dependency-free GPU probe: a zero exit status from the shell command counts as "GPU visible"
  smi = subprocess.run("nvidia-smi", shell=True)
  gpu_visible = smi.returncode == 0

  status = {"state": "healthy", "gpu": gpu_visible}
  return response.json(status)

# Inference POST handler at '/' is called for every http call from Banana
@server.route('/', methods=["POST"]) 
def inference(request):
  """Handle an inference request: decode the JSON body and pass it to `user_src.inference`.

  NOTE: in the notebook this handler rebinds the global name `inference` that the
  model cell defined. `user_src` stored its own reference to the model function
  before this definition ran, so the call below still reaches the model function.
  """
  import json

  model_inputs = request.json

  # Some clients send a JSON document that is itself a JSON-encoded string;
  # decode it a second time. If it is not valid JSON, pass the string through as-is.
  if isinstance(model_inputs, str):
    try:
      model_inputs = json.loads(model_inputs)
    except ValueError:
      pass

  output = user_src.inference(model_inputs)

  return response.json(output)


# Serve on all interfaces, on the port chosen above, with a single worker
if __name__ == '__main__':
  server.run(
    host='0.0.0.0',
    port=port,
    workers=1,
  )

In [None]:
#@title That’s it!
#@markdown ### Now test your model by running `python test.py` in your *local* terminal. You will be prompted for the public URL of your server, which you can find in the console after running the cell above.