In [2]:
import json
import os
from unskript import nbparams
from unskript.secrets import ENV_MODE, ENV_MODE_AWS
from unskript.fwk.workflow import Task, Workflow

# Runtime environment descriptor for this notebook; it is handed to Workflow() and merged into the notebook parameters below.
env = {"ENV_MODE":"ENV_MODE_UNSKRIPT_HOSTED","TENANT_ID":"da0129a2-da0d-4d69-acc0-c3f251bf8b10","PROXY_ID":"0cbcf9dd-c2e6-4b54-a55a-4f5985cf87cb","TENANT_URL":"https://us.app.unskript.io","AWS_REGION":"us-west-2"}
# Secret-store configuration; also handed to Workflow() and merged into the notebook parameters.
secret_store_cfg = {"SECRET_STORE_TYPE":"SECRET_STORE_TYPE_AWS","AWS_SECRET_PREFIX":"","AWS_REGION":"us-west-2"}
# Settings exported through process environment variables.
os.environ["UNSKRIPT_REDIS_HOST"] = "redis-master.redis.svc.cluster.local"
# NOTE(review): this looks like a credential committed in clear text — confirm it is not a live
# token and prefer injecting it from outside the notebook.
os.environ["UNSKRIPT_TOKEN"] = "5c4a5754-0600-11ec-9a03-0242ac130003"
os.environ["TENANT_URL"] = env["TENANT_URL"]
# Notebook parameters: start from the `namespace` value, then merge env and the secret-store config.
# NOTE(review): the namespace value is identical to TENANT_ID above, which is unlikely to be a real
# Kubernetes namespace — confirm the intended default.
paramDict = {"namespace":"da0129a2-da0d-4d69-acc0-c3f251bf8b10"}
paramDict.update(env)
paramDict.update(secret_store_cfg)
# The parameters are serialised to JSON because NBParams is constructed from a JSON string.
paramsJson = json.dumps(paramDict)
nbParamsObj = nbparams.NBParams(paramsJson)
# `namespace` becomes a notebook-level variable; later cells read it by name.
namespace = nbParamsObj.get('namespace')

# Workflow object for the notebook; it receives this module's globals() as global_vars.
w = Workflow(env, secret_store_cfg, None, global_vars=globals(), check_uuids=None)



<hr><hr><center><img src="https://unskript.com/assets/favicon.png" alt="unSkript.com" width="100" height="100">
<h1 id="unSkript-Runbooks&para;">unSkript Runbooks<a class="jp-InternalAnchorLink" href="#unSkript-Runbooks" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#unSkript-Runbooks&para;" target="_self">&para;</a></h1>
<div class="alert alert-block alert-success">
<h3 id="Objective&para;">Objective<a class="jp-InternalAnchorLink" href="#Objective" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Objective&para;" target="_self">&para;</a></h3>
<br><strong style="color: #000000;"><em>Fix K8s Pod in ImagePullBackOff State</em></strong></div>
</center>
<p>&nbsp;</p>
<center>
<h2 id="K8S-Pod-in-ImagePullBackOff-State&para;"><u>K8S Pod in ImagePullBackOff State</u><a class="jp-InternalAnchorLink" href="#K8S-Pod-in-ImagePullBackOff-State" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#K8S-Pod-in-ImagePullBackOff-State&para;" target="_self">&para;</a></h2>
</center>
<h1 id="Steps-Overview&para;">Steps Overview<a class="jp-InternalAnchorLink" href="#Steps-Overview" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Steps-Overview&para;" target="_self">&para;</a></h1>
<p>1)&nbsp;<a href="#1" target="_self" rel="noopener">Get list of pods in ImagePullBackOff State</a><br>2)&nbsp;<a href="#2" target="_self" rel="noopener">Extract Events of the pods</a><br>3)&nbsp;<a href="#3" target="_self" rel="noopener">Check registry accessibility</a></p>
<p>An <code>ImagePullBackOff</code> error occurs when a Pod fails to start because Kubernetes cannot pull the specified image. Typical causes are an image repository that does not exist or missing permission to access the repository. This runbook walks through the steps involved in debugging such a Pod.</p>
<hr>

<h3 id="Convert-namespace-to-String-if-empty&para;">Convert namespace to String if empty<a class="jp-InternalAnchorLink" href="#Convert-namespace-to-String-if-empty" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Convert-namespace-to-String-if-empty&para;" target="_self">&para;</a></h3>
<p>This custom action replaces a missing namespace (<code>None</code>) with an empty string; a namespace that was provided is left unchanged.</p>

In [3]:
# A missing namespace (None) is normalised to '' so that the value handed to the
# next action is always a string. `is None` is the identity test for the None
# singleton and cannot be fooled by a custom __eq__.
if namespace is None:
    namespace = ''


<h3 id="Get-List-of-Pods-in-CrashLoopBackOff-State"><a id="1" target="_self" rel="nofollow"></a>Get List of Pods in ImagePullBackOff State<a class="jp-InternalAnchorLink" href="#Get-List-of-Pods-in-CrashLoopBackOff-State" target="_self">&para;</a></h3>
<p>This action fetches a list of the pods in ImagePullBackOff State. This action will consider <code>namespace</code> as&nbsp;<strong> all&nbsp;</strong>if no namespace is given.</p>
<blockquote>
<p>This action takes the following parameters (Optional):&nbsp;<code>namespace</code></p>
</blockquote>
<blockquote>
<p>This action captures the following output: <code>imagepullbackoff_pods</code></p>
</blockquote>

In [4]:
#
# Copyright (c) 2022 unSkript.com
# All rights reserved.
#

from pydantic import BaseModel, Field
from typing import Optional, Tuple
from unskript.legos.utils import CheckOutput, CheckOutputStatus
from collections import defaultdict
import json
import pprint
import re

from beartype import beartype
@beartype
def k8s_get_pods_in_imagepullbackoff_state_printer(output):
    """Display the result of k8s_get_pods_in_imagepullbackoff_state.

        :type output: object
        :param output: Value to display. None prints nothing, a CheckOutput is
            printed through its json() method, anything else is pretty-printed.
    """
    if output is None:
        return
    if not isinstance(output, CheckOutput):
        pprint.pprint(output)
        return
    print(output.json())


@beartype
def k8s_get_pods_in_imagepullbackoff_state(handle, namespace: Optional[str] = None) -> Tuple:
    """k8s_get_pods_in_imagepullbackoff_state executes a kubectl command to find pods in ImagePullBackOff State

        :type handle: object
        :param handle: Object returned from the Task validate method

        :type namespace: Optional[str]
        :param namespace: Namespace to get the pods from. Eg:"logging", if not given all namespaces are considered

        :rtype: Tuple (status, result).
            (True, None) when no pod is in ImagePullBackOff State;
            (False, [{namespace: [pod, ...]}]) when such pods were found;
            (False, None) when the connector is invalid or the command result is unusable.
    """
    # Local import so the cell does not depend on an edit to the import block.
    import shlex

    if handle.client_side_validation != True:
        print(f"K8S Connector is invalid: {handle}")
        # The declared return type is Tuple; returning str() here would be
        # rejected by beartype instead of reporting the problem.
        return (False, None)

    if namespace:
        # The command is a shell pipeline, so the namespace is quoted before it is
        # spliced in. shlex.quote leaves a plain namespace name unchanged.
        kubectl_command = ("kubectl get pods -n " + shlex.quote(namespace)
                           + " | grep ImagePullBackOff | cut -d' ' -f 1 | tr -d ' '")
    else:
        kubectl_command = "kubectl get pods --all-namespaces | grep ImagePullBackOff | tr -s ' ' | cut -d ' ' -f 1,2"

    response = handle.run_native_cmd(kubectl_command)
    if response is None or hasattr(response, "stderr") is False or response.stderr is None:
        # getattr: `response` may be None or lack `stderr` on this path.
        print(
            f"Error while executing command ({kubectl_command}): {getattr(response, 'stderr', None)}")
        return (False, None)

    # Each output line is "<pod>" (single namespace) or "<namespace> <pod>"
    # (all namespaces); group the pod names by namespace.
    pods_by_namespace = defaultdict(list)
    for line in (response.stdout or "").splitlines():
        fields = line.split()
        if not fields:
            continue
        if namespace:
            pods_by_namespace[namespace].append(fields[0])
        elif len(fields) >= 2:
            pods_by_namespace[fields[0]].append(fields[1])

    if pods_by_namespace:
        return (False, [dict(pods_by_namespace)])
    return (True, None)


# Task wrapper that runs the action defined above.
# NOTE(review): built on a fresh Workflow() rather than the `w` instance created in the first
# cell — confirm this is intended.
task = Task(Workflow())
# Credential selection: an entry of type CONNECTOR_TYPE_K8S named "unskript".
task.configure(credentialsJson='''{
    "credential_name": "unskript",
    "credential_type": "CONNECTOR_TYPE_K8S",
    "credential_id": "e131b971-8a51-4a4e-a93e-acae63f94242"
}''')
# Input mapping for the action's `namespace` parameter. The value is presumably the name of the
# notebook variable to read (vars() is passed to validate below) — TODO confirm.
task.configure(inputParamsJson='''{
    "namespace": "namespace"
    }''')

# Output name; the next code cell reads a variable called `imagepullbackoff_pods`.
task.configure(outputName="imagepullbackoff_pods")

task.configure(printOutput=True)
# validate() yields (error, handle, arguments); the action only runs when no error was reported.
(err, hdl, args) = task.validate(vars=vars())
if err is None:
    task.execute(k8s_get_pods_in_imagepullbackoff_state, lego_printer=k8s_get_pods_in_imagepullbackoff_state_printer, hdl=hdl, args=args)

<h3 id="Create-List-of-commands-to-get-Events&para;">Create List of commands to get Events<a class="jp-InternalAnchorLink" href="#Examine-the-Events" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Create-List-of-commands-to-get-Events&para;" target="_self">&para;</a></h3>
<p>Examine the output from Step 1👆 and create a list of <code>kubectl describe</code> commands, one per namespace, covering every pod found to be in the ImagePullBackOff State.</p>
<blockquote>
<p>This action captures the following output:&nbsp;<code>all_unhealthy_pods</code></p>
</blockquote>

In [5]:
# Build one `kubectl describe pod ... | grep -A 10 Events` command per namespace
# from the output of the previous step. Every pod reported for a namespace is
# passed to the same describe command.
all_unhealthy_pods = []
for each_pod_dict in imagepullbackoff_pods:
    # Only list elements of the step output carry pod data; anything else
    # (for example the status flag or None) is skipped.
    if not isinstance(each_pod_dict, list):
        continue
    for pod in each_pod_dict:
        for nspace, pod_names in pod.items():
            if not pod_names:
                continue
            u_pod = ' '.join(str(each_pod) for each_pod in pod_names)
            cmd = "kubectl describe pod "+u_pod+" -n "+nspace+" | grep -A 10 Events"
            all_unhealthy_pods.append(cmd)
print(all_unhealthy_pods)

<h3 id="Gather-information-of-the-pods">Extract Events of the pods<a class="jp-InternalAnchorLink" href="#Gather-information-of-the-pods" target="_self">&para;</a></h3>
<p>This action describes events for a list of unhealthy pods obtained in Step 1.</p>
<blockquote>
<p>This action captures the following output: <code>describe_output</code></p>
</blockquote>

In [6]:
#
# Copyright (c) 2022 unSkript.com
# All rights reserved.
#

from pydantic import BaseModel, Field

from beartype import beartype
@beartype
def k8s_kubectl_command_printer(output):
    """Print the output of k8s_kubectl_command; nothing is printed for None.

        :type output: object
        :param output: Value returned by the action.
    """
    if output is not None:
        print(output)


@beartype
def k8s_kubectl_command(handle, kubectl_command: str) -> str:
    """k8s_kubectl_command executes the given kubectl command on the pod

        :type handle: object
        :param handle: Object returned from the Task validate method

        :type kubectl_command: str
        :param kubectl_command: The Actual kubectl command, like kubectl get ns, etc..

        :rtype: String, Output of the command in python string format or Empty String in case of Error.
    """
    if handle.client_side_validation != True:
        print(f"K8S Connector is invalid: {handle}")
        return str()
    result = handle.run_native_cmd(kubectl_command)
    if result is None or hasattr(result, "stderr") is False or result.stderr is None:
        # getattr: on this path `result` may be None or have no `stderr`, so a
        # direct attribute access would raise instead of reporting the error.
        print(
            f"Error while executing command ({kubectl_command}): {getattr(result, 'stderr', None)}")
        return str()

    return result.stdout


# Task wrapper that runs k8s_kubectl_command.
# NOTE(review): built on a fresh Workflow() rather than the `w` instance created in the first
# cell — confirm this is intended.
task = Task(Workflow())
# Credential selection: an entry of type CONNECTOR_TYPE_K8S named "unskript".
task.configure(credentialsJson='''{
    "credential_name": "unskript",
    "credential_type": "CONNECTOR_TYPE_K8S",
    "credential_id": "e131b971-8a51-4a4e-a93e-acae63f94242"
}''')
task.configure(continueOnError=True)
# `kubectl_command` is fed from "iter_item", presumably the current element of the
# iteration configured below — TODO confirm against the framework.
task.configure(inputParamsJson='''{
    "kubectl_command": "iter_item"
    }''')
# Iteration settings: the list is named `all_unhealthy_pods` (built in the previous code cell)
# and the iterated parameter is `kubectl_command`.
task.configure(iterJson='''{
    "iter_enabled": true,
    "iter_list_is_const": false,
    "iter_list": "all_unhealthy_pods",
    "iter_parameter": "kubectl_command"
    }''')
# Condition expression: the list of commands must be non-empty.
# NOTE(review): whether a false condition leaves `describe_output` unbound is not visible here.
task.configure(conditionsJson='''{
    "condition_enabled": true,
    "condition_cfg": "len(all_unhealthy_pods)!=0",
    "condition_result": true
    }''')
# Output name; the "Examine the Events" cell reads a variable called `describe_output`.
task.configure(outputName="describe_output")

task.configure(printOutput=True)
# validate() yields (error, handle, arguments); the action only runs when no error was reported.
(err, hdl, args) = task.validate(vars=vars())
if err is None:
    task.execute(k8s_kubectl_command, lego_printer=k8s_kubectl_command_printer, hdl=hdl, args=args)

<h3 id="Convert-to-String&para;">Convert to String<a class="jp-InternalAnchorLink" href="#Examine-the-Events" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Convert-to-String&para;" target="_self">&para;</a></h3>
<p>From the output of Step 2👆, we convert the dict output to a string format.</p>
<blockquote>
<p>This action captures the following output: <code>all_describe_info</code></p>
</blockquote>

<h3 id="Examine-the-Events&para;">Examine the Events<a class="jp-InternalAnchorLink" href="#Examine-the-Events" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Examine-the-Events&para;" target="_self">&para;</a></h3>
<p>This Custom Action searches the pod events for known errors.&nbsp;The well-known errors are listed in the <code>error_msgs</code> variable. If&nbsp;a new error message is found, it can be added to the list.</p>

In [13]:
import re

# This Custom Action searches the pod events held in `describe_output` for
# known errors. The well known errors are listed in the error_msgs variable.
# If a new error message is found, add it to this list and the next run of
# the runbook will catch that error.


def check_msg(msg, err):
    """Return the re.Match for pattern `err` in `msg`, or None when it does not occur."""
    return re.search(err, msg)


error_msgs = ["repository (.*) does not exist or no pull access",
              "manifest for (.*) not found",
              "pull access denied, repository does not exist or may require authorization",
              "Back-off pulling image (.*)"]

# `describe_output` may be unbound when the previous step did not run, so it is
# looked up defensively instead of being referenced directly.
pod_events = globals().get("describe_output") or {}
# Flattened text of every collected event block; used as the fallback message
# when none of the known errors is recognised.
all_describe_info = "\n".join(str(value) for value in pod_events.values())

cause_found = False
result = None        # the match that supplied repoLocation, if any
repoLocation = ''
for key, value in pod_events.items():
    # re.search needs text; skip entries that hold no string output.
    if not isinstance(value, str):
        continue
    for err in error_msgs:
        match = check_msg(value, err)
        if match is None:
            continue
        # Report the matched message itself, not the whole event dump.
        print("PROBABLE CAUSE: ", match.group(0))
        cause_found = True
        # Keep the first captured location. Not every pattern has a capture
        # group, and later non-matching searches must not discard it.
        if not repoLocation and match.groups():
            result = match
            repoLocation = match.group(1)

if cause_found is False:
    print("ERROR MESSAGE : \n", all_describe_info)
elif repoLocation:
    print("Image Repo Location : ", repoLocation)

<h3 id="Create-List-of-commands-to-get-Exit-Code&para;">Check Registry Accessibility<a class="jp-InternalAnchorLink" href="#Examine-the-Events" target="_self" rel="noopener">&para;</a><a class="jp-InternalAnchorLink" href="#Create-List-of-commands-to-get-Exit-Code&para;" target="_self">&para;</a></h3>
<p>Using the output of Step 2B👆, check whether the <code>repoLocation</code> is accessible.</p>

In [9]:
from IPython.display import Markdown as md

# repoLocation is initialised to '' by the previous cell, so an `is not None`
# test is always true; only prompt when a location was actually extracted.
if repoLocation:
    display(md(f"**Please verify Repo {repoLocation} is accessible from the K8S POD**"))

<h3 id="Conclusion">Conclusion<a class="jp-InternalAnchorLink" href="#Conclusion" target="_self">&para;</a></h3>
<p>In this Runbook, we were able to identify pods stuck in ImagePullBackOff State and examined the events that may have caused their failure using unSkript's K8s actions. To view the full platform capabilities of unSkript please visit <a href="https://us.app.unskript.io" target="_blank" rel="noopener">us.app.unskript.io</a></p>