## Setup

### Initialization

In [1]:
# upgrade thoth-lab from the master
# !pip install --upgrade -e "git://github.com/thoth-station/lab#egg=thoth-lab"

# unfortunately, jupyter-tools package already exists on PyPI, we are gonna have to solve this issue somehow
# !pip install --upgrade --pre -e "git://github.com/CermakM/jupyter-tools#egg=jupyter-tools" --exists-action w
# !pip install --upgrade --pre jupyter-require

# from jupyter_tools import utils

# utils.install_nbextension('jupyter_require')
# utils.load_nbextension('jupyter-require')

In [2]:
# ! pip install spacy
# !python -m spacy download en

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import io
import sys;

sys.path.insert(0, 'user-api')

In [5]:
from collections import Counter

# sci
import numpy as np
import pandas as pd

# net
import networkx as nx

# nlp
import spacy

# viz
from jupyter_require import require
from jupyter_require import notebook

from thoth.lab.viz import init_notebook_mode
from thoth.lab.viz import plot as plot_dependencies

from thoth.user_api.parsing import parse_log

In [6]:
init_notebook_mode()

<JupyterRequire.display.SafeScript object>

<JupyterRequire.display.SafeScript object>

In [7]:
%reload_ext jupyter_require
%reload_ext jupyter_tools

### Utilities

In [8]:
# installation states
_STATES = ['prepare', 'resolve', 'build', 'install', 'postface', 'ANOMALY']

# phrases identifying certain state
# ('ANOMALY' has no phrases of its own: it is not a key of this mapping)
_DEFAULT_STATE_PHRASES = {
    'prepare': [
        'Processing', 'Collecting', 'Downloading', 'Using cached',
        'Requirement already satisfied'
    ],
    'build': [
        'Building wheels', 'Running', 'Stored in directory',
        'Successfully built'
    ],
    'resolve': [
        # this state produces incompatibility warnings,
        # but does not crash the build
        'not installed', 'incompatible'
    ],
    'install': [
        'Installing', 'Found existing', 'Uninstalling',
        'install', 'Successfully installed',
        'Successfully uninstalled'
    ],
    'postface': [
        'You are using pip version', 'You should consider upgrading'
    ]
}

# colors for the states
_DEFAULT_STATE_COLORS = {
    'prepare': '#314054',
    'build': '#146461',
    'resolve': '#4B824C',
    'install': '#A4933A',
    'postface': 'gray',
    'ANOMALY': 'red'
}

# state -> color, in the order given by _STATES.
# Looking each state up explicitly matters: zipping _STATES with the dict
# would pair states with the dict's *keys* (state names), not its colors.
color_mapper = {state: _DEFAULT_STATE_COLORS[state] for state in _STATES}

In [9]:
%%load_style id="text-highlight"

/* Background colors for highlighted log lines: one rule per installation
   state, selected through the class set on each <pre> element. */
/* NOTE(review): the resolve (#A4933A) and install (#4B824C) backgrounds below
   are swapped relative to _DEFAULT_STATE_COLORS (resolve: #4B824C,
   install: #A4933A) - confirm which assignment is intended. */
div.output_area pre.prepare {
    color: white;
    background-color: #314054;
}

div.output_area pre.build {
    color: white;
    background-color: #146461;
}

div.output_area pre.resolve {
    color: white;
    background-color: #A4933A;
}

div.output_area pre.install {
    color: white;
    background-color: #4B824C;
}

div.output_area pre.postface {
    color: white;
    background-color: gray;
}

div.output_area pre.ANOMALY {
    color: white;
    background-color: red;
}

<JupyterRequire.display.SafeScript object>

In [10]:
def highlight(df, content: str = None, highlight: str = None, colors: list = None):
    """Render the rows of a dataframe as highlighted HTML lines.

    Each row becomes one ``<pre>`` element showing the row's index label and
    its text, with the value of the ``highlight`` column used as CSS class.

    :param df: dataframe to render, one output line per row
    :param content: name of the column holding the text of the line
    :param highlight: name of the column holding the CSS class of the line
    :param colors: optional background colors, one per row **by position**;
        when omitted, the inline ``background-color`` is left empty
    :returns: ``IPython.display.HTML`` object with the lines joined by ``<br>``
    :raises AssertionError: if ``colors`` is given but its length differs
        from the number of rows
    """
    from html import escape
    from IPython.display import HTML

    colors = colors or []

    if len(colors) > 0:
        assert len(colors) == len(df), "colors must provide one entry per row of df"

    line_template = """
        <span><pre style="background-color: {col};" class="{cls}">{idx: <3} | {content}</pre></span>
    """

    html = []
    # enumerate() gives the row position: index labels are not guaranteed to
    # be 0..n-1, so they cannot be used to index into `colors`
    for pos, (idx, row) in enumerate(df.iterrows()):
        line = line_template.format(
            col=colors[pos] if len(colors) > 0 else "",
            idx=idx,
            # escape values coming from the data so that raw log text
            # cannot inject markup into the rendered output
            cls=escape(str(row[highlight]), quote=True),
            content=escape(str(row[content]), quote=False)
        )

        html.append(line)

    return HTML('<br>'.join(html))

### Data

In [11]:
def _read_log(path):
    """Return the whole content of the text file at ``path``."""
    with open(path, 'r') as handle:
        return handle.read()


# raw pip logs used throughout the analysis below
log_failed = _read_log('logs/thoth-lab.pip.failed.log')
log_failed_collect = _read_log('logs/thoth-lab.pip.failed.2.log')
log_success = _read_log('logs/thoth-lab.pip.success.log')
log_clean = _read_log('logs/thoth-lab.pip.clean.log')

### Peek

#### Success

In [12]:
log_success

"Processing /home/macermak/RedHat/aicoe/lab\nCollecting networkx (from thoth-lab==0.0.3)\nCollecting requests (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl\nCollecting pandas (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/e6/de/a0d3defd8f338eaf53ef716e40ef6d6c277c35d50e09b586e170169cdf0d/pandas-0.24.1-cp36-cp36m-manylinux1_x86_64.whl\nCollecting plotly (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/fd/db/003b5cfbc710f4d4982440451185b952269e4080a57ae7e760a2ceb8ce0c/plotly-3.6.1-py2.py3-none-any.whl\nCollecting decorator>=4.3.0 (from networkx->thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/f1/cd/7c8240007e9716b14679bc217a1baefa4432aa30394f7e2ec40a52b1a708/decorator-4.3.2-py2.py3-none-any.whl\nCollecting idna<2.9,>=2.5 (from requests->thoth-lab==0.0.3)\n  Using 

#### Failed

In [13]:
log_failed

"Processing /home/macermak/RedHat/aicoe/lab\nCollecting networkx (from thoth-lab==0.0.3)\nCollecting requests (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl\nCollecting pandas (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/e6/de/a0d3defd8f338eaf53ef716e40ef6d6c277c35d50e09b586e170169cdf0d/pandas-0.24.1-cp36-cp36m-manylinux1_x86_64.whl\nCollecting plotly (from thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/fd/db/003b5cfbc710f4d4982440451185b952269e4080a57ae7e760a2ceb8ce0c/plotly-3.6.1-py2.py3-none-any.whl\nCollecting decorator>=4.3.0 (from networkx->thoth-lab==0.0.3)\n  Using cached https://files.pythonhosted.org/packages/f1/cd/7c8240007e9716b14679bc217a1baefa4432aa30394f7e2ec40a52b1a708/decorator-4.3.2-py2.py3-none-any.whl\nCollecting idna<2.9,>=2.5 (from requests->thoth-lab==0.0.3)\n  Using 

#### Clean

In [14]:
log_clean



## Naive 1.0 -- pip3

### Definitions

In [15]:
from collections import OrderedDict


class DependencyGraph(nx.OrderedDiGraph):
    """Directed dependency graph based on ``nx.OrderedDiGraph``.

    Both the node mapping and the adjacency-list mapping are created
    with ``OrderedDict``.
    """
    # factory used to create the node dictionary
    node_dict_factory = OrderedDict
    # factory used to create the adjacency list dictionaries
    adjlist_dict_factory = OrderedDict

def get_root(tree):
    """Return root of the current graph, if any.

    A root is a node with no incoming edges (in-degree 0).

    By default, tree topology is considered as input,
    so if there are multiple roots, only the first one is returned.

    :param tree: graph exposing ``in_degree()`` which yields
        ``(node, in_degree)`` pairs
    :returns: the first node with in-degree 0, or ``None`` when the graph
        is empty or every node has an incoming edge
    """
    for node, degree in tree.in_degree():
        # the first node is not necessarily a root, so check the degree
        if degree == 0:
            return node

    return None

In [16]:
def build_dependency_table(raw_data: str,
                           root: str = None,
                           source: str = 'from',
                           target: str = 'package') -> pd.DataFrame:
    """Build dependency table from raw data.

    The raw log is parsed with ``parse_log`` and its ``result`` records are
    flattened into a dataframe. Two columns are derived: ``target`` (copy of
    the ``target`` column argument) and ``source`` (package name of the first
    entry of the ``source`` column argument, or ``''`` when it is empty).
    A row with index ``-1`` and an empty source is added for the root.

    :param raw_data: raw installation log
    :param root: name of the root package; guessed when not provided as the
        single source which never appears as a target
    :param source: name of the parsed column holding the requesting packages
    :param target: name of the parsed column holding the requested package
    :returns: dependency table with columns sorted by name
    :raises ValueError: if ``root`` is not given and cannot be determined
        unambiguously
    """
    # `pd.json_normalize` is the public entry point in newer pandas releases,
    # older releases only expose it as `pd.io.json.json_normalize`
    json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    df = json_normalize(parse_log(raw_data), record_path='result')

    df['target'] = df[target]
    df['source'] = df[source] \
        .apply(lambda r: r[0]['package'] if r else '')

    if not root:
        # try to guess the root by missing target package
        targets = set(df.target.unique())
        sources = set(df.source.unique())
        root_candidates = sources - targets

        if len(root_candidates) > 1:
            raise ValueError(
                f"No root specified and multiple roots found: {sorted(root_candidates)}")
        if not root_candidates:
            raise ValueError("No root specified and no root candidate found.")

        root, = root_candidates

    # Create root node
    d = pd.DataFrame({'source': '', 'target': root}, columns=df.columns, index=[-1])

    # pd.concat replaces DataFrame.append, which newer pandas no longer provides
    df = pd.concat([df, d]).sort_index().reindex(sorted(df.columns), axis=1)

    # the original columns are superseded by 'source' / 'target'
    if source != 'source':
        df = df.drop(columns=source)
    if target != 'target':
        df = df.drop(columns=target)

    return df
    

In [17]:
def build_dependency_graph(dep_table: pd.DataFrame):
    """Build dependency graph from dependency table.

    Rows with an empty ``source`` are ignored; every remaining row
    contributes one ``source -> target`` edge.

    :param dep_table: table with ``source`` and ``target`` columns
    :returns: breadth-first tree (``nx.bfs_tree``) rooted at ``get_root``
        of the assembled graph
    """
    df = dep_table[dep_table.source != '']

    # pd.concat replaces Series.append, which newer pandas no longer provides;
    # sources come first so that they are also the first nodes of the graph
    nodes = pd.concat([df.source, df.target]).unique()
    edges = list(zip(df.source, df.target))

    g = DependencyGraph()
    g.add_nodes_from(nodes)
    g.add_edges_from(edges)

    # root tree at top-level package
    tree = nx.bfs_tree(g, get_root(g))  # collecting is breadth first by default

    return tree

In [18]:
def classify_state(line, anomaly='ANOMALY'):
    """Assign state to the single log line.

    Every state from ``_DEFAULT_STATE_PHRASES`` is scored by the number of
    its phrases found in the line (whole-word, case-insensitive match).
    The ``anomaly`` state gets a fixed score of 0.5, so it wins only when
    no phrase of any other state matches.

    :param line: single line of the log
    :param anomaly: name of the state assigned to unrecognized lines
    :returns: ``(score, state)`` tuple of the best scoring state; ties are
        resolved by tuple comparison, i.e. in favor of the greater state name
    """
    import re

    line = line.strip()
    scores = {
        state: 0 for state in _STATES
    }
    scores[anomaly] = 0.5

    for state, phrases in _DEFAULT_STATE_PHRASES.items():
        scores[state] = sum(
            # phrases are literal text: escape them so that regex
            # metacharacters in a phrase are not interpreted as a pattern
            bool(re.search(r"\b{}\b".format(re.escape(p)), line, re.IGNORECASE))
            for p in phrases
        )

    return max((score, key) for key, score in scores.items())

def build_log_table(raw_data: "Union[List[str], str]"):
    """Create build log dataframe from raw data.

    :param raw_data: either the whole log as a single string (it is stripped
        and split into lines) or an iterable of log lines
    :returns: dataframe with one row per line and the columns ``line``,
        ``state`` and ``score``, where state and score come from
        ``classify_state``
    """
    if isinstance(raw_data, str):
        lines = raw_data.strip().splitlines()
    else:
        # materialize so that any iterable of lines is accepted
        lines = list(raw_data)

    classified = [classify_state(line) for line in lines]
    scores = [score for score, _ in classified]
    states = [state for _, state in classified]

    return pd.DataFrame({'line': lines, 'state': states, 'score': scores})

### Analysis

In [19]:
log = log_failed

In [20]:
%%suppress_warnings

dep_table = build_dependency_table(log)
dep_table

Unnamed: 0,already_satisfied,artifact,source,target,version_installed,version_specified
-1,,,,thoth-lab,,
0,,,thoth-lab,networkx,,
1,,,thoth-lab,requests,,
2,,,thoth-lab,pandas,,
3,,,thoth-lab,plotly,,
4,,,networkx,decorator,,>=4.3.0
5,,,requests,idna,,"<2.9,>=2.5"
6,,,requests,urllib3,,"<1.25,>=1.21.1"
7,,,requests,certifi,,>=2017.4.17
8,,,requests,chardet,,"<3.1.0,>=3.0.2"


Get installation state and set it as node attribute

In [21]:
# Classify every line of the selected log into an installation state.
dep_log = build_log_table(log)
# Add a 'color' column by mapping each state through color_mapper.
dep_log['color'] = dep_log.state.map(color_mapper)

# Render the log line by line, using the state as the CSS class of each line.
highlight(dep_log, content='line', highlight='state')

In [22]:
from collections import Counter

nlp = spacy.load('en')

def softmax(X):
    """Compute softmax of ``X`` along the first axis.

    The maximum is subtracted before exponentiation. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing (and the
    result from becoming NaN) for large inputs.

    :param X: array-like of scores
    :returns: ``np.ndarray`` of the same shape whose values sum to 1 along axis 0
    """
    values = np.asarray(X, dtype=float)
    if values.size == 0:
        # nothing to normalize; np.max would raise on an empty array
        return np.exp(values)

    exps = np.exp(values - np.max(values, axis=0, keepdims=True))

    return exps / np.sum(exps, axis=0, keepdims=True)

def parse_log_anomalies(df: pd.DataFrame, g: DependencyGraph, top: int = None):
    """Parse log anomalies and try to match corresponding packages.

    Lines classified as ``ANOMALY`` are tokenized with ``nlp`` and every
    token which is a node of the dependency graph counts as one mention
    of that package.

    :param df: log table with ``state`` and ``line`` columns
    :param g: dependency graph used to recognize package names
    :param top: maximum number of packages to return (all when ``None``)
    :returns: ``(scores, packages)`` where packages are ordered from the most
        to the least mentioned one (ties by name) and scores are the softmax
        of the mention counts; ``([], [])`` when there is no anomaly or no
        package could be matched
    """
    anomalies = df.query('state == "ANOMALY"').line

    if len(anomalies) <= 0:
        return [], []

    mentions = Counter()
    for line in anomalies:
        mentions.update(
            token.orth_ for token in nlp(line) if g.has_node(token.orth_))

    # most mentioned first, so that `top` keeps the likeliest candidates
    # and the first returned package is the most probable one
    ranked = sorted(
        mentions.items(), key=lambda item: (-item[1], item[0]))[:top]

    if not ranked:
        # anomalies which mention no known package (or top == 0)
        return [], []

    packages, scores = zip(*ranked)

    return softmax(scores), packages
    
# Build the dependency tree from the dependency table.
g: DependencyGraph = build_dependency_graph(dep_table)

# Match the anomalous log lines against the packages present in the tree.
scores, packages = parse_log_anomalies(dep_log, g)
# NOTE: not the last expression of the cell, so this value is not displayed.
scores, packages

# The first matched package, if any, is treated as the build breaker.
build_breaker = packages[0] if packages else None

In [23]:
import re

# Root of the dependency tree; both traversals below start from it.
root = get_root(g)
failed_branch = []

if build_breaker:
    # nodes on the path from the root to the build breaker,
    # excluding the build breaker itself
    failed_branch = nx.shortest_path(g, root, build_breaker)[:-1]

# Compare whole names case-insensitively: a prefix match would also stop
# at any package whose name merely starts the build breaker's name.
breaker_name = (build_breaker or "").lower()

successfully_installed = set()
for node in nx.dfs_preorder_nodes(g, root):
    if node.lower() == breaker_name:
        break
    successfully_installed.add(node)

# packages on the failed branch are not considered installed
successfully_installed = successfully_installed.difference(failed_branch)
successfully_installed

{'decorator', 'idna', 'networkx', 'urllib3'}

In [24]:
# Color every target: green when it was successfully installed, red otherwise.
dep_table['color'] = dep_table.target.apply(
    lambda p: 'green' if p in successfully_installed else 'red'
)

# fix for awscli-plugin-endpoint) (1.14.25 invalid parsing
# The pattern is a raw string, since `\)` is not a valid string escape, and
# `regex=True` is explicit because the default of `str.replace` differs
# between pandas versions.
dep_table.source = dep_table.source.str.replace(
    r'awscli-plugin-endpoint\) \(1.14.25', 'awscli-plugin-endpoint', regex=True)

plot_dependencies(dep_table)

## Report

In [25]:
log = log_failed

In [26]:
import json
from textwrap import dedent, indent

# number of spaces the build breaker info is indented by in the report
indentation_level = 4

# report template: `info` is the JSON dump of the build breaker row,
# `ln` / `reason` are the index and the text of the reported log line
report = """
Build breaker:

{info}     

Probable reason:

    {ln}: {reason}
"""

if build_breaker:
    # A boolean mask is used instead of a string-built query, which would
    # break on package names containing a quote.
    build_breaker_info = dep_table[dep_table.target == build_breaker]
    build_breaker_info_str = json.dumps(
        build_breaker_info.to_dict(orient='records')[0],
        indent=4,
        sort_keys=True
    )
    build_breaker_info_str = indent(
        build_breaker_info_str, ' ' * indentation_level)

    # The last anomalous line of the log is reported as the probable reason;
    # `items()` replaces `iteritems()`, which newer pandas no longer provides.
    line_no, reason = next(
        dep_log.query("state == 'ANOMALY'")['line'][::-1].items())

    print(
        report.format(info=build_breaker_info_str, ln=line_no, reason=reason)
    )

else:
    print("No build breaker identified.")



Build breaker:

    {
        "already_satisfied": null,
        "artifact": null,
        "color": "red",
        "source": "requests",
        "target": "certifi",
        "version_installed": null,
        "version_specified": ">=2017.4.17"
    }     

Probable reason:

    76: Cannot uninstall 'certifi'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.

