# Solver error classification

This notebook demonstrates classification of solver errors in project Thoth, using tf-idf and k-means clustering.

To run this notebook in Jupyter hosted on datahub cluster, just enter AWS key id and AWS secret access key to access Thoth data on Ceph.

In [1]:
import os
import json
from pprint import pprint
import random

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from thoth.storages import SolverResultsStore

# Reproducible runs until there are no new documents.
ANSWER_TO_THE_ULTIMATE_QUESTION_OF_LIFE_THE_UNIVERSE_AND_EVERYTHING = 42
random.seed(ANSWER_TO_THE_ULTIMATE_QUESTION_OF_LIFE_THE_UNIVERSE_AND_EVERYTHING)
np.random.seed(ANSWER_TO_THE_ULTIMATE_QUESTION_OF_LIFE_THE_UNIVERSE_AND_EVERYTHING)


SOLVER_DOCUMENTS_CONSIDERED_COUNT = 2000
NUMBER_OF_CLUSTERS = 20
KMEANS_ITERATIONS = 200

In [2]:
solvers = SolverResultsStore(
    deployment_name="thoth-psi-stage",
    host="https://s3.upshift.redhat.com",
    bucket="thoth",
    prefix="data/thoth",
    key_id=os.environ["AWS_ACCESS_KEY_ID"],
    secret_key=os.environ["AWS_SECRET_ACCESS_KEY"]
)
solvers.connect()

In [3]:
all_solver_documents = list(solvers.get_document_listing())
random.shuffle(all_solver_documents)

In [4]:
solver_documents_considered = []
for solver_document_id in all_solver_documents:
    document = solvers.retrieve_document(solver_document_id)
    if len(document["result"]["errors"]) == 1 and document["result"]["errors"][0].get("is_provided_package_version") is True:
        solver_documents_considered.append(document)
        print(
            f"\rSolver documents considered for training {len(solver_documents_considered)}/{SOLVER_DOCUMENTS_CONSIDERED_COUNT}",
            end=""
        )

        if len(solver_documents_considered) == SOLVER_DOCUMENTS_CONSIDERED_COUNT:
            break

len(f"Solver documents considered for training {solver_documents_considered}")

Solver documents considered for creating clusters 2000/2000

2000

In [5]:
print("A sample of an error in training dataset:")
print(random.choice(solver_documents_considered)["result"]["errors"][0]["details"]["message"])

A sample of an error in training dataset:
Command exited with non-zero status code (1):     ERROR: Command errored out with exit status 1:
     command: /home/solver/venv/bin/python3 -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-nub51dqk/pystaticconfiguration/setup.py'"'"'; __file__='"'"'/tmp/pip-install-nub51dqk/pystaticconfiguration/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base /tmp/pip-install-nub51dqk/pystaticconfiguration/pip-egg-info
         cwd: /tmp/pip-install-nub51dqk/pystaticconfiguration/
    Complete output (10 lines):
    Traceback (most recent call last):
      File "<string>", line 1, in <module>
      File "/tmp/pip-install-nub51dqk/pystaticconfiguration/setup.py", line 7, in <module>
        import staticconf
      File "/tmp/pip-install-nub51dqk/pystaticconfiguration/staticconf/__init__.py", 

In [6]:
texts = [err["result"]["errors"][0]["details"]["message"] for err in solver_documents_considered]

In [7]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(texts)

In [8]:
model = KMeans(
    n_clusters=NUMBER_OF_CLUSTERS, 
    max_iter=KMEANS_ITERATIONS,
)

model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=200,
    n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [9]:
centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(NUMBER_OF_CLUSTERS):
    print(f"Centroids for cluster {i}.:"),
    for idx in centroids[i, :NUMBER_OF_CLUSTERS]:
        print(f"\t{terms[idx]}")

Centroids for cluster 0.:
	 pandas
	 tests
	 accesscontrol
	 x86_64
	 linux
	 io
	 build
	 lib
	 copying
	 numpy
	 data
	 src
	 macro
	 note
	 expansion
	 core
	 py
	 eggs
	 obj
	 py3
Centroids for cluster 1.:
	 django
	 pip
	 autocomplete
	 light
	 tmp
	 install
	 py
	 setup
	 command
	 mezzanine
	 __file__
	 sortedm2m
	 probably
	 fiber
	 bootstrap3
	 print
	 line
	 file
	 egg_info
	 egg
Centroids for cluster 2.:
	 build
	 x86_64
	 linux
	 lib
	 creating
	 copying
	 temp
	 gcc
	 src
	 py
	 running
	 install
	 protection
	 wp
	 fstack
	 pip
	 tmp
	 record
	 fpic
	 dependency_injector
Centroids for cluster 3.:
	 pip
	 tmp
	 install
	 py
	 setup
	 command
	 file
	 __file__
	 line
	 egg
	 egg_info
	 module
	 status
	 code
	 open
	 exit
	 tokenize
	 sys
	 exec
	 output
Centroids for cluster 4.:
	 19
	 18
	 16
	 17
	 13
	 11
	 10
	 14
	 12
	 15
	 20
	 21
	 23
	 24
	 22
	 satisfies
	 25
	 requirement
	 versions
	 matching
Centroids for cluster 5.:
	 10
	 12
	 11
	 13
	 14
	 satisfies
	 requ

In [12]:
# Find a sample and predict a cluster.

while True:
    sample_solver_document_id = random.choice(all_solver_documents)
    if (
        len(document["result"]["errors"]) != 1 \
        or document["result"]["errors"][0].get("is_provided_package_version") is not True
    ):
        continue

    sample_document = solvers.retrieve_document(sample_solver_document_id)
    sample = sample_document["result"]["errors"][0]["details"]["message"]
    print(f"Using sample {sample_solver_document_id!r} to predict cluster")
    break

Using sample 'solver-fedora-31-py38-b9c83b87' to predict cluster


In [13]:
print(sample)

Command exited with non-zero status code (1):   ERROR: Command errored out with exit status 1:
   command: /home/solver/venv/bin/python3 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-dcssl60v/yappi/setup.py'"'"'; __file__='"'"'/tmp/pip-install-dcssl60v/yappi/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-9wmg8qsx --python-tag cp38
       cwd: /tmp/pip-install-dcssl60v/yappi/
  Complete output (14 lines):
  unable to execute 'cc': No such file or directory
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build/lib.linux-x86_64-3.8
  copying yappi/yappi.py -> build/lib.linux-x86_64-3.8
  running build_ext
  building '_yappi' extension
  creating build/temp.linux-x86_64-3.8
  creating build/temp.linux-x86_64-3.8/yappi
  gcc -pthread -Wno-unused-result -Wsign-compare -

In [14]:
X = vectorizer.transform([sample])

cluster = model.predict(X)[0]

print(f"Sample {sample_solver_document_id!r} belongs to cluster number {cluster}")

Sample 'solver-fedora-31-py38-b9c83b87' belongs to cluster number 18
