# Analyze links between invectio and solver outputs

## Load modules

In [1]:
from thoth.report_processing.components.solver import Solver
import pandas as pd

In [2]:
from pathlib import Path
# Current working directory of the running process; the dataset directory
# is resolved relative to it when the solver reports are loaded.
current_path = Path.cwd()

## Access data

In [4]:
# Directory (relative to the working directory) that holds the solver reports
# for the non-matching packages; read from local disk (is_local=True).
nonmatching_reports_dir = current_path / 'thoth-solver-nonmatching-packages-data'
nonmatching_reports = Solver.aggregate_solver_results(
    repo_path=nonmatching_reports_dir,
    is_local=True,
)

INFO:thoth.report_processing.components.solver:Retrieving dataset at path... /root/thoth-solver-nonmatching-packages-data
INFO:thoth.report_processing.components.solver:Considering... /root/thoth-solver-nonmatching-packages-data/solver-rhel-8-py38-210713042252-a6a5cffc83e20e68
INFO:thoth.report_processing.components.solver:Considering... /root/thoth-solver-nonmatching-packages-data/solver-rhel-8-py38-210720145125-3d41d6cea9a4a58f
INFO:thoth.report_processing.components.solver:Considering... /root/thoth-solver-nonmatching-packages-data/solver-rhel-8-py38-210620185738-2f6d41d12536569f
INFO:thoth.report_processing.components.solver:Number of files retrieved is: 3


## Access all reports

In [5]:
# One flattened metadata record per aggregated solver document.
solver_reports_metadata = [
    Solver.extract_data_from_solver_metadata(nonmatching_reports[document_id]["metadata"])
    for document_id in nonmatching_reports
]

solver_reports_metadata_df = pd.DataFrame(solver_reports_metadata)

solver_reports_metadata_df.head()

Unnamed: 0,document_id,datetime,requirements,solver,os_name,os_version,python_interpreter,analyzer_version
0,solver-rhel-8-py38-210713042252-a6a5cffc83e20e68,2021-07-13T06:10:26.313602,scikit-learn===0.24.2,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1
1,solver-rhel-8-py38-210720145125-3d41d6cea9a4a58f,2021-07-20T15:42:18.379547,ipython===7.25.0,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1
2,solver-rhel-8-py38-210620185738-2f6d41d12536569f,2021-06-20T22:19:44.133615,python-dotenv===0.18.0,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1


In [6]:
solver_reports_extracted_data = []
solver_errors = []
for document_id in nonmatching_reports:
    report = nonmatching_reports[document_id]
    result = report["result"]

    # Start from the flattened metadata, then lay every top-level field of
    # the solver result over it (result fields win on a key clash).
    record = Solver.extract_data_from_solver_metadata(report["metadata"])
    record.update(result)

    # Errors are collected across all reports, but only when the report
    # actually carries a non-empty "errors" entry.
    if result.get("errors"):
        solver_errors.extend(Solver.extract_errors_from_solver_result(result["errors"]))

    record["packages"] = Solver.extract_tree_from_solver_result(result)
    solver_reports_extracted_data.append(record)

In [7]:
# NOTE: this rebinds `solver_reports_metadata_df`. From here on it holds the
# full extracted records (metadata + result fields + "packages"), not just
# the metadata-only frame built in the previous section.
solver_reports_metadata_df = pd.DataFrame(solver_reports_extracted_data)
solver_reports_metadata_df.head(10)

Unnamed: 0,document_id,datetime,requirements,solver,os_name,os_version,python_interpreter,analyzer_version,environment,environment_packages,errors,platform,tree,unparsed,unresolved,packages
0,solver-rhel-8-py38-210713042252-a6a5cffc83e20e68,2021-07-13T06:10:26.313602,scikit-learn===0.24.2,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1,"{'implementation_name': 'cpython', 'implementa...","[{'package_name': 'pipdeptree', 'package_versi...",[],linux-x86_64,"[{'dependencies': [{'extra': [], 'extras': [],...",[],[],"[{'package_name': 'scikit-learn', 'package_ver..."
1,solver-rhel-8-py38-210720145125-3d41d6cea9a4a58f,2021-07-20T15:42:18.379547,ipython===7.25.0,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1,"{'implementation_name': 'cpython', 'implementa...","[{'package_name': 'pipdeptree', 'package_versi...",[],linux-x86_64,"[{'dependencies': [{'extra': [], 'extras': [],...",[],[],"[{'package_name': 'ipython', 'package_version'..."
2,solver-rhel-8-py38-210620185738-2f6d41d12536569f,2021-06-20T22:19:44.133615,python-dotenv===0.18.0,red hat enterprise linux-83-py38,red hat enterprise linux,83,3.8,1.10.1,"{'implementation_name': 'cpython', 'implementa...","[{'package_name': 'pipdeptree', 'package_versi...",[],linux-x86_64,"[{'dependencies': [{'extra': [], 'extras': [],...",[],[],"[{'package_name': 'python-dotenv', 'package_ve..."


Compared to the previous solver dataset (v1.0), this one has two extra outputs:
- `platform` field 
- `packages` used in **tree** column


In [18]:
# Package (module) names recorded for the first tree entry of report 0 (scikit-learn).
solver_reports_metadata_df.loc[0, 'tree'][0]['packages']

['sklearn',
 'sklearn.__check_build',
 'sklearn._build_utils',
 'sklearn._loss',
 'sklearn.cluster',
 'sklearn.compose',
 'sklearn.covariance',
 'sklearn.cross_decomposition',
 'sklearn.datasets',
 'sklearn.decomposition',
 'sklearn.ensemble',
 'sklearn.experimental',
 'sklearn.externals',
 'sklearn.feature_extraction',
 'sklearn.feature_selection',
 'sklearn.gaussian_process',
 'sklearn.impute',
 'sklearn.inspection',
 'sklearn.linear_model',
 'sklearn.manifold',
 'sklearn.metrics',
 'sklearn.mixture',
 'sklearn.model_selection',
 'sklearn.neighbors',
 'sklearn.neural_network',
 'sklearn.preprocessing',
 'sklearn.semi_supervised',
 'sklearn.svm',
 'sklearn.tests',
 'sklearn.tree',
 'sklearn.utils',
 'sklearn._loss.tests',
 'sklearn.cluster.tests',
 'sklearn.compose.tests',
 'sklearn.covariance.tests',
 'sklearn.cross_decomposition.tests',
 'sklearn.datasets.tests',
 'sklearn.decomposition.tests',
 'sklearn.ensemble._hist_gradient_boosting',
 'sklearn.ensemble.tests',
 'sklearn.experim

In [19]:
# Package (module) names recorded for the first tree entry of report 1 (ipython).
solver_reports_metadata_df.loc[1, 'tree'][0]['packages']

['IPython',
 'IPython.core',
 'IPython.extensions',
 'IPython.external',
 'IPython.kernel',
 'IPython.lib',
 'IPython.sphinxext',
 'IPython.terminal',
 'IPython.testing',
 'IPython.utils',
 'IPython.core.magics',
 'IPython.core.tests',
 'IPython.extensions.tests',
 'IPython.external.decorators',
 'IPython.lib.tests',
 'IPython.terminal.pt_inputhooks',
 'IPython.terminal.tests',
 'IPython.testing.plugin',
 'IPython.testing.tests',
 'IPython.utils.tests']

In [20]:
# Package (module) names recorded for the first tree entry of report 2 (python-dotenv).
solver_reports_metadata_df.loc[2, 'tree'][0]['packages']

['dotenv']

The point of interest is the discrepancy between the module names listed above and the project names on pypi.org. For example:
- https://pypi.org/project/scikit-learn/
- https://pypi.org/project/ipython/ 
- https://pypi.org/project/python-dotenv/ 

## Invectio output

In order to link the solver output to the invectio output, let's analyze the result of the `invectio whatuses .` command on the following test Python file:

In [None]:
# Sample file passed to `invectio whatuses .`; it appears as
# "tests/data/app_10_test.py" in the result shown below.
# NOTE(review): presumably this file is only analysed, not executed — the
# attribute paths used here should be verified against the real libraries
# before anyone tries to run it.
from sklearn import linear_model as lm
import IPython
import dotenv
import plone.app.upgrade

plone_app_test = plone.app.upgrade.alias_module()
config = dotenv.dotenv_value(".env")
jobs = IPython.backgroundjobs.BackgroundJobManager()
lm.LinearRegression

result:

In [None]:
    "tests/data/app_10_test.py": {
      "IPython": [
        "IPython.backgroundjobs.BackgroundJobManager"
      ],
      "dotenv": [
        "dotenv.dotenv_value"
      ],
      "plone": [
        "plone.app.upgrade.alias_module"
      ],
      "sklearn": [
        "sklearn.linear_model.LinearRegression"
      ]
    },

As we can see, the module names reported by invectio do not match the project names on PyPI:
- ipython
- python-dotenv
- scikit-learn

## Possible link

To see how the invectio and solver datasets can be connected, let's start with the package metadata stored in the solver report:

In [22]:
# Distribution metadata stored with the first tree entry of report 0.
solver_reports_metadata_df.loc[0, 'tree'][0]['importlib_metadata']['metadata']

{'Classifier': ['Intended Audience :: Science/Research',
  'Intended Audience :: Developers',
  'License :: OSI Approved',
  'Programming Language :: C',
  'Programming Language :: Python',
  'Topic :: Software Development',
  'Topic :: Scientific/Engineering',
  'Development Status :: 5 - Production/Stable',
  'Operating System :: Microsoft :: Windows',
  'Operating System :: POSIX',
  'Operating System :: Unix',
  'Operating System :: MacOS',
  'Programming Language :: Python :: 3',
  'Programming Language :: Python :: 3.6',
  'Programming Language :: Python :: 3.7',
  'Programming Language :: Python :: 3.8',
  'Programming Language :: Python :: 3.9',
  'Programming Language :: Python :: Implementation :: CPython',
  'Programming Language :: Python :: Implementation :: PyPy'],
 'Download-URL': 'https://pypi.org/project/scikit-learn/#files',
 'Home-page': 'http://scikit-learn.org',
 'License': 'new BSD',
 'Maintainer': 'Andreas Mueller',
 'Maintainer-email': 'amueller@ais.uni-bonn.de'

Here we have both the URL of the package in `Download-URL` and its name in `Name`:

In [23]:
# Distribution name, taken from the metadata of report 0's first tree entry.
solver_reports_metadata_df.loc[0, 'tree'][0]['importlib_metadata']['metadata']['Name']

'scikit-learn'

Further, we can use the function `get_source_repos()` from [app.py](https://github.com/thoth-station/solver-project-url-job/blob/master/app.py) in the [solver-project-url-job repository](https://github.com/thoth-station/solver-project-url-job) to match the output of `invectio whatuses .` to a package name in `solver_reports_metadata_df.loc[0]['tree'][0]['packages']`,
and in turn match that to `solver_reports_metadata_df.loc[0]['tree'][0]['importlib_metadata']['metadata']['Name']`.
See the example below.

    "tests/data/app_10_test.py": {
      "IPython": [
        "IPython.backgroundjobs.BackgroundJobManager"
      ],
      "dotenv": [
        "dotenv.dotenv_value"
      ],
      "plone": [
        "plone.app.upgrade.alias_module"
      ],
      "sklearn": [
        "sklearn.linear_model.LinearRegression"
      ]
    },

In [25]:
# Module names of report 0 — the invectio top-level key "sklearn" appears in this list.
solver_reports_metadata_df.loc[0, 'tree'][0]['packages']

['sklearn',
 'sklearn.__check_build',
 'sklearn._build_utils',
 'sklearn._loss',
 'sklearn.cluster',
 'sklearn.compose',
 'sklearn.covariance',
 'sklearn.cross_decomposition',
 'sklearn.datasets',
 'sklearn.decomposition',
 'sklearn.ensemble',
 'sklearn.experimental',
 'sklearn.externals',
 'sklearn.feature_extraction',
 'sklearn.feature_selection',
 'sklearn.gaussian_process',
 'sklearn.impute',
 'sklearn.inspection',
 'sklearn.linear_model',
 'sklearn.manifold',
 'sklearn.metrics',
 'sklearn.mixture',
 'sklearn.model_selection',
 'sklearn.neighbors',
 'sklearn.neural_network',
 'sklearn.preprocessing',
 'sklearn.semi_supervised',
 'sklearn.svm',
 'sklearn.tests',
 'sklearn.tree',
 'sklearn.utils',
 'sklearn._loss.tests',
 'sklearn.cluster.tests',
 'sklearn.compose.tests',
 'sklearn.covariance.tests',
 'sklearn.cross_decomposition.tests',
 'sklearn.datasets.tests',
 'sklearn.decomposition.tests',
 'sklearn.ensemble._hist_gradient_boosting',
 'sklearn.ensemble.tests',
 'sklearn.experim

In [26]:
# Distribution name stored in the same tree entry as the module list above.
solver_reports_metadata_df.loc[0, 'tree'][0]['importlib_metadata']['metadata']['Name']

'scikit-learn'