### Make sure to use Python3, since it has natural order in dictionaries

In [1]:
import sys

print(sys.executable)

/data/anaconda3/bin/python


### Control client

In [2]:
import requests

from requests_kerberos import HTTPKerberosAuth, DISABLED
from requests.status_codes import codes


class ControlClient:
    _RESULTS = "results"
    _COUNT = "count"
    _PAGE_SIZE = "page_size"

    def __init__(self, base_url):
        self.base_url = base_url
        self._auth = HTTPKerberosAuth(mutual_authentication=DISABLED)

    def get_object_by_name(self, url, name, to_field="name", **kwargs):
        if "params" not in kwargs:
            kwargs["params"] = {}
        params = {to_field: name, self._PAGE_SIZE: 1}
        kwargs["params"].update(**params)

        response = self.get(url, **kwargs)

        if response.status_code != codes.ALL_OK:
            raise ValueError("Invalid request: {}".format(response.text))

        try:
            parsed = response.json()
        except Exception as e:
            raise ValueError(
                "Got invalid json from response: {}. "
                "Original response test: {}".format(e, response.text)
            )

        if parsed[self._COUNT] == 0:
            raise ValueError("Object with name '{}' does not exist.".format(name))

        if parsed[self._COUNT] > 1:
            raise ValueError("Got multiple objects with name '{}'.".format(name))

        return parsed[self._RESULTS][0]

    def get(self, url, **kwargs):
        return self._request('get', url, **kwargs)

    def post(self, url, **kwargs):
        return self._request('post', url, **kwargs)

    def patch(self, url, **kwargs):
        return self._request('patch', url, **kwargs)

    def delete(self, url, **kwargs):
        return self._request('delete', url, **kwargs)

    def _request(self, method, url, **kwargs):
        # cast absolute to relative
        if url.startswith("/"):
            url = url[1:]

        full_url = requests.compat.urljoin(self.base_url, url)
        kwargs['auth'] = self._auth
        try:
            return requests.request(method, full_url, **kwargs)
        except Exception as e:
            raise ValueError(
                "Got error making HTTP {} request "
                "on url '{}', original error message: {}.".format(method.upper(), full_url, e)
            )


cc_prod = ControlClient("http://dmcontrol.host/api/")
cc_dev = ControlClient("http://dmcontrol-dev.host/api/")

### Basic tools

In [3]:
import os
import dill
import tqdm
import datetime
import subprocess


class ControlUtils(object):

    def __init__(self, prod=False):
        self._xapp_cache = {}
        if prod:
            self.client = cc_prod
        else:
            self.client = cc_dev

    @staticmethod
    def check_response(r, status):
        if r.status_code != status:
            raise Exception(
                "Bad request for url={} - status={}, reason='{}'".format(r.url, r.status_code, r.reason)
            )

    def get_user_projects(self):
        response = self.client.get(url="compose/project/", params={"page_size": 1000000})
        self.check_response(response, 200)
        projects = response.json()["results"]

        response = self.client.get(url="target/portrait/", params={"page_size": 1000000})
        self.check_response(response, 200)
        portraits = response.json()["results"]

        response = self.client.get(url="target/lookalike/", params={"page_size": 1000000})
        self.check_response(response, 200)
        lookalikes = response.json()["results"]

        response = self.client.get(url="target/sales_report/", params={"page_size": 1000000})
        self.check_response(response, 200)
        sales_reports = response.json()["results"]

        managed_project_names = set.union(
            set(_["project"] for _ in portraits),
            set(_["project"] for _ in lookalikes),
            set(_["project"] for _ in sales_reports)
        )

        return [p for p in projects if p["name"] not in managed_project_names]

    def get_compose_projects(self, max_items=1000000):
        response = self.client.get(url="compose/project/", params={"page_size": max_items})
        self.check_response(response, 200)
        projects = response.json()["results"]
        return [p for p in projects]

    def get_compose_project(self, proj_id):
        response = self.client.get(url="compose/project/{}/".format(proj_id), params={})
        self.check_response(response, 200)
        return response.json()

    def get_grinder_projects(self, max_items=1000000):
        response = self.client.get(url="grinder/project/", params={"page_size": max_items})
        self.check_response(response, 200)
        projects = response.json()["results"]
        return [p for p in projects]

    def get_export_projects(self, max_items=1000000):
        response = self.client.get(url="export/project/", params={"page_size": max_items})
        self.check_response(response, 200)
        projects = response.json()["results"]
        return [p for p in projects]

    def get_compose_jobs(self, max_items=100):
        response = self.client.get(url="compose/job/", params={"page_size": max_items})
        self.check_response(response, 200)
        return response.json()["results"]

    def get_compose_job(self, job_id):
        response = self.client.get(url="compose/job/{}/".format(job_id), params={})
        self.check_response(response, 200)
        return response.json()

# https://dmcontrol.host/api/compose/project/
# https://dmcontrol.host/api/grinder/project/

In [19]:
control = ControlUtils(prod=True)

In [20]:
import json
import six

In [6]:
jobs = [job for job in control.get_compose_jobs(max_items=1000) if job["status"] == "success"]
print(len(jobs))
# print(json.dumps(jobs, indent=2))

887


In [None]:
job = control.get_compose_job(2369243)
print(json.dumps(job, indent=2))

In [21]:
grinder_projects = control.get_grinder_projects()
print(len(grinder_projects))

45659


In [22]:
compose_projects = control.get_compose_projects()
print(len(compose_projects))

26274


In [23]:
export_projects = control.get_export_projects()
print(len(export_projects))

47523


In [None]:
my_prj = control.get_compose_project(48043) # [p for p in compose_projects if p["id"] == 45][0]
print(json.dumps(my_prj, indent=2))

In [24]:
# search_for = "grinder_desktop_dataset"
# search_for = "grinder_mobile_dataset"
# search_for = "grinder_mobile_dataset_expanded"
# search_for = "hid_dataset_0_1"
# search_for = "hid_dataset_2_0_dm8225"
# search_for = "hid_dataset_3_0"
# search_for = "hid_dataset_3_0__raw_partial"

tables = [
    "snb_ds_segmentation.grinder_test_dataset_v7_23_11",
    "snb_ds_segmentation.grinder_test_dataset_v6_08_10",
    "snb_ds_segmentation.grinder_test_dataset_v5_07_10",
    "snb_ds_segmentation.grinder_test_dataset_v4_04_10",
    "snb_ds_segmentation.grinder_test_dataset_v3_11_08",
    "dmgrinder_source.rng_dataset_v5_pb",
    "dmgrinder_source.rng_dataset_v5_ars_max_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_avg_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_sum_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_max_per_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_avg_per_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_sum_per_uniq_05",
    "dmgrinder_source.rng_dataset_v5_ars_sum_per_05",
    "dmgrinder_source.rng_dataset_v5_ars_consolid_v2",
    "dmgrinder_source.rng_dataset_v5_ars2_1",
    "dmgrinder_source.rng_dataset_v5_test1",
    "dmgrinder_source.rng_dataset_v5",
    "dmgrinder_source.rng_dataset_v4",
    "dmgrinder_source.rng_dataset_v3",
    "dmgrinder_source.hid_dataset_desc__raw_partial",
    "dmgrinder_source.hid_dataset_desc_0",
    "dmgrinder_source.rng_dataset_v2",
    "dmgrinder_source.rng_dataset_v1",
]

def is_in(substring, obj):
    text = str(json.dumps(obj)).lower()
    return (
        str('"{}"'.format(substring)).lower() in text 
        or str('.{}"'.format(substring)).lower() in text
    )

In [None]:
unused = []
used = []
for fqtn in tables:
    db, table = fqtn.split(".")
    in_use = False

    for p in grinder_projects:
        if is_in(fqtn, p) or is_in(table, p):
            in_use = True
            print("{} in grinder project id: {}, name: {}, enabled: {}".format(fqtn, p["id"], p["name"], p["enabled"]))

    for p in compose_projects:
        if is_in(fqtn, p) or is_in(table, p):
            in_use = True
            print("{} in compose project id: {}, name: {}, enabled: {}".format(fqtn, p["id"], p["name"], p["enabled"]))

    for p in export_projects:
        if is_in(fqtn, p) or is_in(table, p):
            in_use = True
            print("{} in export project id: {}, name: {}, enabled: {}".format(fqtn, p["id"], p["name"], p["enabled"]))

    if not in_use:
        unused.append(fqtn)
    else:
        used.append(fqtn)

print("\nunused tables:\n{}".format(",\n".join(unused)))
print("\nin use tables:\n{}".format(",\n".join(used)))

In [60]:
found_grinder_projects = [
    p for p in grinder_projects
    if is_it(p)
]

print("found grinder projects, total {}, enabled {}".format(
    len(found_grinder_projects), len([p for p in found_grinder_projects if p["enabled"]])
))

total 0, enabled 0


In [62]:
found_compose_projects = [
    p for p in compose_projects
    if is_it(p)
]

print("found compose projects, total {}, enabled {}".format(
    len(found_compose_projects), len([p for p in found_compose_projects if p["enabled"]])
))

total 3, enabled 2


In [None]:
found_export_projects = [
    p for p in export_projects
    if is_it(p)
]

print("found export projects, total {}, enabled {}".format(
    len(found_export_projects), len([p for p in found_export_projects if p["enabled"]])
))

In [63]:
def get_xapp(p):
    jobs = p["config"]["jobs"]
    return [
        "{}/{}".format(job_n, job_conf["xapp"]) 
        for job_n, job_conf in six.iteritems(jobs)
    ]

for p in found_compose_projects:
    if p["enabled"]:
        print(
            "id: {}, name: {}, enabled: {}, xapp: {}"
            "".format(p["id"], p["name"], p["enabled"], get_xapp(p))
        )

id: 29923, name: CleanerHive_dmgrinder_source.hid_dataset_3_0__raw_partial, enabled: True, xapp: ['cleaner:CleanerHive/CleanerHive']
id: 15791, name: ETLFeatures-hid-dataset-3-0, enabled: True, xapp: ['JoinRawIntoHid/ETLFeatures', 'JoinFinalDataset/ETLFeatures']


In [64]:
for p in found_grinder_projects:
    print(
        "id: {}, name: {}, enabled: {}"
        "".format(p["id"], p["name"], p["enabled"])
    )

In [None]:
for p in found_export_projects:
    print(
        "id: {}, name: {}, enabled: {}"
        "".format(p["id"], p["name"], p["enabled"])
    )

In [None]:
# print(json.dumps(export_projects[42], indent=2))

search_list = [
    "device-socdem-age",
    "device-socdem-sex",
    "device-model-age",
    "device-model-gender",
    "DM-9038__agg_7__bin__cat_addCart",
    "hid-socdem-age", "hid-sample-age",
    "hid-socdem-sex", "hid-sample-sex",
]

def is_export_for(obj, substrings):
    text = str(json.dumps(obj)).lower()
    finds = [
        str('"{}"'.format(substring)).lower() in text 
        for substring in substrings
    ]
    return any(finds)

found_export_projects = [
    p for p in export_projects
    if is_export_for(p, search_list)
]

print("found export projects, total {}, enabled {}".format(
    len(found_export_projects), len([p for p in found_export_projects if p["enabled"]])
))

for p in found_export_projects:
    print(
        "id: {}, name: {}, enabled: {}"
        "".format(p["id"], p["name"], p["enabled"])
    )