In [18]:
import os, ast
from elasticsearch import Elasticsearch
import pprint
import datetime
import numpy as np

In [36]:
# Connection settings for the elasticsearch node queried by this notebook
ES_HOST = "192.168.188.38"
ES_PORT = 9200

# Client shared by every query function below
es = Elasticsearch(hosts=[{"host": ES_HOST, "port": ES_PORT}])

# Default index searched when a function is not given one explicitly
es_index = "wixed-production_8707cb7b-738e-4507-9899-8bac0f49c8fc"

In [85]:
def queryData(index, timestamp=None, ival="day", timestampasstring=False):
    '''
    Query an elasticsearch index for request counts bucketed by time interval


    Variables
    ---------
    index             : Name of the elasticsearch index to search
    timestamp         : Lower (exclusive) bound for "@timestamp". Either an
                        object with a strftime method (e.g. datetime), which
                        is formatted as "%Y-%m-%dT%H:%M:%S.%fZ", or an
                        already formatted string, which is used unchanged.
                        If None, the "@timestamp" of the first document in
                        ascending "@timestamp" order is used instead.
    ival              : Interval of the date histogram (e.g. "day", "hour")
    timestampasstring : Currently unused here; kept so existing callers
                        that pass it keep working

    Returns
    ---------
    res               : Raw elasticsearch response. Its "aggregations" hold
                        "requests_per_ival" (all requests per interval),
                        "by_request_type" (per "verb.keyword" term, each with
                        a "requests_per_ival" histogram) and
                        "error_request_log" (documents matching
                        error_log == "true", with a "requests_per_ival"
                        histogram).
    '''

    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # unmapped_type lets the sort proceed on indices where "@timestamp"
    # has no mapping instead of failing the whole search
    timestamp_sort = {"@timestamp": {"unmapped_type": "long"}}

    if timestamp is None:
        # No start given: look up the first document by "@timestamp".
        # No sort order is specified, so elasticsearch's default for field
        # sorts (ascending) applies and this is the earliest document.
        res_timestamp = es.search(
            index=index,
            body={
                "query": {"bool": {"must": {"match_all": {}}}},
                "sort": timestamp_sort,
                "size": 1,
            }
        )

        first_hits = res_timestamp["hits"]["hits"]
        if first_hits:
            timestamp = first_hits[0]["_source"]["@timestamp"]
    elif hasattr(timestamp, "strftime"):
        # datetime-like input: render it in the format sent to elasticsearch
        timestamp = timestamp.strftime(DATETIME_FORMAT)
    # otherwise the caller passed a pre-formatted string; use it as is

    bool_query = {"bool": {"must": {"match_all": {}}}}
    # Always extend the histogram up to "now" so trailing empty intervals
    # still produce buckets
    extended_bounds = {
        "max": datetime.datetime.utcnow().strftime(DATETIME_FORMAT)
    }

    # The lower bound is only known when a timestamp was given or found.
    # An empty index leaves it None; in that case do not send a null
    # range / bound to elasticsearch.
    if timestamp is not None:
        bool_query["bool"]["filter"] = {
            "range": {"@timestamp": {"gt": timestamp}}
        }
        extended_bounds["min"] = timestamp

    # Histogram definition shared by the top-level and the sub-aggregations
    time_ival = {
        "date_histogram": {
            "field": "@timestamp",
            "interval": ival,
            "min_doc_count": 0,  # to get buckets even when empty
            "extended_bounds": extended_bounds,
        }
    }

    res = es.search(
        index=index,
        body={
            "query": bool_query,
            "sort": timestamp_sort,
            "size": 0,  # only the aggregations are needed, not the documents
            "aggs": {
                # Group by request type, then by time interval
                "by_request_type": {
                    "terms": {
                        "field": "verb.keyword",
                    },
                    "aggs": {
                        "requests_per_ival": time_ival
                    }
                },
                # Number of requests per histogram interval
                "requests_per_ival": time_ival,
                # Requests flagged as errors, per histogram interval
                "error_request_log": {
                    "filter": {
                        "match": {
                            "error_log": "true"
                        }
                    },
                    "aggs": {
                        "requests_per_ival": time_ival
                    }
                },
            },
        }
    )

    return res

#res_sub = {"total": res['hits']['total'], "aggregations": res["aggregations"]}
#print(res_sub)

#pp = pprint.PrettyPrinter(indent=2)
#pp.pprint(res)

#print(("%d documents found" % res['hits']['total'])#"ignore_unmapped" : true)
#for doc in res['hits']['hits']:
#    print("%s) %s" % (doc['_id'], doc['_source']))

    



In [None]:
def getData( index=es_index, timestamp=None, ival="day", timestampasstring=False):
    '''
    Query the index and return the result reshaped by parseData3

    Variables
    ---------
    index             : Index to query, defaults to es_index
    timestamp         : Forwarded to queryData
    ival              : Forwarded to queryData
    timestampasstring : Forwarded to both queryData and parseData3

    Returns
    ---------
    Whatever parseData3 produces for the query response. parseData and
    parseData2 are alternative parsers defined in this notebook; they are
    not used here.
    '''
    raw_response = queryData(index, timestamp, ival, timestampasstring)
    parsed = parseData3(raw_response, timestampasstring)
    return parsed


In [None]:
def parseData (res):
    '''
    Turn an aggregation response into a dict of parallel lists

    Variables
    ---------
    res : Response containing the "requests_per_ival", "by_request_type"
          and "error_request_log" aggregations

    Returns
    ---------
    request_counts : dict with
                     "requests"  - doc_count of every overall bucket
                     "error_log" - doc_count of every error-log bucket
                     "timestamp" - key_as_string of every overall bucket
                     plus one list of doc_counts per request type, keyed by
                     the lower-cased request type
    '''
    aggs = res["aggregations"]
    overall_buckets = aggs["requests_per_ival"]["buckets"]
    verb_buckets = aggs["by_request_type"]["buckets"]
    error_buckets = aggs["error_request_log"]["requests_per_ival"]["buckets"]

    request_counts = {
        "requests": [entry["doc_count"] for entry in overall_buckets],
        "error_log": [entry["doc_count"] for entry in error_buckets],
        "timestamp": [entry["key_as_string"] for entry in overall_buckets],
    }

    # One list of counts per request type
    for verb in verb_buckets:
        per_interval = verb["requests_per_ival"]["buckets"]
        request_counts[verb["key"].lower()] = [
            entry["doc_count"] for entry in per_interval
        ]

    return request_counts

In [None]:
def parseData2 (res):
    '''
    Turn an aggregation response into one dict per time bucket

    Variables
    ---------
    res : Response containing the "requests_per_ival" and "by_request_type"
          aggregations

    Returns
    ---------
    request_counts : list with one dict per overall bucket. Each dict maps
                     "requests" and every lower-cased request type to a
                     [key_as_string, doc_count] pair. Request-type buckets
                     are matched to overall buckets by position.
    '''
    # These were previously used without ever being read from res
    total_requests = res["aggregations"]["requests_per_ival"]["buckets"]
    request_types = res["aggregations"]["by_request_type"]["buckets"]

    request_counts = [
        {"requests": [request["key_as_string"], request["doc_count"]]}
        for request in total_requests
    ]

    # Single-argument print(...) gives the same output on Python 2 and 3
    print("no. of all requests buckets: {}".format(len(total_requests)))

    for request in request_types:
        request_type = request["key"].lower()
        type_buckets = request["requests_per_ival"]["buckets"]
        print("no. of {} requests buckets: {}".format(request_type, len(type_buckets)))

        for ix, bucket in enumerate(type_buckets):
            request_counts[ix][request_type] = [bucket["key_as_string"], bucket["doc_count"]]

    return request_counts

In [None]:
def parseData3 (res, timestampAsString=False):
    '''
    Turn an aggregation response into one flat dict per time bucket

    Variables
    ---------
    res               : Response containing "hits" and the
                        "requests_per_ival", "by_request_type" and
                        "error_request_log" aggregations
    timestampAsString : If True keep each bucket's key_as_string, otherwise
                        parse it with "%Y-%m-%dT%H:%M:%S.%fZ" into a
                        datetime.datetime

    Returns
    ---------
    request_counts    : {} when the response reports zero hits, otherwise a
                        list with one dict per overall bucket holding
                        "timestamp", "requests", "error_log" and one count
                        per lower-cased request type. Error-log and
                        request-type buckets are matched to overall buckets
                        by position.
    '''
    # Defined here because this name is not available at module level;
    # it must match the format of the buckets' key_as_string values
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # hits.total is a plain number in some responses and an object with a
    # "value" entry in others; accept both
    total_hits = res["hits"]["total"]
    if isinstance(total_hits, dict):
        total_hits = total_hits.get("value", 0)
    if total_hits == 0:
        return {}

    total_requests = res["aggregations"]["requests_per_ival"]["buckets"]
    request_types = res["aggregations"]["by_request_type"]["buckets"]
    error_logs = res["aggregations"]["error_request_log"]["requests_per_ival"]["buckets"]

    request_counts = []

    for request in total_requests:
        bucket_time = request["key_as_string"]
        if not timestampAsString:
            bucket_time = datetime.datetime.strptime(bucket_time, DATETIME_FORMAT)
        request_counts.append({
            "timestamp": bucket_time,
            "requests": request["doc_count"]})

    for ix, bucket in enumerate(error_logs):
        request_counts[ix]["error_log"] = bucket["doc_count"]

    # Single-argument print(...) gives the same output on Python 2 and 3
    print("no. of all requests buckets: {}".format(len(total_requests)))

    for request in request_types:
        request_type = request["key"].lower()
        type_buckets = request["requests_per_ival"]["buckets"]
        print("no. of {} requests buckets: {}".format(request_type, len(type_buckets)))

        for ix, bucket in enumerate(type_buckets):
            request_counts[ix][request_type] = bucket["doc_count"]

    return request_counts

In [None]:
def getPercentiles (index=es_index, timestamp=None, ival="day", timestampasstring=False, percentile=99):
    """
    Compute a percentile of the per-interval counts for every series
    returned by parseData.

    Though elasticsearch offers to get percentiles, this doesn't work on
    doc_count of aggregated data, see https://github.com/elastic/elasticsearch/issues/7703

    Variables
    ---------
    index             : Index to query, defaults to es_index
    timestamp         : Forwarded to queryData
    ival              : Forwarded to queryData
    timestampasstring : Forwarded to queryData
    percentile        : Percentile handed to np.percentile (default 99,
                        the value that was previously hard-coded)

    Returns
    ---------
    request_perc      : dict mapping each non-empty series of parseData
                        (every key except "timestamp") to its percentile
    """

    res = parseData(queryData(index, timestamp, ival, timestampasstring))

    # Skip the timestamp labels and any series without data points
    request_perc = {
        key: np.percentile(np.array(counts), percentile)
        for key, counts in res.items()
        if key != "timestamp" and counts
    }

    return request_perc