In [None]:
# Notebook-wide imports and display configuration.
# Silence all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
# Project-local helper class; instantiated below as `mutil`.
from utility import Utility as util
import matplotlib.pyplot as plt
import pandas as pd
# Remove the column limit when DataFrames are displayed.
pd.options.display.max_columns = None
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.max_rows', None)
# NOTE(review): `warnings` is already imported at the top of this cell; this repeat is redundant.
import warnings
import time
import os
#import pandasql as psql
from IPython.core.display import Markdown as md
# NOTE(review): repeats the warning filter already installed at the top of this cell.
warnings.filterwarnings('ignore')
#from libraries.project_reports import project_reports
#from libraries.utility import Utility as util
# Shared Utility instance; later cells use it to resolve the notebook directory
# and to read files from disk.
mutil = util()
#print(img_text_data)

# Import the display helpers from the public `IPython.display` module.
# Importing `display` from `IPython.core.display` has been deprecated since
# IPython 7.14 and can fail outright on newer IPython releases.
from IPython.display import display, HTML, Markdown, Latex

# Inject notebook-wide CSS: remove top padding, use the full page width,
# collapse the input/output prompt gutter, and define a two-column grid
# (`.parent`) used when laying out HTML further down.
display(HTML(
    '<style>'
        '#notebook { padding-top:0px !important; } ' 
        '.container { width:100% !important; } '
        '.end_space { min-height:0px !important; } '
        '.end_space { min-height:0px !important; } '
        '.prompt {width: 0px; min-width: 0px; visibility: collapse } '
        '.parent{'
        '    display: grid;'
        '    grid-template-columns: 1fr 1fr;'
        '    column-gap: 5px;'
        '}    '
    '</style>'
))

# Render the company logo as a clickable banner at the top of the notebook.
# The logo is stored as text and inserted into a base64 PNG data URI, so the
# file is presumably base64-encoded image data -- confirm against the artifact.
img_path = os.path.join(mutil.get_this_dir(), "artifacts", "images", "irwin_analytics_small.txt")
img_text_data = mutil.get_data_from_file(img_path)

# Single-quoted Python strings keep the HTML attribute quotes unescaped; the
# generated markup is unchanged.
img_tag = '<img align="left" src="data:image/png;base64,{}" />'.format(img_text_data)
a_tag = '<a href="https://www.irwinanalytics.com" target="https://www.irwinanalytics.com">{}</a>'.format(img_tag)

# Horizontal rule above the banner. The literal has no placeholders, so it is
# used directly instead of calling .format() with arguments that were ignored.
my_html = "<hr/>"
display(HTML(my_html))

# Wrap the linked logo in the `.parent` grid container.
my_html = "<div class='parent'><div>{}</div></div>".format(a_tag)
display(HTML(my_html))

## System Overview -- The Environment

Our streaming and transformation solution is virtualized in this case.

We create this environment with Docker Compose, using a [YAML](docker-compose.yml) file.




In [None]:
# Show the Docker network diagram beneath a horizontal rule.
image_dir = os.path.join(mutil.get_this_dir(), "artifacts", "images")
img_path = os.path.join(image_dir, "docker_network.png")

# The helper turns the image file into an embeddable HTML tag.
img_tag = mutil.get_embedded_image_tag_from_image_file(img_path)

my_html = f"<hr/>{img_tag}"
display(HTML(my_html))


## System Overview -- The data flow

Below is an informal representation of the application's data flow.

In [None]:
# Show the data-flow diagram beneath a horizontal rule.
path_parts = ("artifacts", "images", "streaming_visualization.jpg")
img_path = os.path.join(mutil.get_this_dir(), *path_parts)

# The helper turns the image file into an embeddable HTML tag.
img_tag = mutil.get_embedded_image_tag_from_image_file(img_path)

my_html = f"<hr/>{img_tag}"
display(HTML(my_html))

## Using Presto to Generate Reports

To generate reports for those interested in application usage, we can use Presto via the PyHive connector to load our Hive tables into Pandas dataframes, transform the dataframes to answer our business questions, and then publish the reports as JSON files.

Before we start, let's define two business questions that we would like to answer:

1. What are all the counts per event type?
2. What are all the values that were given for the `user` parameter?

First, let's install the PyHive library.

In [None]:
%%capture output
!python3 -m pip install git+https://github.com/dropbox/PyHive.git >/dev/null

Next, let's use PyHive to connect to Presto in code using the port that we exposed in our Docker Compose file. Once we connect, we can run a simple query to see all the tables that are created in Hive.

In [None]:
from pyhive import presto
import pandas as pd

# Connection settings for the Presto service.
presto_settings = {
    "host": 'presto',
    "port": 8080,  # Exposed Presto port (see docker compose file)
}
presto_conn = presto.connect(**presto_settings)

# Sanity check: list the tables visible through this connection.
pd.read_sql_query(sql="SHOW TABLES", con=presto_conn)

Now let's run a query to get all of the data from the `event_parameters` table and load it into a Pandas dataframe.

In [None]:
# Read the full `event_parameters` table into a DataFrame and preview its last
# rows. Approach taken from:
# https://stackoverflow.com/questions/55988436/how-to-convert-a-presto-query-output-to-a-python-data-frame
event_parameters_sql = "SELECT * from event_parameters"
event_parameters = pd.read_sql_query(sql=event_parameters_sql, con=presto_conn)
event_parameters.tail()

Let's do the same thing for the data in the `all_events` table.

In [None]:
# Read the full `all_events` table into a DataFrame and preview its last rows.
all_events_sql = "SELECT * from all_events"
all_events = pd.read_sql_query(sql=all_events_sql, con=presto_conn)
all_events.tail()

Now let's try to answer business question #1 - What are all the counts per event type? We can do this using a simple groupby statement on our `all_events` dataframe and then writing the output to a JSON file titled `event_type_count.json`.

In [None]:
# Business question #1: how many events were recorded per event type?
# groupby(...).size() returns a Series indexed by `event_type`.
event_type_count = all_events.groupby('event_type').size()

# Series.to_json documents only 'split', 'records', 'index' and 'table' as
# valid orients; 'index' writes the {event_type: count} mapping wanted here.
event_type_count.to_json("event_type_count.json", orient='index')

Note that `event_type_count.json` should exist in your directory after running the code above.


In [None]:
# Run the SQL stored in hive_queries/all_events.sql and preview the first rows.
# Note: this rebinds `all_events`, replacing the frame loaded by the earlier
# "SELECT * from all_events" cell.
sql_path = os.path.join(mutil.get_this_dir(), "hive_queries", "all_events.sql")
query_data = mutil.get_data_from_file(sql_path)

all_events = pd.read_sql_query(sql=query_data, con=presto_conn)
all_events.head()

Now let's answer business question #2 - What are all the values that were given for the `user` parameter? We can do this by running a slightly more complex query on our dataframe and again writing the output to a JSON file.

In [None]:
# Business question #2: which values were supplied for the `user` parameter,
# and how many times did each occur?
#
# Select the matching rows with a boolean mask. DataFrame.where() is not a row
# filter: it keeps every row, overwrites the non-matching ones with NaN (which
# can upcast column dtypes), and only works here because groupby drops NaN keys.
is_user_parameter = event_parameters['parameter_name'] == 'user'
user_parameter_count = (
    event_parameters.loc[is_user_parameter]
    .groupby('parameter_value')
    .size()
)

# Series.to_json documents only 'split', 'records', 'index' and 'table' as
# valid orients; 'index' writes the {parameter_value: count} mapping.
user_parameter_count.to_json("user_parameter_count.json", orient='index')

# Run the SQL stored in hive_queries/get_user_event_counts.hsql and preview
# the first 15 rows of the result.
hsql_path = os.path.join(mutil.get_this_dir(), "hive_queries", "get_user_event_counts.hsql")
query_data = mutil.get_data_from_file(hsql_path)

events_by_user = pd.read_sql_query(sql=query_data, con=presto_conn)
events_by_user.head(15)


## Turbo Charge Reporting -- Primitive Visualization

Add smarter queries.
Add some graphs.

In [None]:
# Load the report queries from hive_queries/report_queries.
# Judging by how the result is consumed below, the helper presumably returns
# {directory name: [dict, ...]} where keys ending in ".sql_content" hold the SQL
# text -- confirm against Utility.load_subdirs_into_dict.
report_dir = os.path.join(mutil.get_this_dir(), "hive_queries", "report_queries")
query_dir_dict = mutil.load_subdirs_into_dict(report_dir, "sql")

# Report name -> result DataFrame; the plotting cells read from this mapping.
rep_for_display = {}

for qd in query_dir_dict["report_queries"]:
    for key, sql_text in qd.items():
        # Only entries carrying SQL text are executed.
        if ".sql_content" not in key:
            continue
        name = key.replace(".sql_content", "")
        display(HTML(name))
        report_payload = pd.read_sql_query(sql_text, presto_conn)
        display(report_payload)
        rep_for_display[name] = report_payload
        

In [None]:
# Bar and pie chart of the "GET EVENT TOTALS" report, side by side.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))

# set_index(..., drop=False) returns a new frame indexed by event_type while
# keeping the column, so the report cached in `rep_for_display` is not mutated
# (assigning to `df.index` would modify the cached frame in place).
df = rep_for_display["GET EVENT TOTALS"].set_index("event_type", drop=False)

display(HTML("GET EVENT TOTALS"))
df.plot.bar(x='event_type', y='event_count', rot=0, ax=axs[0])
# The pie wedges are labelled from the index, so the legend is redundant.
df.plot.pie(y='event_count', rot=90, ax=axs[1]).get_legend().remove()

In [None]:
# Bar and pie chart of the "LOOK AT 10 MOST POPULAR SWORDS" report, side by side.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))

# set_index(..., drop=False) returns a new frame indexed by sword_name while
# keeping the column, so the report cached in `rep_for_display` is not mutated
# (assigning to `df.index` would modify the cached frame in place).
df = rep_for_display["LOOK AT 10 MOST POPULAR SWORDS"].set_index("sword_name", drop=False)

display(HTML("LOOK AT 10 MOST POPULAR SWORDS"))
df.plot.bar(x='sword_name', y='popular_sword_count', rot=45, ax=axs[0])
# The pie wedges are labelled from the index, so the legend is redundant.
df.plot.pie(y='popular_sword_count', rot=45, ax=axs[1]).get_legend().remove()

In [None]:
# Bar and pie chart of the "LOOK USER GUILD JOIN COUNT" report, side by side.
fig, axs = plt.subplots(1, 2, figsize=(14, 7))
display(HTML("LOOK USER GUILD JOIN COUNT"))

# set_index(..., drop=False) returns a new frame indexed by user_name while
# keeping the column, so the report cached in `rep_for_display` is not mutated
# (assigning to `df.index` would modify the cached frame in place).
df = rep_for_display["LOOK USER GUILD JOIN COUNT"].set_index("user_name", drop=False)

# NOTE(review): the column is spelled 'guid_join_count' (not 'guild'); it must
# match the alias used in the report SQL -- confirm before renaming.
df.plot.bar(x='user_name', y='guid_join_count', rot=45, ax=axs[0])
# The pie wedges are labelled from the index, so the legend is redundant.
df.plot.pie(y='guid_join_count', rot=45, ax=axs[1]).get_legend().remove()

Now that we have answered the business questions and generated JSON reports with the answers, we can close our Presto connection.

In [None]:
# All queries have run; close the Presto connection opened earlier in the notebook.
presto_conn.close()