In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import dash
import dash_core_components as dcc
from dash import html
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
from datetime import datetime
import time

In [None]:
# Build (or reuse) the Spark session used by every query in this notebook.
# The Cassandra connector packages and the connection/auth settings are
# supplied as Spark configuration entries.
# NOTE(review): the Cassandra username and password are hard-coded below;
# consider loading them from the environment before sharing the notebook.
spark = (
    SparkSession.builder
    .appName("log_analytics")
    .config(
        "spark.jars.packages",
        "com.datastax.spark:spark-cassandra-connector_2.12:3.0.0,com.datastax.spark:spark-cassandra-connector-driver_2.12:3.0.0",
    )
    .config("spark.cassandra.connection.host", "cassandra")
    .config("spark.cassandra.auth.username", "cassandra")
    .config("spark.cassandra.auth.password", "cassandra")
    .getOrCreate()
)

In [3]:
# Cassandra connection details plus the keyspace/table that read_cassandra()
# loads. Only key_space and table_name are referenced by the code visible in
# this notebook.
cassandra_host, cassandra_port = "cassandra", 9042
cassandra_user, cassandra_pwd = "cassandra", "cassandra"
key_space, table_name = "loganalysis", "nasalog"

In [8]:
def read_cassandra(filter_condition, group_by, limit=False):
    """Count rows of the Cassandra log table per ``group_by`` value.

    Parameters
    ----------
    filter_condition : str
        Spark SQL predicate applied to the table before grouping.
    group_by : str
        Name of the column to group on.
    limit : bool, optional
        When true, return only the five groups with the highest counts
        (ties ordered by ``group_by``). When false, return every group
        ordered by ``group_by``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``group_by`` column and a ``count`` column.
    """
    logs_df = (
        spark.read
        .format("org.apache.spark.sql.cassandra")
        .options(table=table_name, keyspace=key_space)
        .load()
        .filter(filter_condition)
    )
    counts_df = logs_df.groupBy(group_by).count()

    if limit:
        # A "top five" must be ranked by frequency. Sorting by the group key
        # and then truncating would only return the five smallest keys.
        ranked_df = counts_df.sort(["count", group_by], ascending=[False, True])
        return ranked_df.limit(5).toPandas()

    return counts_df.sort(group_by).toPandas()

In [9]:
# Column layout passed to spark.read.csv() in unique_hosts(); every field is
# declared as a string.
schema = (
    "host string , time string , method string, "
    "path string , status string, content_size string"
)

def unique_hosts(group_by, time_format=None, filter_resp=False):
    """Count log records from the HDFS CSV output per ``group_by`` value.

    Parameters
    ----------
    group_by : str
        Name of the column to group on. When ``time_format`` is given, this
        column is created (or overwritten) from the ``time`` column.
    time_format : str, optional
        Spark datetime pattern. ``time`` is passed through ``from_unixtime``
        and formatted with this pattern to build the ``group_by`` column.
    filter_resp : bool, optional
        When true, keep only records whose ``status`` is 404.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``group_by`` column and a ``count``
        column, ordered by ``group_by``.
    """
    logs_df = spark.read.csv("hdfs://namenode:8020/output/nasa_logs/", schema=schema)

    if time_format:
        bucket = date_format(from_unixtime(col("time")), time_format)
        logs_df = logs_df.withColumn(group_by, bucket)

    if filter_resp:
        # The CSV schema names this column "status". The earlier predicate
        # referenced a non-existent "response" column and failed at analysis.
        logs_df = logs_df.filter(col("status") == "404")

    # Only the first 80000 rows returned by the reader are aggregated.
    agg_df = logs_df.limit(80000).groupBy(group_by).count().sort(group_by)
    return agg_df.toPandas()

In [10]:
# Dash application object. The initial layout only holds the URL tracker and
# an empty container; the per-page components that the callbacks target are
# rendered into 'page-content' later, hence suppress_callback_exceptions=True.
app = dash.Dash(__name__, suppress_callback_exceptions=True)

app.layout = html.Div(
    children=[
        dcc.Location(id='url', refresh=False),
        html.Div(id='page-content'),
    ],
    style={'textAlign': 'center'},
)

In [11]:
# Colour palette used by the dashboard layouts.
colors = dict(
    background='white',  # alternative kept from the original: '#0C0F0A'
    text='#FFFFFF',
)

def create_header(title):
    """Build the banner shown at the top of each page.

    Parameters
    ----------
    title : str
        Text displayed inside the banner.

    Returns
    -------
    dash.html.Header
        A ``<header>`` element wrapping a styled ``<h1>``.
    """
    # Dash hands ``style`` to React, which expects camelCase property names
    # (as every other style dict in this notebook uses), not CSS kebab-case.
    header_style = {
        'backgroundColor': '#1B95E0',
        'padding': '1.5rem',
        'color': 'white',
        'fontFamily': 'Verdana, Geneva, sans-serif',
    }
    return html.Header(html.H1(children=title, style=header_style))

def generate_table(df, max_rows=10):
    """Render a pandas DataFrame as a Dash HTML table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to display; column labels become title-cased header cells.
    max_rows : int or None, optional
        Maximum number of data rows to render. ``None`` renders every row.

    Returns
    -------
    dash.html.Table
        Table with one header row and up to ``max_rows`` body rows.
    """
    # Honour max_rows; it was previously accepted but never applied.
    visible = df if max_rows is None else df.head(max_rows)

    header = html.Thead(
        # str() guards against non-string column labels (e.g. integers).
        html.Tr(children=[html.Th(str(label).title()) for label in df.columns.values]),
        style={'border': '1px black solid'},
    )
    body = html.Tbody(
        [
            html.Tr(children=[html.Td(value) for value in row])
            for row in visible.values.tolist()
        ],
        style={'border': '1px black solid'},
    )

    return html.Table(
        className="responsive-table",
        children=[header, body],
        style={'marginLeft': 'auto', 'marginRight': 'auto'},
    )

In [12]:
# Landing page: banner followed by links to the three dashboards.
index_page = html.Div(
    children=[
        html.Div(children=[create_header('Log Analysis - Dashboard')]),
        dcc.Link('Go to Realtime Dash Board', href='/real-time'),
        html.Br(),
        dcc.Link('Go to Hourly Dash Board', href='/hourly'),
        html.Br(),
        dcc.Link('Go to Daily Dash Board', href='/daily'),
    ]
)

# Real-time page: two graphs and a "Top Paths" table, all refreshed by the
# 'graph-update' interval, followed by navigation links.
realtime_dashboard = html.Div(
    style={'backgroundColor': colors['background']},
    children=[
        html.Div(children=[create_header('Log Analysis - Realtime Dashboard')]),
        html.Div(
            children=[dcc.Graph(id='live-graph', animate=False)],
            style={'width': '100%', 'display': 'inline-block'},
        ),
        html.Div(
            children=[dcc.Graph(id='live-graph1', animate=False)],
            style={'width': '100%', 'display': 'inline-block'},
        ),
        html.Div(
            children=[
                html.H2("Top Paths"),
                html.Div(id="top-paths-table"),
            ],
            style={'width': '50%', 'display': 'inline-block', 'border': '2px black solid'},
        ),
        # The interval sets how often the dependent callbacks fire
        # (value is in milliseconds, i.e. once per minute).
        dcc.Interval(id='graph-update', interval=60 * 1000, n_intervals=0),
        html.Div(id='real-time-content'),
        html.Br(),
        dcc.Link('Go to Hourly Dash Board', href='/hourly'),
        html.Br(),
        dcc.Link('Go to Daily Dash Board', href='/daily'),
        html.Br(),
        dcc.Link('Go back to home', href='/'),
    ],
)

# Hourly page: a single graph refreshed by the 'hourly-graph-update'
# interval, followed by navigation links.
hourly_dashboard = html.Div(
    style={'backgroundColor': colors['background']},
    children=[
        html.Div(children=[create_header('Log Analysis - Hourly Dashboard')]),
        html.Div(
            children=[dcc.Graph(id='hourly-graph', animate=False)],
            style={'width': '100%', 'display': 'inline-block'},
        ),
        # The interval sets how often the dependent callback fires
        # (value is in milliseconds, i.e. once per minute).
        dcc.Interval(id='hourly-graph-update', interval=60 * 1000, n_intervals=0),
        html.Div(id='hourly-content'),
        html.Br(),
        dcc.Link('Go to Daily Dash Board', href='/daily'),
        html.Br(),
        dcc.Link('Go to RealTime Dash Board', href='/real-time'),
        html.Br(),
        dcc.Link('Go back to home', href='/'),
    ],
)

# Daily page: two graphs refreshed by the 'daily-graph-update' interval,
# followed by navigation links.
daily_dashboard = html.Div(
    style={'backgroundColor': colors['background']},
    children=[
        html.Div(children=[create_header('Log Analysis - Daily Dashboard')]),
        html.Div(
            children=[dcc.Graph(id='daily-graph', animate=False)],
            style={'width': '100%', 'display': 'inline-block'},
        ),
        html.Div(
            children=[dcc.Graph(id='daily-graph1', animate=False)],
            style={'width': '100%', 'display': 'inline-block'},
        ),
        # The interval sets how often the dependent callbacks fire
        # (value is in milliseconds, i.e. once per minute).
        dcc.Interval(id='daily-graph-update', interval=60 * 1000, n_intervals=0),
        html.Div(id='daily-content'),
        html.Br(),
        dcc.Link('Go to RealTime Dash Board', href='/real-time'),
        html.Br(),
        dcc.Link('Go to Hourly Dash Board', href='/hourly'),
        html.Br(),
        dcc.Link('Go back to home', href='/'),
    ],
)

In [13]:
# Callback for the live status-code scatter graph
@app.callback(
    Output('live-graph', 'figure'),
    Input('graph-update', 'n_intervals'),
)
def update_graph_scatter(n_intervals):
    """Refresh the status-code scatter plot on the real-time page.

    Fired by the 'graph-update' interval. Counts Cassandra log rows per
    ``status`` (restricted to rows whose status casts to a decimal) and plots
    count against status code.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dict or dash.no_update
        Figure with ``data`` and ``layout``. On any failure the error text is
        appended to ``errors.txt`` and ``dash.no_update`` is returned so the
        graph keeps its previous figure instead of receiving ``None``.
    """
    try:
        # A lower bound of 0 means every stored record is considered on each
        # refresh; no incremental state is kept between calls.
        since = 0
        filter_condition = "CAST(status AS DECIMAL) IS NOT NULL and time >'" + str(since) + "'"
        status_counts = read_cassandra(filter_condition, 'status').dropna()

        # X and Y axis values
        statuses = status_counts["status"]
        counts = status_counts['count']

        trace = go.Scatter(x=statuses, y=counts)
        layout = go.Layout(
            xaxis=dict(range=[statuses.min(), statuses.max()], title='Status Codes'),
            yaxis=dict(range=[counts.min(), counts.max()], title='Count'),
            title='Status graphing',
        )
        return {'data': [trace], 'layout': layout}

    except Exception as e:
        # File to capture exceptions
        with open('errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update
            
@app.callback(
    Output('live-graph1', 'figure'),
    Input('graph-update', 'n_intervals'),
)
def update_graph_scatter1(n_intervals):
    """Refresh the status-code pie chart on the real-time page.

    Fired by the 'graph-update' interval. Counts Cassandra log rows per
    ``status`` (restricted to rows whose status casts to a decimal) and draws
    one pie slice per status code.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dict or dash.no_update
        Figure containing a single pie trace. On any failure the error text
        is appended to ``errors.txt`` and ``dash.no_update`` is returned so
        the graph keeps its previous figure instead of receiving ``None``.
    """
    try:
        # A lower bound of 0 means every stored record is considered on each
        # refresh; no incremental state is kept between calls.
        since = 0
        filter_condition = "CAST(status AS DECIMAL) IS NOT NULL and time >'" + str(since) + "'"
        status_counts = read_cassandra(filter_condition, 'status').dropna()

        # Pie chart definition: one slice per status code, sized by its count
        trace = go.Pie(
            labels=status_counts["status"],
            values=status_counts['count'],
        )
        return {'data': [trace]}

    except Exception as e:
        # File to capture exceptions
        with open('errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update


        

@app.callback(
    Output('top-paths-table', 'children'),
    Input('graph-update', 'n_intervals'),
)
def update_top_urls(n_intervals):
    """Refresh the "Top Paths" table on the real-time page.

    Fired by the 'graph-update' interval. Asks ``read_cassandra`` for a
    limited per-``path`` count and renders it with ``generate_table``.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dash.html.Table or dash.no_update
        The rendered table. On any failure the error text is appended to
        ``table_errors.txt`` and ``dash.no_update`` is returned so the
        container keeps its previous content instead of receiving ``None``.
    """
    try:
        # A lower bound of 0 means every stored record is considered on each
        # refresh; no incremental state is kept between calls.
        since = 0
        filter_condition = "time >'" + str(since) + "'"
        path_counts = read_cassandra(filter_condition, 'path', True)

        # Fix the column order shown in the table.
        return generate_table(path_counts[['path', 'count']], max_rows=5)

    except Exception as e:
        # File to capture exceptions
        with open('table_errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update

# Route the current URL path to the matching page layout
@app.callback(
    dash.dependencies.Output('page-content', 'children'),
    [dash.dependencies.Input('url', 'pathname')],
)
def display_page(pathname):
    """Return the layout for ``pathname``; unknown paths get the index page."""
    if pathname == '/real-time':
        return realtime_dashboard
    if pathname == '/hourly':
        return hourly_dashboard
    if pathname == '/daily':
        return daily_dashboard
    return index_page

In [14]:
@app.callback(
    Output('hourly-graph', 'figure'),
    Input('hourly-graph-update', 'n_intervals'),
)
def update_hourly_scatter(n_intervals):
    """Refresh the per-hour request-count graph on the hourly page.

    Fired by the 'hourly-graph-update' interval. Uses ``unique_hosts`` to
    bucket records by hour (pattern ``yy-MM-dd-HH``) and plots the count per
    bucket.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dict or dash.no_update
        Figure with ``data`` and ``layout``. On any failure the error text is
        appended to ``errors.txt`` and ``dash.no_update`` is returned so the
        graph keeps its previous figure instead of receiving ``None``.
    """
    try:
        hourly_counts = unique_hosts("hour", 'yy-MM-dd-HH').dropna()

        # X and Y axis values
        hours = hourly_counts["hour"]
        hits = hourly_counts['count']

        trace = go.Scatter(x=hours, y=hits)
        layout = go.Layout(
            xaxis=dict(range=[hours.min(), hours.max()], title='Hour in a day (yy-MM-dd-HH)'),
            yaxis=dict(range=[hits.min(), hits.max()], title='Number of Hits'),
            title='Unique Requests Per Hour',
        )
        return {'data': [trace], 'layout': layout}

    except Exception as e:
        # File to capture exceptions
        with open('errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update

In [15]:
@app.callback(
    Output('daily-graph', 'figure'),
    Input('daily-graph-update', 'n_intervals'),
)
def update_daily_scatter(n_intervals):
    """Refresh the per-day request-count graph on the daily page.

    Fired by the 'daily-graph-update' interval. Uses ``unique_hosts`` to
    bucket records by day (pattern ``yy-MM-dd``) and plots the count per
    bucket.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dict or dash.no_update
        Figure with ``data`` and ``layout``. On any failure the error text is
        appended to ``errors.txt`` and ``dash.no_update`` is returned so the
        graph keeps its previous figure instead of receiving ``None``.
    """
    try:
        daily_counts = unique_hosts("day", 'yy-MM-dd').dropna()

        # X and Y axis values
        days = daily_counts["day"]
        hits = daily_counts['count']

        trace = go.Scatter(x=days, y=hits)
        layout = go.Layout(
            xaxis=dict(range=[days.min(), days.max()], title='Day'),
            yaxis=dict(range=[hits.min(), hits.max()], title='Count'),
            title='Unique Request Per Day',
        )
        return {'data': [trace], 'layout': layout}

    except Exception as e:
        # File to capture exceptions
        with open('errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update
            
@app.callback(
    Output('daily-graph1', 'figure'),
    Input('daily-graph-update', 'n_intervals'),
)
def update_daily_pie(n_intervals):
    """Refresh the status-code pie chart on the daily page.

    Fired by the 'daily-graph-update' interval. Uses ``unique_hosts`` to
    count records per ``status`` and draws one pie slice per status code.

    Parameters
    ----------
    n_intervals : int
        Interval tick counter supplied by Dash; not used in the computation.

    Returns
    -------
    dict or dash.no_update
        Figure containing a single pie trace. On any failure the error text
        is appended to ``errors.txt`` and ``dash.no_update`` is returned so
        the graph keeps its previous figure instead of receiving ``None``.
    """
    try:
        status_counts = unique_hosts("status").dropna()

        # Pie chart definition: one slice per status code, sized by its count
        trace = go.Pie(
            labels=status_counts["status"],
            values=status_counts['count'],
        )
        return {'data': [trace]}

    except Exception as e:
        # File to capture exceptions
        with open('errors.txt', 'a') as f:
            f.write(str(e))
            f.write('\n')
        return dash.no_update


In [None]:
# Start the Dash server on all interfaces, port 8050, when this cell/script
# is the entry point. Debug mode and the reloader are both off.
if __name__ == '__main__':
    app.run_server(
        host='0.0.0.0',
        port=8050,
        debug=False,
        use_reloader=False,
    )

Dash is running on http://0.0.0.0:8050/

Dash is running on http://0.0.0.0:8050/

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on all addresses.
 * Running on http://172.20.0.4:8050/ (Press CTRL+C to quit)
196.89.62.246 - - [05/Sep/2023 07:55:40] "GET / HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:40] "GET /_dash-component-suites/dash/deps/polyfill@7.v2_0_0m1637516547.12.1.min.js HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:41] "GET /_dash-component-suites/dash/deps/react@16.v2_0_0m1637516547.14.0.min.js HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:41] "GET /_dash-component-suites/dash/deps/react-dom@16.v2_0_0m1637516547.14.0.min.js HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:41] "GET /_dash-component-suites/dash/deps/prop-types@15.v2_0_0m1637516547.7.2.min.js HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:41] "GET /_dash-component-suites/dash/dash-renderer/build/dash_renderer.v2_0_0m1637516547.min.js HTTP/1.1" 200 -
196.89.62.246 - - [05/Sep/2023 07:55:41] "GET /_dash-component-suites/dash/dcc/dash_core_components.v2_0_0m1637516547.js HTTP/1.1" 200 -
196.89.62