In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
from datetime import datetime,timedelta

In [2]:
# Build (or reuse) the Spark session for this notebook.
# spark.jars.packages lists the Kafka, Avro and MongoDB connector packages,
# and both the Mongo input and output URIs point at twitter_db.tweets.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-avro_2.12:3.0.0,org.mongodb.spark:mongo-spark-connector_2.12:3.0.0")
    .config("spark.mongodb.input.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets")
    .config("spark.mongodb.output.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets")
    .getOrCreate()
)

In [3]:
# Dash application object that owns the layout and the callbacks.
app = dash.Dash(__name__)

# Colour palette used by the layout
# (alternative dark background that is currently disabled: '#0C0F0A').
colors = dict(background='white', text='#FFFFFF')

def create_header(title):
    """Build the page header of the dashboard.

    Parameters
    ----------
    title : String
        Text placed inside the header's H1 element

    Returns
    ----------
        header: html.Header wrapping a single styled html.H1
    """

    # The styling is attached to the H1 itself, not to the enclosing Header.
    heading = html.H1(
        children=title,
        style={
            'background-color' : '#1B95E0',
            'padding' : '1.5rem',
            'color': 'white',
            'font-family': 'Verdana, Geneva, sans-serif',
        },
    )
    return html.Header(heading)

def generate_table(df, max_rows=10):
    """Render a pandas DataFrame as an html table, showing at most ``max_rows`` rows.

    Parameters
    ----------
    df : DataFrame
        Pandas dataframe; its column labels become the (title-cased) table header
    max_rows: int or None
        Maximum number of leading rows to render. ``None`` renders every row.

    Returns
    ----------
        table: html.Table with a Thead (one header row) and a Tbody (one Tr per row)
    """

    # Honour the documented row limit (it was previously accepted but never applied).
    visible = df if max_rows is None else df.head(max_rows)

    # str() keeps non-string column labels (e.g. integers) from breaking .title().
    header_row = html.Tr(
        children=[html.Th(str(col).title()) for col in df.columns]
    )

    body_rows = [
        html.Tr(children=[html.Td(cell) for cell in row])
        for row in visible.values.tolist()
    ]

    table = html.Table(
        className="responsive-table",
        children=[
            html.Thead(header_row),
            html.Tbody(body_rows),
        ],
    )

    return table

#Layout definition - contains a header, input box to get search term, a graph and a table
app.layout = html.Div(
    style={'backgroundColor': colors['background']},
    children=[
        # Page banner
        html.Div([create_header('Live Dashboard - Twitter Sentiment Analysis')]),

        # Left column: search box and the live sentiment graph
        html.Div(
            [
                "Search Term: ",  # label typo ("Serch") fixed
                dcc.Input(id='sentiment_term', value='twitter', type='text',
                          placeholder='Enter word to be searched'),
                dcc.Graph(id='live-graph', animate=False),
            ],
            style={'width': '64%', 'display': 'inline-block'},
        ),

        # Right column: table of recent tweets, filled in by a callback
        html.Div(
            [
                html.H2("Recent Tweets"),
                html.Div(id="recent-tweets-table"),
            ],
            style={'width': '34%', 'display': 'inline-block'},
        ),

        #Intervals define the frequency in which the html element should be updated
        # (interval is in milliseconds: graph every 1 s, table every 10 s)
        dcc.Interval(id='graph-update', interval=1*1000, n_intervals=0),
        dcc.Interval(id='recent-table-update', interval=10*1000, n_intervals=0),
    ],
)

In [4]:
#Call back for live graph
@app.callback(Output('live-graph', 'figure'),
              Input('graph-update', 'n_intervals'),
              Input('sentiment_term', 'value')
             )
def update_graph_scatter(n_intervals,sentiment_term):
    """Rebuild the live sentiment graph for the current search term.

    Reads the tweets of the last 200 seconds from MongoDB through Spark, keeps
    the ones whose text contains the search term (case-insensitive) and plots
    ``prediction`` against ``timestamp_ms``.

    Parameters
    ----------
    n_intervals : int
        Tick counter of the 'graph-update' interval; only used as a trigger
    sentiment_term: str
        Search term to analyse the sentiment. ``None`` or an empty string
        matches every tweet.

    Returns
    ----------
        figure: dict with 'data' (one Scatter trace) and 'layout' keys, or
        ``dash.no_update`` when the refresh fails so the previous figure stays
        on screen
    """
    # Explicit alias instead of the notebook's wildcard import, which shadows
    # builtins such as min/max with Spark column functions.
    from pyspark.sql import functions as F

    try:
        # A cleared input box can deliver None; treat it like an empty term.
        term = sentiment_term or ''

        #Read data from MongoDB for last 200 seconds
        # NOTE(review): assumes timestamp_ms compares correctly against a
        # 'YYYY-MM-DD HH:MM:SS' string - confirm against the collection schema.
        time_diff = (datetime.utcnow() - timedelta(seconds=200)).strftime('%Y-%m-%d %H:%M:%S')

        # Column expressions instead of a concatenated SQL string: the user's
        # text is passed as a literal, so quotes cannot break or alter the query.
        # Unlike the former LIKE pattern, '%' and '_' are matched literally.
        matches = (F.col("timestamp_ms") > time_diff) & \
            F.lower(F.col("text")).contains(term.lower())

        tweets = (
            spark.read.format("mongo").load()
            .select("timestamp_ms", "text", "prediction")
            .where(matches)
            .toPandas()
        )
        tweets = tweets.dropna().sort_values('timestamp_ms')

        #Define X and Y axis values
        X = tweets["timestamp_ms"]
        Y = tweets['prediction']

        #Scatter graph definition
        trace = go.Scatter(
            x=X,
            y=Y,
            name='Scatter',
            mode='lines+markers'
        )

        layout_options = dict(
            yaxis=dict(range=[0, 1]),
            title='Twitter Sentiment {}'.format(term)
        )
        # With no matching tweets min()/max() are NaN, so only pin the x-range
        # when there is data to span.
        if not tweets.empty:
            layout_options['xaxis'] = dict(range=[X.min(), X.max()])

        return {'data': [trace], 'layout': go.Layout(**layout_options)}

    except Exception as e:
        #File to capture exceptions
        with open('errors.txt','a') as f:
            f.write('{}: {}'.format(type(e).__name__, e))
            f.write('\n')
        # Keep the last good figure rather than handing None to the graph.
        return dash.no_update

#Call back for table to populate the latest 5 tweets
@app.callback(Output('recent-tweets-table', 'children'),
              Input('recent-table-update', 'n_intervals'),
              Input('sentiment_term', 'value')
             )
def update_recent_tweets(n_intervals,sentiment_term):
    """Rebuild the "Recent Tweets" table for the current search term.

    Reads the tweets of the last 200 seconds from MongoDB through Spark, keeps
    the ones whose text contains the search term (case-insensitive) and shows
    the 5 most recent as an html table with timestamp, tweet and sentiment
    columns.

    Parameters
    ----------
    n_intervals : int
        Tick counter of the 'recent-table-update' interval; only used as a trigger
    sentiment_term: str
        Search term to analyse the sentiment. ``None`` or an empty string
        matches every tweet.

    Returns
    ----------
        table: html table of the latest 5 matching tweets, or ``dash.no_update``
        when the refresh fails so the previous table stays on screen
    """
    # Explicit alias instead of the notebook's wildcard import, which shadows
    # builtins such as min/max with Spark column functions.
    from pyspark.sql import functions as F

    max_tweets = 5

    try:
        # A cleared input box can deliver None; treat it like an empty term.
        term = sentiment_term or ''

        #Read data from MongoDB for last 200 seconds
        # NOTE(review): assumes timestamp_ms compares correctly against a
        # 'YYYY-MM-DD HH:MM:SS' string - confirm against the collection schema.
        time_diff = (datetime.utcnow() - timedelta(seconds=200)).strftime('%Y-%m-%d %H:%M:%S')

        # Column expressions instead of a concatenated SQL string: the user's
        # text is passed as a literal, so quotes cannot break or alter the query.
        # Unlike the former LIKE pattern, '%' and '_' are matched literally.
        matches = (F.col("timestamp_ms") > time_diff) & \
            F.lower(F.col("text")).contains(term.lower())

        tweets = (
            spark.read.format("mongo").load()
            .select("timestamp_ms", "text", "prediction")
            .where(matches)
            # Newest first, so the limit keeps the most recent tweets rather
            # than an arbitrary handful.
            .orderBy(F.col("timestamp_ms").desc())
            .limit(max_tweets)
            .toPandas()
        )

        # Rename to the display names and fix the column order in one step
        # (replaces the copy-then-drop sequence whose drop() result was discarded).
        table_df = tweets.rename(columns={
            'timestamp_ms': 'timestamp',
            'text': 'tweet',
            'prediction': 'sentiment',
        })[['timestamp', 'tweet', 'sentiment']]

        return generate_table(table_df, max_rows=max_tweets)
    except Exception as e:
        #File to capture exceptions
        with open('table_errors.txt','a') as f:
            f.write('{}: {}'.format(type(e).__name__, e))
            f.write('\n')
        # Keep the last good table rather than blanking it with None.
        return dash.no_update

# Start the Dash server only when this code runs as the entry point.
if __name__ == '__main__':
    app.run_server(
        host='0.0.0.0',       # listen on every network interface
        port=8050,
        debug=False,
        use_reloader=False,
    )

Dash is running on http://0.0.0.0:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://0.0.0.0:8050/ (Press CTRL+C to quit)
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:34] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:35] "[37mGET / HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:36] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:36] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:36] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
172.21.0.1 - - [11/May/2021 17:51:36] "