In [None]:
import base64
from io import BytesIO

import dash
import dash.dependencies as dd
import dash_bootstrap_components as dbc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from jupyter_dash import JupyterDash
from sklearn.feature_extraction.text import CountVectorizer
from sqlalchemy import MetaData, Table
from sqlalchemy import create_engine
from sqlalchemy import select
from wordcloud import WordCloud

### Load data from database
If no connection to the database is available, skip this section and scroll down to the cells that load the data from CSV files instead.

In [None]:
# Read the file 'sentiment140_uncleaned.csv' from the working directory.
df_sentiment = pd.read_csv(filepath_or_buffer='sentiment140_uncleaned.csv')

In [None]:
# Open a connection to the local MySQL server (pymysql driver, port 3306).
# NOTE(review): this is a plain string, not an f-string, so `{myPassword}` and
# `{myDatabase}` are sent literally -- presumably placeholders to be replaced
# by hand before running; confirm.
engine = create_engine(
    'mysql+pymysql://root:{myPassword}@localhost:3306/{myDatabase}'
)
connection = engine.connect()

In [None]:
# Load the cleaned sentiment140 dataset from the `twitter_data` table.
metadata = MetaData()
# `autoload_with` on its own triggers reflection of the table definition; the
# separate `autoload=True` flag is deprecated in SQLAlchemy 1.4 and rejected
# by 2.0, so it is no longer passed.
twitter_data = Table('twitter_data', metadata, autoload_with=engine)
# `Table.select()` builds the same "SELECT every column" statement as the
# legacy list form `select([table])`, but is accepted by 1.x and 2.0 alike.
stmt = twitter_data.select()
query_result = connection.execute(stmt)
# Capture the column names before the rows are consumed.
column_names = list(query_result.keys())
result = query_result.fetchall()
# Create the data frame with explicit column names so that later lookups such
# as df_train_cleaned["polarity"] do not depend on how pandas introspects the
# row objects (and still work when the table is empty).
df_train_cleaned = pd.DataFrame(result, columns=column_names)

In [None]:
# Load the custom dataset from the `custom_twitter_data` table.
metadata = MetaData()
# `autoload_with` on its own triggers reflection of the table definition; the
# separate `autoload=True` flag is deprecated in SQLAlchemy 1.4 and rejected
# by 2.0, so it is no longer passed.
custom_data = Table('custom_twitter_data', metadata, autoload_with=engine)
# `Table.select()` builds the same "SELECT every column" statement as the
# legacy list form `select([table])`, but is accepted by 1.x and 2.0 alike.
stmt = custom_data.select()
query_result = connection.execute(stmt)
# Capture the column names before the rows are consumed.
column_names = list(query_result.keys())
result = query_result.fetchall()
# Explicit column names keep the frame's schema independent of how pandas
# introspects the row objects (and give an empty table the right columns).
custom_test_df_cleaned = pd.DataFrame(result, columns=column_names)

### Load data from CSV files
Use this cell to load the data from CSV files if there is no connection to the database.

In [None]:
# Read the three CSV files (in this order) from the working directory.
df_sentiment, df_train_cleaned, custom_test_df_cleaned = [
    pd.read_csv(csv_name)
    for csv_name in (
        'sentiment140_uncleaned.csv',
        'train_cleaned.csv',
        'customData_cleaned.csv',
    )
]

### Create plots to show in dash
Run all cells in this section so that the Dash app can be started later.

In [None]:
def get_words_frequency(corpus):
    """Count how often each vocabulary word occurs in ``corpus``.

    A ``CountVectorizer`` is fitted on ``corpus``, the corpus is transformed
    with it, and the resulting counts are summed along axis 0 to get one
    total per vocabulary entry.

    Args:
        corpus: The texts handed to ``CountVectorizer.fit`` / ``transform``.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer().fit(corpus)
    column_totals = vectorizer.transform(corpus).sum(axis=0)

    word_counts = []
    # vocabulary_ maps each word to its column index in the count matrix.
    for word, column in vectorizer.vocabulary_.items():
        word_counts.append((word, column_totals[0, column]))

    word_counts.sort(key=lambda pair: pair[1], reverse=True)
    return word_counts

In [None]:
# Split the training data by label: rows with polarity 0 go into the "neg"
# frame, rows with polarity 1 into the "pos" frame.
df_train_cleaned_neg = df_train_cleaned.loc[lambda frame: frame["polarity"] == 0]
df_train_cleaned_pos = df_train_cleaned.loc[lambda frame: frame["polarity"] == 1]

In [None]:
# Word frequencies of the tweet texts, computed for the positive subset first
# and the negative subset second.
pos_words_freq, neg_words_freq = [
    get_words_frequency(tweets["text"])
    for tweets in (df_train_cleaned_pos, df_train_cleaned_neg)
]

In [None]:
# get_words_frequency returns its list sorted by descending count, so the
# first 50 entries are the 50 most frequent words.
top_50_pos_words, top_50_neg_words = pos_words_freq[:50], neg_words_freq[:50]

In [None]:
# Turn the (word, count) tuples into two-column data frames for plotting.
df_top50_pos, df_top50_neg = [
    pd.DataFrame(word_counts, columns=['Word', 'Count'])
    for word_counts in (top_50_pos_words, top_50_neg_words)
]

In [None]:
# Get distribution of positive and negative tweets in train data set
# NOTE: this is a plain assignment, not a copy -- df_train_cleaned_labeled and
# df_train_cleaned refer to the same DataFrame, so the relabelling below also
# changes df_train_cleaned['polarity'] (which the histograms of
# df_train_cleaned plot).
df_train_cleaned_labeled = df_train_cleaned
# Replace the label 0 with the text 'Negative tweet'; every other value is
# taken from the existing column.
df_train_cleaned_labeled['polarity'] = np.where(df_train_cleaned_labeled['polarity'] == 0, 'Negative tweet', df_train_cleaned_labeled['polarity'])
# The remaining label is matched as the string '1', not the integer 1.
# NOTE(review): this presumes the previous np.where call turned the untouched
# integer labels into strings -- confirm. Re-running the polarity == 0 / == 1
# filter cell after this one would then presumably match no rows.
df_train_cleaned_labeled['polarity'] = np.where(df_train_cleaned_labeled['polarity'] == '1', 'Positive tweet', df_train_cleaned_labeled['polarity'])

In [None]:
# The figures in the following cells are optional: they do not need to be run for the Dash app to work.

In [None]:
# Bar per polarity value, coloured by the same column; the bare expression
# makes the notebook display the figure.
px.histogram(
    data_frame=df_train_cleaned,
    x="polarity",
    color="polarity",
)

In [None]:
# Top-50 positive words on a logarithmic y axis.
fig = px.histogram(
    df_top50_pos,
    x="Word",
    y="Count",
    log_y=True,
    title="Distribution of the most frequent positive words",
)
# Override the y-axis title; the call returns the figure, so it is displayed.
fig.update_layout(yaxis_title_text='Frequency')

In [None]:
# Top-50 negative words on a logarithmic y axis.
fig = px.histogram(
    df_top50_neg,
    x="Word",
    y="Count",
    log_y=True,
    title="Distribution of the most frequent negative words",
)
# Override the y-axis title; the call returns the figure, so it is displayed.
fig.update_layout(yaxis_title_text='Frequency')

## Create wordcloud

In [None]:
# All positive (word, count) pairs as a data frame -- not only the top 50.
df_pos = pd.DataFrame(data=pos_words_freq, columns=['Word', 'Count'])

In [None]:
# Word -> count mapping used as input for the word cloud.
# (A first `dict(zip(...))` assignment to `data` used to precede this line;
# it was overwritten immediately, so only the effective assignment is kept.)
data = df_pos.set_index('Word').to_dict()['Count']

In [None]:
# Build the positive word cloud from the word -> count mapping (at most 200
# words on a white 800x400 canvas).
wc = WordCloud(
    background_color='white',
    max_words=200,
    width=800,
    height=400,
).generate_from_frequencies(data)

# Draw it without axes, save it as a PNG and show it inline.
# NOTE(review): the file is written to the working directory, while the Dash
# callback serves "assets/wordcloud_pos.png" -- confirm how the image is meant
# to get into the assets folder.
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.savefig('wordcloud_pos.png', dpi=300)
plt.show()

In [None]:
# All negative (word, count) pairs as a data frame -- not only the top 50.
df_neg = pd.DataFrame(neg_words_freq, columns=['Word', 'Count'])
# Word -> count mapping used as input for the word cloud. (The earlier
# `dict(zip(...))` assignment was a dead store: it was overwritten right away.)
data_neg = df_neg.set_index('Word').to_dict()['Count']
wc_neg = WordCloud(
    width=800,
    height=400,
    max_words=200,
    background_color='white',
).generate_from_frequencies(data_neg)

# Draw it without axes, save it as a PNG and show it inline.
# NOTE(review): the file is written to the working directory, while the Dash
# callback serves "assets/wordcloud_neg.png" -- confirm how the image is meant
# to get into the assets folder.
plt.figure(figsize=(10, 10))
plt.imshow(wc_neg, interpolation='bilinear')
plt.axis('off')
plt.savefig('wordcloud_neg.png', dpi=300)
plt.show()

Run the following cells to create the ROC curve figure; they are required before the Dash app can be started.

In [None]:
# Load the stored ROC curve arrays (nothing is recomputed here).
# np.load returns an NpzFile that keeps the archive open; the `with` block
# closes it once every array needed below has been read into memory.
with np.load("arrays_for_roc.npz") as my_arrays:
    # arr_0 .. arr_3 feed the logistic regression trace.
    fpr_lr = my_arrays["arr_0"]
    tpr_lr = my_arrays["arr_1"]
    # Kept under its own name so it is no longer overwritten by arr_6 below.
    thresholds_lr = my_arrays["arr_2"]
    roc_auc_score_lr = my_arrays["arr_3"][0]
    # arr_4 .. arr_7 feed the naive Bayes trace.
    fpr = my_arrays["arr_4"]
    tpr = my_arrays["arr_5"]
    thresholds = my_arrays["arr_6"]
    roc_auc_score_nb = my_arrays["arr_7"][0]

In [None]:
# ROC comparison figure shown on the dashboard's evaluation page.
roc_fig = go.Figure()
roc_fig.update_layout(title_text='ROC Comparison')

# One line per model; the AUC (rounded to two decimals) goes into the legend.
roc_fig.add_trace(
    go.Scatter(
        x=fpr,
        y=tpr,
        name=f'Naive Bayes ROC, AUC = {round(roc_auc_score_nb,2)}',
        mode='lines',
    )
)
roc_fig.add_trace(
    go.Scatter(
        x=fpr_lr,
        y=tpr_lr,
        name=f'Logistic Regression ROC, AUC = {round(roc_auc_score_lr,2)}',
        # Was 'lines+lines'; repeating the flag adds nothing.
        mode='lines',
    )
)
# Dotted diagonal from (0, 0) to (1, 1) as the reference line. The trace used
# to be named 'k--' (a matplotlib format string), which showed up verbatim in
# the legend; it now carries a readable label.
roc_fig.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[0, 1],
        name='Random guess',
        line=dict(color='seagreen', width=4, dash='dot'),
        mode='lines',
    )
)
# Returns the figure, so the notebook displays it.
roc_fig.update_layout(xaxis_title="False Positive Rate", yaxis_title="True Positive Rate")

In [None]:
# JupyterDash is imported in the notebook's import cell together with every
# other dependency.
# suppress_callback_exceptions=True is needed because the callbacks below
# target components ('image', 'graph') that only exist once a page has been
# rendered into 'page-content', not in the initial layout.
app = JupyterDash(
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    suppress_callback_exceptions=True,
)
app.title = "Sentiment140 dashboard"

# Sidebar styling: pinned to the top, left and bottom edges of the viewport
# (position: fixed) with a fixed width.
SIDEBAR_STYLE = dict(
    position="fixed",
    top=0,
    left=0,
    bottom=0,
    width="25rem",
    padding="2rem 1rem",
    **{"background-color": "#f8f9fa"},
)

# Main content styling: the left margin equals the sidebar width, which places
# the content to the right of the sidebar; the right margin adds some padding.
CONTENT_STYLE = dict(
    [
        ("margin-left", "25rem"),
        ("margin-right", "2rem"),
    ]
)

# Sidebar: heading, short description and one navigation link per page.
sidebar = html.Div(
    style=SIDEBAR_STYLE,
    children=[
        html.H2("Sentiment140 dataset", className="display-4"),
        html.Hr(),
        html.P("Get some insights into Sentiment140's dataset.", className="lead"),
        dbc.Nav(
            children=[
                # The hrefs are the pathnames that render_page_content handles.
                dbc.NavLink(label, href=target, active="exact")
                for label, target in (
                    ("Dataset", "/"),
                    ("Word distribution", "/page-1"),
                    ("Evaluation", "/page-2"),
                )
            ],
            vertical=True,
            pills=True,
        ),
    ],
)

# Empty container that render_page_content fills with the selected page.
content = html.Div(id="page-content", style=CONTENT_STYLE)

# Page "/": histogram of the polarity column of the training data.
firstPage = html.P(
    html.Div(
        [
            html.Div(
                [
                    dcc.Graph(
                        id='pos_neg_graph',
                        figure=px.histogram(
                            df_train_cleaned,
                            x="polarity",
                            color='polarity',
                            title="Distribution of negative and positive tweets of Sentiment140's dataset",
                        ),
                    )
                ],
                className='first_page',
            )
        ],
        className='first',
    )
)

# Page "/page-1": a dropdown selecting the polarity, plus the graph and the
# word cloud image that the two dropdown callbacks fill in.
distributionPage = html.P(
    html.Div(
        [
            dcc.Dropdown(
                id="polarity-selector",
                options=[
                    {'label': polarity, 'value': polarity}
                    for polarity in ('Positive', 'Negative')
                ],
                value='Positive',
            ),
            html.Div(
                [
                    dcc.Graph(id='graph'),
                    html.H2("Wordcloud"),
                    html.Img(id='image', alt='image', style={'width': '1100px', 'height': 'auto'}),
                ],
                className='eight columns div-for-charts bg-grey',
            ),
        ],
        className='distr',
    )
)

# Page "/page-2": the ROC figure followed by one section per model, each made
# of a rule, a heading, an empty paragraph, the classification report image
# and the (800px wide) confusion matrix image.
evaluationPage = html.P(
    html.Div(
        className='ev',
        children=[dcc.Graph(id='roc_graph', figure=roc_fig)]
        + [
            component
            for heading, report_id, report_src, matrix_id, matrix_src in (
                (
                    "Logistic Regression Model",
                    'report_lr',
                    "assets/ClassificationReport_LR.png",
                    'matrix_lr',
                    "assets/ConfusionMatrix_LR.png",
                ),
                (
                    "Naive Bayes Model",
                    'report_nb',
                    "assets/ClassificationReport_NB.png",
                    'matrix_nb',
                    "assets/ConfusionMatrix_NB.png",
                ),
            )
            for component in (
                html.Hr(),
                html.H3(heading),
                html.P(),
                html.Img(id=report_id, alt='image', src=report_src),
                html.Img(id=matrix_id, alt='image', src=matrix_src, style={'width': '800px', 'height': 'auto'}),
            )
        ],
    )
)

# Overall layout: URL tracker, fixed sidebar and the page container.
app.layout = html.Div(children=[dcc.Location(id="url"), sidebar, content])

@app.callback(Output("page-content", "children"), [Input("url", "pathname")])
def render_page_content(pathname):
    """Return the page layout that belongs to the current URL pathname.

    Args:
        pathname: The ``pathname`` property of the ``url`` location component.

    Returns:
        ``firstPage``, ``distributionPage`` or ``evaluationPage`` for "/",
        "/page-1" and "/page-2" respectively; a 404 message for anything else.
    """
    known_pages = {
        "/": firstPage,
        "/page-1": distributionPage,
        "/page-2": evaluationPage,
    }
    if pathname in known_pages:
        return known_pages[pathname]

    # Any other pathname gets a "not found" notice.
    not_found_message = [
        html.H1("404: Not found", className="text-danger"),
        html.Hr(),
        html.P(f"The pathname {pathname} was not recognised..."),
    ]
    return html.Div(not_found_message, className="p-3 bg-light rounded-3")

@app.callback(Output("image", "src"), Input("polarity-selector", "value"))
def update_wordcloud_image(selected):
    """Return the asset path of the word cloud for the chosen polarity.

    This callback used to be called ``make_figure``, the same name as the
    graph callback defined further down, which silently shadowed it. The
    callback is registered by the decorator, so the rename changes nothing
    for Dash, and ``make_figure`` keeps referring to the graph callback.

    Args:
        selected: Current value of the ``polarity-selector`` dropdown.

    Returns:
        "assets/wordcloud_pos.png" when ``selected`` is 'Positive', otherwise
        "assets/wordcloud_neg.png".
    """
    if selected == 'Positive':
        return "assets/wordcloud_pos.png"
    return "assets/wordcloud_neg.png"

    

@app.callback(Output("graph", "figure"), Input("polarity-selector", "value"))
def make_figure(selected):
    """Build the top-50 word histogram for the chosen polarity.

    Args:
        selected: Current value of the ``polarity-selector`` dropdown.

    Returns:
        A log-scaled histogram of ``df_top50_pos`` when ``selected`` is
        'Positive', otherwise of ``df_top50_neg``.
    """
    if selected == 'Positive':
        word_counts = df_top50_pos
        chart_title = "Distribution of the top 50 most used words in positive tweets"
    else:
        word_counts = df_top50_neg
        chart_title = "Distribution of the top 50 most used words in negative tweets"

    # NOTE(review): "sum of Count" is not a column of the frame -- confirm
    # that this label entry has the intended effect on the y-axis title.
    axis_labels = {
        "Word": "Word",
        "sum of Count": "Frequency",
    }
    return px.histogram(
        word_counts,
        x="Word",
        y="Count",
        log_y=True,
        title=chart_title,
        labels=axis_labels,
    )


# Start the app on port 8090 with the dev-tools UI and hot reload switched on.
# NOTE(review): mode='external' presumably serves the app at a URL to open in
# a separate browser tab rather than inline -- confirm against JupyterDash.
app.run_server(
    mode='external',
    port=8090,
    threaded=True,
    dev_tools_ui=True,
    dev_tools_hot_reload=True,
)

   