# References

- [`ipywidgets` List](https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html)

- [Add custom interactivity with Jupyter Widgets](https://plotly.com/python/chart-events/)
    - Plotly FigureWidget Overview
    - Jupyter Lab with FigureWidget
    - Interactive data analysis with FigureWidget ipywidgets
    - Click events

In [2]:
# Upper bound on the number of posts handled at once: the initial view samples
# this many posts, and larger query results are randomly down-sampled to it.
MAX_POSTS = 10000

# Imports

In [3]:
from datetime import datetime
import io
import math
import re

import numpy as np
np.set_printoptions(precision=2)

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
pd.options.display.max_rows = 6
pd.set_option('display.precision', 2)

from ipywidgets import widgets
from IPython.display import display, clear_output, HTML, Markdown
def markdown(md):
    """Render the Markdown source ``md`` as rich output through ``display``."""
    rendered = Markdown(md)
    display(rendered)

import plotly.express as px
import plotly.graph_objects as go

import plotly.io as pio
pio.templates.default = 'plotly_white'

# Load Data

In [4]:
# Load the microblog posts into `Posts`, preferring a local copy of the CSV and
# falling back to the shared Google Drive file when the local copy is missing.
try:
    # The result must be bound to `Posts`; discarding it left `Posts` undefined
    # whenever the local file existed.
    # NOTE(review): assumes the local file is the same export as the download,
    # so the same parsing options are applied to both - confirm.
    Posts = pd.read_csv('data/vast challenge 2011/Microblogs.csv', encoding='latin', on_bad_lines='skip')
except FileNotFoundError:
    VASTOPOLIS_URL = 'https://drive.google.com/file/d/1Y5xWPRKk8D_TNYmZ0ny4cQ83shGM5YqI/view?usp=sharing'
    # Turn the sharing link into a direct-download link by reusing its file id
    # (the second-to-last path segment).
    VASTOPOLIS_URL = 'https://drive.usercontent.google.com/download?id={}&export=download&authuser=0&confirm=t'.format(VASTOPOLIS_URL.split('/')[-2])

    Posts = pd.read_csv(VASTOPOLIS_URL, encoding='latin', on_bad_lines='skip')

# Parse the creation time (unparseable values become NaT), discard incomplete
# rows, then order the posts chronologically (ties broken by poster ID) and
# index them by creation time.
Posts['Created_at'] = pd.to_datetime(Posts['Created_at'], errors='coerce', format='%m/%d/%Y %H:%M')
Posts.dropna(inplace=True)
Posts.sort_values(['Created_at', 'ID'], inplace=True)
Posts.index = Posts['Created_at']

# Split the "<latitude> <longitude>" Location string into two float columns,
# flip the sign of the longitude, and drop the original column.
Posts[['Latitude', 'Longitude']] = Posts['Location'].str.split(' ', n=1, expand=True).astype('float')
Posts['Longitude'] *= -1
del Posts['Location']

# Create a grid of cells and assign each message to a cell.
# The bounding box is taken from the extreme coordinates of the posts.
Bounds = dict(east=Posts.Longitude.max(), west=Posts.Longitude.min(), north=Posts.Latitude.max(), south=Posts.Latitude.min())

# N_C columns across; the number of rows follows from the aspect ratio of the
# bounding box.
N_C = 12
N_R = int(N_C * (Bounds['north'] - Bounds['south']) / (Bounds['east'] - Bounds['west']))

# Row labels are the strings '0', '1', ...; column labels are the letters 'A', 'B', ...
# A post lying exactly on the north/east bound scales to index N_R/N_C, one
# past the last cell, so the indices are clipped into the last real cell.
Posts['R'] = ((Posts.Latitude - Bounds['south']) / (Bounds['north'] - Bounds['south']) * N_R).astype('int').clip(upper=N_R - 1).astype('str')
Posts['C'] = ((Posts.Longitude - Bounds['west']) / (Bounds['east'] - Bounds['west']) * N_C).astype('int').clip(upper=N_C - 1).map(lambda x: chr(ord('A') + x))

# Symptoms

# Word stems used to flag posts that mention a symptom.
Symptoms = ['fever', 'chill', 'sweat', 'ache', 'fatigue', 'breath', 'pain', 'cough', 'vomit', 'nausea']

# One boolean column per stem: True where the post text contains it.
# NOTE(review): this is a case-sensitive substring test, so "Fever" is not
# flagged and "pain" also hits "paint" - confirm this is intended.
for symptom in Symptoms:
    Posts[symptom] = Posts['text'].str.contains(symptom)

# `symptoms` is True for posts that mention at least one stem.
Posts['symptoms'] = Posts[Symptoms].any(axis=1)

# `symptom` holds a single label per post ('none' when nothing matched); when
# several stems match, the one listed last in `Symptoms` wins.
Posts['symptom'] = 'none'
for symptom in Symptoms:
    Posts.loc[Posts[symptom], 'symptom'] = symptom

# Functions

## `query_posts`

In [5]:
def _area_mask(posts, area):
    """Return a boolean row mask (NumPy array) for the grid area ``area``.

    ``area`` is either a single cell such as ``'a3'`` or a half-open range such
    as ``'a3:b5'`` (columns A <= C < B, rows 3 <= R < 5). Returns ``None`` when
    ``area`` is not a string or does not start with a recognisable area.
    """
    if not isinstance(area, str):
        return None
    spec = area.strip().upper()
    span = re.match(r'([A-M])(\d+):([A-M])(\d+)', spec)
    cell = span or re.match(r'([A-M])(\d+)', spec)
    if cell is None:
        return None

    # Row labels are stored as strings; compare them numerically so that, for
    # example, row "10" does not sort before row "2".
    rows = posts.R.astype(int)
    if span:
        c1, r1, c2, r2 = span.groups()
        mask = (rows >= int(r1)) & (rows < int(r2)) & (posts.C >= c1) & (posts.C < c2)
    else:
        c1, r1 = cell.groups()
        mask = (rows == int(r1)) & (posts.C == c1)
    return mask.to_numpy()


def query_posts(posts=None, start_date=None, end_date=None, area=None, symptoms=None, keywords=None, people=None):
    """Return a copy of the posts that satisfy every given filter.

    Parameters
    ----------
    posts : DataFrame, optional
        Posts to filter; defaults to the module-level ``Posts``, looked up at
        call time so a reloaded ``Posts`` is picked up.
    start_date, end_date : optional
        Bounds used to slice the time index with ``.loc``. With only
        ``start_date`` the selection is ``posts.loc[start_date]``; with only
        ``end_date`` everything up to it is kept.
    area : str, optional
        Grid cell (``'a3'``) or half-open cell range (``'a3:b5'``); anything
        else is ignored.
    symptoms : sequence of str, optional
        Names of boolean symptom columns; a post is kept when any is True.
    keywords : list or tuple of str, optional
        Literal strings; a post is kept when its text contains any of them.
        Empty strings are ignored.
    people : iterable, optional
        Poster IDs to keep.
    """
    if posts is None:
        posts = Posts

    if start_date is not None:
        posts = posts.loc[start_date:end_date] if end_date is not None else posts.loc[start_date]
    elif end_date is not None:
        posts = posts.loc[:end_date]

    area_mask = _area_mask(posts, area)
    if area_mask is not None:
        posts = posts[area_mask]

    if symptoms:
        posts = posts[posts[list(symptoms)].any(axis='columns').to_numpy()]

    if isinstance(keywords, (list, tuple)):
        # An empty alternative would make the pattern match every post.
        terms = [kw for kw in keywords if kw]
        if terms:
            pattern = re.compile('|'.join(map(re.escape, terms)))
            posts = posts[posts.text.str.contains(pattern, na=False).to_numpy()]

    if people is not None:
        posts = posts[posts.ID.isin(people).to_numpy()]

    return posts.copy()

## `label_posts_with_keywords`

In [6]:
def label_posts_with_keywords(posts, keywords):
    """Add a ``keyword`` column naming a keyword found in each post's text.

    ``posts`` is modified in place and also returned. Posts that contain none
    of the keywords get ``''``; when several keywords occur in a post, the one
    listed last in ``keywords`` wins.

    Keywords are matched as literal substrings, consistent with the escaping
    that ``query_posts`` applies, so input such as ``'c++'`` is not read as a
    regular expression. Empty keywords are skipped: they occur in every text
    and would otherwise wipe the labels assigned so far.
    """
    posts['keyword'] = ''
    for kwd in keywords:
        if not kwd:
            continue
        hits = posts.text.str.contains(kwd, regex=False, na=False).to_numpy()
        posts.loc[hits, 'keyword'] = kwd
    return posts

# TF-IDF

In [7]:
# Fit the TF-IDF vocabulary on the text of every post.
Vectorizer = TfidfVectorizer(stop_words='english', min_df=10, max_df=0.5)
Vectorizer.fit(Posts['text'].values)
Words = Vectorizer.get_feature_names_out()    # The words in the vocabulary


def tfidf(documents):
    """Return the TF-IDF matrix of ``documents`` under the fitted ``Vectorizer``."""
    weights = Vectorizer.transform(documents)
    return weights


# Baseline: all posts up to 2011-05-15 joined into one document, used as the
# reference that later selections are compared against.
CorpusNormalDays = ' '.join(Posts.loc[:'2011-05-15', 'text'].values)
normal_tfidf = tfidf([CorpusNormalDays])

## `topics`

In [8]:
def topics(posts, title=None, format='txt'):
    """Summarise the words that stand out in ``posts`` against the baseline.

    The texts of ``posts`` are joined into one document; a word is notable when
    its TF-IDF weight exceeds its weight in ``normal_tfidf`` by more than 0.01.
    Notable words are ordered by descending weight.

    With ``format='html'`` the result is a ``<blockquote>`` of words whose font
    size grows with their weight, preceded by an ``<h3>`` heading when
    ``title`` is given. Otherwise the result is the space-separated words,
    cut to at most 200 characters.
    """
    weights = tfidf([' '.join(posts.text.values)])

    ranked = []
    for col in weights.nonzero()[1]:
        weight = weights[0, col]
        if weight - normal_tfidf[0, col] > 0.01:
            ranked.append((weight, Words[col]))
    ranked.sort(reverse=True)

    if format != 'html':
        return ' '.join(word for _, word in ranked)[:200]

    heading = f'<h3>{title} ({len(posts)} messages)</h3>\n\n' if title else ''
    spans = []
    for weight, word in ranked:
        size = max(int(math.log(weight * 200) * 6), 10)
        spans.append(f'<span style="font-size: {size}px">{word}</span>')
    return heading + '<blockquote>' + ' '.join(spans) + '</blockquote>'

## `sample_posts`

In [9]:
def sample_posts(posts, N=15):
    """Return an HTML ``<ul>`` listing up to ``N`` posts as "<index>: <text>".

    When ``posts`` has more than ``N`` rows, ``N`` of them are drawn at random
    and shown in index order. The post text is HTML-escaped, because it is
    free-form input and would otherwise be interpreted as markup by the widget
    that renders the result.
    """
    from html import escape

    if len(posts) > N:
        posts = posts.sample(N).sort_index()
    items = [f'<li>{when}: {escape(str(text))}</li>'
             for when, text in zip(posts.index, posts.text)]
    return '<ul>' + '\n\n'.join(items) + '</ul>'

# Visualization

## `create_map`

In [10]:
# Figure size in pixels: the height follows from the aspect ratio of the
# bounding box of the posts.
WIDTH = 1600  # Adjust the width of the map to fit your display size
HEIGHT = int(WIDTH * (Bounds['north'] - Bounds['south']) / (Bounds['east'] - Bounds['west']))
IMAGE_URL = 'https://wakita.github.io/smartnova/assets/images/Vastopolis_Map.png'

# Axis ticks sit on the grid lines: N_C + 1 letters along x and N_R + 1
# numbers along y, evenly spaced between the bounds.
MapTicktextX = list(map(chr, range(ord('A'), ord('A') + N_C + 1)))
MapTicktextY = [*range(N_R + 1)]
MapTickvalsX = np.linspace(Bounds['west'], Bounds['east'], num=N_C + 1).tolist()
MapTickvalsY = np.linspace(Bounds['south'], Bounds['north'], num=N_R + 1).tolist()

def create_map(posts, **extraargs):
    """Return a scatter plot of ``posts`` drawn over the map image.

    Points are placed by ``Longitude``/``Latitude``; hovering shows the poster
    ID, creation time, grid cell and text. ``extraargs`` are forwarded to
    ``px.scatter`` (for example ``color='symptom'``). The axes span the
    bounding box of the data and are labelled with the grid letters/numbers.
    """
    west, east = Bounds['west'], Bounds['east']
    south, north = Bounds['south'], Bounds['north']

    fig = px.scatter(
        posts,
        x='Longitude', y='Latitude',
        width=WIDTH, height=HEIGHT,
        hover_name='ID', hover_data=['Created_at', 'R', 'C', 'text'],
        **extraargs)
    fig.update_traces(marker={'size': 4})

    # The image is anchored at the north-west corner and stretched over the
    # whole bounding box, underneath the points.
    background = dict(
        source=IMAGE_URL,
        xref='x', yref='y', x=west, y=north,
        sizex=east - west, sizey=north - south, sizing='stretch',
        layer='below', opacity=0.5)
    fig.add_layout_image(background)

    fig.update_xaxes(range=[west, east],
                     tickmode='array', ticktext=MapTicktextX, tickvals=MapTickvalsX)
    fig.update_yaxes(scaleanchor='x', scaleratio=1,
                     range=[south, north],
                     tickmode='array', ticktext=MapTicktextY, tickvals=MapTickvalsY)

    return fig

## `trend`

In [11]:
def trend(posts):
    """Return a bar chart of the number of posts per hour of day.

    Each bar carries, as custom data shown on hover, the notable topic words
    (``topics`` in its default text format) of exactly the posts counted in
    that bar.

    The topics were previously taken from the window ``hr-1:00``..``hr:00`` of
    the first day only, which did not match the hour being counted, produced
    an invalid "-1:00" time for hour 0, and required ``posts`` to be
    non-empty. Grouping by hour removes all three problems.
    """
    hours = posts.Created_at.dt.hour
    counts = hours.value_counts().sort_index()

    # Positional (NumPy) masks avoid label alignment on the time index.
    hourly_topics = [topics(posts[(hours == hr).to_numpy()]) for hr in counts.index]
    Trend = pd.DataFrame(dict(hour=counts.index, count=counts.values, topic=hourly_topics))

    # `labels` is keyed by column name, so the keys must be 'hour'/'count'.
    fig = px.bar(Trend, x='hour', y='count', labels={'hour': 'Hour', 'count': '#Posts'}, custom_data=['topic'])
    fig.update_traces(hovertemplate='%{customdata[0]}')
    return fig

# User Interface

In [12]:
# Selector widgets. The date slider offers every distinct day (formatted
# month/day) found in the posts and starts with the full range selected.
Dates = Posts.Created_at.dt.strftime('%m/%d').unique()
DatesSelector = widgets.SelectionRangeSlider(options=Dates, index=(0, len(Dates)-1), continuous_update=False, description='Date:', orientation='horizontal')
SymptomsSelector = widgets.SelectMultiple(options=Symptoms, value=[], description='Symptoms:', disabled=False)
AreaSelector = widgets.Text(value='', continuous_update=False, description='Area:', disabled=False, placeholder='a3 or a3:b5')
KeywordsInput = widgets.Text(value='', continuous_update=False, description='Keywords:', disabled=False)
# NOTE(review): PostersInput is created here but is not added to `dashboard` below.
PostersInput = widgets.Checkbox(value=False, description='Posters', disabled=False, tooltip='Focus on the posters of the saved queries')

# Initial view: a random sample of at most MAX_POSTS posts. Capping the sample
# size at len(Posts) keeps `sample` from raising ValueError when fewer posts
# than MAX_POSTS were loaded.
InitialPosts = Posts.sample(min(MAX_POSTS, len(Posts))).sort_index()

g_map = go.FigureWidget(create_map(InitialPosts, color='symptom'))

debug_output  = widgets.Output(layout={'border': '1px solid black'})

topics_output = widgets.HTML(layout={'border': '1px solid black'})

g_trend = go.FigureWidget(trend(InitialPosts))

Hours = [9, 17]
post_texts_header = '<h3>Sampled texts for the posts on the first day of the selection</h3><hr>'
post_texts = widgets.HTML(value=post_texts_header, layout={'border': '1px solid black'})
#HoursSelector = widgets.IntRangeSlider(value=Hours, min=0, max=23, step=1, description='Hour:', orientation='horizontal')

# Stack the controls and the views vertically; the debug pane comes last.
dashboard = widgets.VBox([DatesSelector, AreaSelector, SymptomsSelector, KeywordsInput, g_map, topics_output, g_trend, post_texts, debug_output])
with debug_output:
    print('Debug output:\n')

## Actions

In [13]:
# Current filter settings, passed to `query_posts` as keyword arguments.
query = { 'start_date': '2011-04-30', 'end_date': '2011-05-20', 'area': '', 'symptoms': [], 'keywords': [] }

def update_query(change):
    """Widget callback: fold the changed widget into ``query`` and refresh the views.

    ``change.owner`` identifies the widget that changed; an owner that is none
    of the selectors just triggers a refresh. The refresh runs inside
    ``debug_output``, which also shows the current query.
    """
    if change.owner == DatesSelector:
        first_day, last_day = DatesSelector.value
        # The slider options carry only month/day; the year is fixed to 2011.
        query['start_date'] = datetime.strptime('2011/' + first_day, '%Y/%m/%d').strftime('%Y-%m-%d')
        query['end_date']   = datetime.strptime('2011/' + last_day, '%Y/%m/%d').strftime('%Y-%m-%d')

    if change.owner == AreaSelector:     query['area']     = AreaSelector.value
    if change.owner == SymptomsSelector: query['symptoms'] = list(SymptomsSelector.value)
    # split() without an argument drops the empty strings that split(' ')
    # yields for blank input or repeated spaces.
    if change.owner == KeywordsInput:    query['keywords'] = KeywordsInput.value.split()

    with debug_output:
        clear_output()
        display(query)
        posts = query_posts(**query)
        if len(posts) > MAX_POSTS: posts = posts.sample(MAX_POSTS).sort_index()

        topics_output.value = topics(posts, title='Topic keywords of the selection', format='html')
        post_texts.value = post_texts_header + sample_posts(posts)

        n_map = create_map(posts, color='symptom')
        n_trend = trend(posts)

        # Traces are matched by name (the symptom label), not by position: the
        # order of the traces depends on which label occurs first in the data,
        # so it can differ between the displayed figure and the new one.
        fresh = {trace.name: trace for trace in n_map.data}
        map_keys = ('x', 'y', 'hovertext', 'customdata')
        with g_map.batch_update():
            for shown in g_map.data:
                source = fresh.pop(shown.name, None)
                for k in map_keys:
                    shown[k] = source[k] if source is not None else []

        # Labels that had no trace in the displayed figure get a new trace.
        # NOTE(review): such traces keep the colour assigned in `n_map`, which
        # may repeat a colour already used by an existing trace.
        added = [trace for trace in fresh.values() if trace.x is not None and len(trace.x) > 0]
        if added:
            g_map.add_traces(added)

        bars = n_trend.data[0] if n_trend.data else None
        with g_trend.batch_update():
            for k in ('x', 'y', 'customdata'):
                g_trend.data[0][k] = bars[k] if bars is not None else []

# Refresh the dashboard whenever one of the selector widgets changes its value.
for selector in (DatesSelector, AreaSelector, SymptomsSelector, KeywordsInput):
    selector.observe(update_query, names='value')

# def update_hours(change):
#     if change.owner == HoursSelector:
#         Hours = HoursSelector.value

# HoursSelector.observe(update_query, names='value')

# Run one refresh up front with a change that has no owner, so the views
# reflect the initial `query`.
from traitlets.utils.bunch import Bunch
update_query(Bunch(owner=None))

In [14]:
# Show the assembled dashboard as the output of this cell.
dashboard

VBox(children=(SelectionRangeSlider(continuous_update=False, description='Date:', index=(0, 20), options=('04/…

In [18]:
# Inspect the traces currently held by the map widget.
g_map.data

(Scattergl({
     'customdata': array([[Timestamp('2011-05-17 01:38:00'), '0', 'A',
                           'I wish Marisa Miller was my girlfriend.'],
                          [Timestamp('2011-05-17 01:51:00'), '0', 'A',
                           'Friendships Day is just one of those days where people choose to express what they feel about their friends throughout the year.......Ch'],
                          [Timestamp('2011-05-17 03:12:00'), '0', 'A', "I'm full smoke"],
                          ...,
                          [Timestamp('2011-05-17 20:58:00'), '0', 'A',
                           'My friend felt  the ground shake hope noone got hurt'],
                          [Timestamp('2011-05-17 22:33:00'), '0', 'A',
                           'The best enemy is a friend.'],
                          [Timestamp('2011-05-17 23:46:00'), '0', 'A',
                           "Margaret's boyfriend is makig her a samitch. Whiiiiiiiipppppeeeeeedd."]],
                         dt