# Social Media Data Explorer

We can create an app that allows us to easily explore social media data.

## Prerequisites

### Imports

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, Layout, interactive_output
from IPython.display import display, HTML
from getpass import getpass
from matplotlib import pyplot as plt
import geopandas as gpd
import os
import matplotlib.colors as colors
import ipyleaflet
import matplotlib
import shapely

### Styling

In [2]:
# Inject a CSS rule that forces elements with class `container` to 80% width
# (presumably to widen the notebook page — confirm for the frontend in use).
display(HTML("<style>.container { width:80% !important; }</style>"))
# Show full cell contents instead of truncating long strings.
# `None` is the supported "no limit" value (pandas >= 1.0); the former
# sentinel -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

## Data

In [3]:
# Load the CSV export, index it by `uniqueid`, and turn the literal string '[]'
# into NaN (presumably an empty list serialised as text — confirm against the export).
df = (
    pd.read_csv('HUMAN_public_social_media.csv')
    .set_index('uniqueid')
    .replace('[]', np.nan)
)

## Definitions

### Util

#### Pagination

In [4]:
def paginate_df(df, nb_items = 10):
    """Display ``df`` one page at a time.

    If ``df`` has more than ``nb_items`` rows, a page slider, a page/row count
    label and previous/next buttons are displayed above the current page.
    Otherwise the row count and the whole frame are displayed directly.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to display.
    nb_items : int
        Number of rows per page.
    """
    def show_df(df, page):
        # Slice by position so the last (possibly shorter) page contains only
        # its own rows. `head(page*nb_items).tail(nb_items)` would always show
        # the final `nb_items` rows and thereby repeat rows of the previous page.
        first_row = (page - 1) * nb_items
        display(df.iloc[first_row:first_row + nb_items])

    def show_next(change):
        # Keep the requested page inside the slider's range explicitly.
        target = pagination_slider.value + change
        pagination_slider.value = min(max(target, pagination_slider.min), pagination_slider.max)

    def get_pagination_buttons():
        next_button = widgets.Button(layout=widgets.Layout(width='30px'), icon='chevron-right')
        next_button.on_click(lambda _: show_next(1))
        prev_button = widgets.Button(layout=widgets.Layout(width='30px'), icon='chevron-left')
        prev_button.on_click(lambda _: show_next(-1))
        return [prev_button, next_button]

    nb_rows = widgets.Label(value = '{} rows'.format(len(df)))

    if len(df) > nb_items:
        nb_pages = int(np.ceil(len(df)/nb_items))
        pagination_slider = widgets.IntSlider(value=1, min = 1, max = nb_pages, layout=Layout(width='60%'))
        pagination_slider_label = widgets.Label(value = 'of {} pages with '.format(nb_pages))

        pagination_controls = widgets.HBox([pagination_slider, pagination_slider_label, nb_rows, *get_pagination_buttons()])
        # Re-render the visible page whenever the slider value changes.
        paginated_table = interactive_output(show_df, dict(df=fixed(df), page = pagination_slider))
        display(pagination_controls, paginated_table)
    else:
        display(nb_rows, df)

#### Plotting Columns

In [5]:
def plot_column_dist(df: pd.DataFrame, column:str, nb_to_plot = 10):
    """Plot the value distribution of ``df[column]``.

    Columns classified as numeric by ``is_numeric`` are drawn with
    ``sns.distplot``. For every other column a bar chart of the
    ``nb_to_plot`` most frequent values is drawn; list/tuple/set values are
    concatenated first and text values are split on spaces into tokens.
    If the column has no non-missing values, only a 'No values to plot' title
    is shown.
    """
    def is_text(series):
        # "Text" means: the first non-missing value is a str and values
        # average more than 3 space-separated tokens.
        non_null = series.dropna()
        if type(non_null.values[0]) != str:
            return False
        token_counts = non_null.str.split(' ').apply(len)
        return token_counts.mean() > 3

    def is_numeric(series):
        # Only all-float columns without missing values and with more than
        # `nb_to_plot` distinct values count as numeric.
        only_floats = all(series.apply(type).unique() == float)
        return only_floats and len(series.unique()) > nb_to_plot and not any(series.isna())

    vals = df[column]
    if len(vals.dropna()) == 0:
        plt.title('No values to plot')
        plt.show()
        return

    if is_numeric(vals):
        sns.distplot(vals)
    else:
        first_value = vals.dropna().values[0]
        text_like = is_text(vals)
        if type(first_value) in (list, tuple, set) or text_like:
            if text_like:
                vals = vals.str.split(' ')
                vals.name = 'tokens'
            # Flatten the per-row collections into one long series before counting.
            flattened = pd.Series(np.concatenate(vals.dropna().values))
            counts = pd.DataFrame(flattened.value_counts())
        else:
            counts = pd.DataFrame(vals.value_counts())
        counts.head(nb_to_plot).plot(kind='bar', label = vals.name)

    plt.tight_layout()
    plt.grid()
    plt.title(str(nb_to_plot) + ' most frequent ' + vals.name)
    plt.show()

#### Filtering DataFrames

In [6]:
def filter_df(df, order_by = 'uniqueid', ascending = True, required = [], search_column=None, search_term=''):
    """Sort ``df`` and optionally filter its rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to sort and filter; it is not modified.
    order_by : str
        Label passed to ``DataFrame.sort_values``.
    ascending : bool
        Sort direction.
    required : sequence of str
        Columns that must be non-missing; rows with a missing value in any of
        them are dropped. The default list is never mutated.
    search_column : str or None
        Column to search in. The search is skipped when it is not a column of ``df``.
    search_term : str or None
        Case-insensitive substring to look for. ``None`` and ``''`` disable the search.

    Returns
    -------
    pd.DataFrame
        The sorted, filtered frame.
    """
    df = df.sort_values(order_by, ascending=ascending)

    if len(required) > 0:
        # `required` may arrive as a tuple (e.g. from a widget value); pass a list to pandas.
        df = df.dropna(subset=list(required))

    if search_term not in (None, '') and search_column in df.keys():
        column = df[search_column]
        # regex=False: the term is free text typed by a user, so characters such
        # as '+', '(' or '[' must be matched literally instead of raising re.error.
        matches = column.astype(str).str.lower().str.contains(search_term.lower(), regex=False, na=False)
        # Missing values would be stringified to 'nan'/'None' by astype(str) and
        # then spuriously match terms like 'n'; exclude them explicitly.
        df = df[column.notna() & matches]

    return df
    
def show_filtered_df(df, order_by = 'uniqueid', ascending = True, nb_items = 10, required = [], search_column=None, search_term=''):
    """Filter ``df`` with ``filter_df`` and display the result via ``paginate_df``.

    All arguments except ``nb_items`` are forwarded to ``filter_df``;
    ``nb_items`` is the page size handed to ``paginate_df``.
    """
    filter_options = dict(
        order_by=order_by,
        ascending=ascending,
        required=required,
        search_column=search_column,
        search_term=search_term,
    )
    paginate_df(filter_df(df, **filter_options), nb_items=nb_items)

#### Time Sliders

In [7]:
def get_time_slider(df, timestamp_column = '_timestamp'):
    """Build a date-range slider covering every day between the earliest and
    latest timestamp found in ``df[timestamp_column]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the timestamps.
    timestamp_column : str
        Column parsed with ``pd.to_datetime``.

    Returns
    -------
    widgets.SelectionRangeSlider
        Slider whose options are ``datetime.date`` objects, initially
        spanning the full range.

    Raises
    ------
    ValueError
        If the column contains no usable timestamp.
    """
    def get_date_range():
        # Missing timestamps carry no calendar date; drop them so they cannot
        # distort or break the min/max computation.
        timestamps = pd.to_datetime(df[timestamp_column]).dropna()
        if len(timestamps) == 0:
            raise ValueError("column '{}' contains no timestamps to build a date range from".format(timestamp_column))
        dates = timestamps.apply(pd.Timestamp.date)
        vmin, vmax = dates.min(), dates.max()
        return pd.Series(pd.date_range(vmin, vmax)).apply(pd.Timestamp.date)

    date_range = get_date_range()
    # The initial selection is given via `index` only; the former extra
    # `values=` keyword is not a SelectionRangeSlider argument and was dropped.
    return widgets.SelectionRangeSlider(options = date_range.values, description = 'Date Range', continuous_update=False,
                                        index = (0, len(date_range)-1),
                                        layout = Layout(width = '500px'))

### Table App

In [8]:
def table_app(df):
    """Display an interactive explorer for ``df``.

    The app consists of a tab widget with four tabs (sorting, required
    values, searching with a column-distribution plot, saving the filtered
    data as CSV) and, below it, a paginated table that reflects the current
    sort/filter/search settings.
    """
    def save_filtered_data(_):
        # Re-apply the current widget settings so the saved file matches the table shown.
        filtered_df = filter_df(df, order_by=order_by.value, ascending=ascending.value, required=filter_selector.value,
                                search_column=search_column.value, search_term=search_term.value)
        try:
            filtered_df.to_csv(save_fn.value)
        except OSError as error:
            # Report the failure in the UI instead of leaving the status label untouched.
            save_status.value = '\t  could not save {}: {}'.format(save_fn.value, error)
            return
        save_status.value = '\t  successfully saved {} rows as {}.'.format(len(filtered_df), save_fn.value)

    def plot_filtered(df, required, search_column, search_term, plot_column):
        filtered_df = filter_df(df, required = required, search_column=search_column, search_term=search_term)
        plot_column_dist(df=filtered_df, column = plot_column)

    # --- Sorting tab ---
    nb_items = widgets.Dropdown(options = [10, 20, 50], description = 'items per page',
                                layout = Layout(width = '20%'))
    order_by = widgets.Dropdown(options = sorted(df.keys()), description='order by')
    ascending = widgets.ToggleButton(value=True, description = 'ascending')
    sorting = widgets.HBox([order_by, ascending, nb_items], layout=Layout(height = '50px'))

    # --- Required-values tab ---
    filter_selector = widgets.SelectMultiple(options = sorted(df.keys()))

    filter_tip = widgets.VBox([widgets.HTML('Select multiple by dragging or ctrl + click'),
                               widgets.HTML('Deselect with ctrl + click')])
    filtering = widgets.HBox([filter_selector, filter_tip])

    # --- Save tab ---
    save_button = widgets.Button(description='save')
    save_fn = widgets.Text('filtered_data.csv')
    save_button.on_click(save_filtered_data)
    save_status = widgets.Label()
    saving = widgets.HBox([save_fn, save_button, save_status])

    # --- Search tab (with distribution plot of a selectable column) ---
    search_term = widgets.Text('', tooltip = 'Search')
    search_column = widgets.Dropdown(options = df.keys())
    plot_column = widgets.Dropdown(options = df.keys())

    column_dist = interactive_output(plot_filtered, dict(df=fixed(df), search_column = search_column, required = filter_selector,
                                                         search_term=search_term, plot_column = plot_column))
    column_plot_box = widgets.VBox([widgets.Label('Plot Columns'), plot_column, column_dist])
    search_box = widgets.VBox([widgets.Label('Search Columns'), search_column, search_term])
    searching = widgets.TwoByTwoLayout(top_left = search_box, top_right = column_plot_box)
    # One-way link: choosing a search column also selects it as the plotted column.
    widgets.dlink((search_column, 'value'), (plot_column, 'value'))

    tabs = widgets.Tab(children=[sorting, filtering, searching, saving])
    tabs.set_title(0, 'Sorting')
    tabs.set_title(1, 'Required Values')
    tabs.set_title(2, 'Searching')
    tabs.set_title(3, 'Save filtered Data')

    interactive_table = interactive_output(show_filtered_df,
                                           dict(df=fixed(df), order_by = order_by, nb_items = nb_items, required = filter_selector,
                                                ascending=ascending, search_column=search_column, search_term=search_term))
    display(widgets.VBox([tabs, interactive_table]))

### GeoVis Shapes

In [9]:
def get_nuts_shapes(shp_folder = 'nuts_data', simplify = False, tol = 1e-5):
    """Read every ``.shp`` file found one directory level below ``shp_folder``
    and concatenate them into a single frame.

    Parameters
    ----------
    shp_folder : str
        Directory whose sub-directories are scanned for shapefiles.
    simplify : bool
        If true, replace each geometry by ``geometry.simplify(tol)``.
    tol : float
        Tolerance handed to ``simplify``.

    Returns
    -------
    The concatenation of all frames returned by ``gpd.read_file``.
    """
    def get_fns(directory, condition = lambda x: True):
        # Paths of the directory entries for which `condition` is truthy.
        candidates = [directory + '/' + fn for fn in os.listdir(directory)]
        return [path for path in candidates if condition(path)]

    sub_folders = get_fns(shp_folder, os.path.isdir)
    shapefiles = np.hstack([get_fns(folder, lambda f: f.endswith('.shp')) for folder in sub_folders])
    geo_df = pd.concat([gpd.read_file(path) for path in shapefiles])
    if simplify:
        geo_df.geometry = geo_df.geometry.simplify(tol)
    return geo_df

def get_shapes_heatmap(data, nuts_ids_column, color_column, m, logarithmic:bool = False, cmap='viridis',
                      use_cholopleth = False, title_columns = ['NUTS_NAME', 'num_persons']):
    """Build a map layer that colours the shapes in ``data`` by ``color_column``.

    Parameters
    ----------
    data
        Table with a ``geometry`` column, ``color_column`` and the
        ``title_columns``; it is wrapped in a ``gpd.GeoDataFrame``.
    nuts_ids_column : str
        Used as the name of the returned layer.
    color_column : str
        Column whose values determine the colours.
    m : ipyleaflet.Map
        Map that receives a popup when a shape is clicked.
    logarithmic : bool
        Use a logarithmic instead of a linear colour normalisation.
    cmap : str
        Matplotlib colormap name.
    use_cholopleth : bool
        If true, return a single ``ipyleaflet.Choropleth`` instead of one
        ``GeoData`` layer per shape.
    title_columns : list of str
        Columns whose values are shown (one per line) in the click popup.

    Returns
    -------
    ipyleaflet.LayerGroup or ipyleaflet.Choropleth
    """
    import json

    def get_layer(shapes: gpd.GeoDataFrame, color):
        def get_event_handler(location, text):
            def layer_event_handler(event, **kwargs):
                if event == 'click':
                    m.add_layer(ipyleaflet.Popup(location = location, child = widgets.HTML(text)))
            return layer_event_handler

        style={'color': color, 'fillColor': color, 'opacity':0.5, 'weight':1.9, 'dashArray':'2', 'fillOpacity':0.2}
        hover_style={'fillColor': 'blue' , 'fillOpacity': 0.2}
        layer = ipyleaflet.GeoData(geo_dataframe = shapes, style=style, hover_style = hover_style)
        text = '<br>'.join([str(shapes[k].values[0]) for k in title_columns])

        # Popup anchor: mean centroid, ordered (y, x) as passed to the popup's `location`.
        location = np.mean([shapes.geometry.centroid.y.values, shapes.geometry.centroid.x.values], 1)
        layer.on_click(get_event_handler(list(location), text))
        return layer

    def get_layer_group(shapes: gpd.GeoDataFrame, colors, group_name = ''):
        # One single-row layer per shape so that every shape gets its own colour and popup.
        layers = [get_layer(shapes.iloc[[i]], color=color) for i, color in enumerate(colors)]
        return ipyleaflet.LayerGroup(layers=(layers), name=group_name)

    def get_colors(values:pd.Series, logarithmic:bool = False, cmap = 'viridis'):
        # Small offset keeps zero values strictly positive for the logarithmic norm.
        values = values + 1e-5
        norm_class = matplotlib.colors.LogNorm if logarithmic else matplotlib.colors.Normalize
        norm = norm_class(vmin=values.min(), vmax=values.max())
        # plt.get_cmap (also used by plot_cbar) instead of matplotlib.cm.get_cmap,
        # which has been removed from recent Matplotlib releases.
        cm = plt.get_cmap(cmap)
        return values.apply(norm).apply(cm).apply(matplotlib.colors.to_hex)

    def to_choropleth(df: gpd.GeoDataFrame, choro_column, geometry_column = 'geometry'):
        def to_geojson(vals: gpd.GeoSeries):
            # Parse the JSON text with a JSON parser. eval() would execute the
            # text as Python and fail on JSON literals such as null/true/false.
            return json.loads(vals.to_json())

        return ipyleaflet.Choropleth(geo_data = to_geojson(df[geometry_column]), choro_data = to_geojson(df[choro_column]),
                                    name=nuts_ids_column)

    if use_cholopleth:
        return to_choropleth(gpd.GeoDataFrame(data), choro_column = color_column, geometry_column='geometry')
    else:
        colors = get_colors(data[color_column], logarithmic=logarithmic, cmap=cmap)
        return get_layer_group(gpd.GeoDataFrame(data), colors=colors, group_name=nuts_ids_column)
    

def merge_df(data, nuts_shapes, nuts_ids_column, color_column):
    """Sum the numeric columns of ``data`` per region and attach the region shapes.

    Parameters
    ----------
    data : pd.DataFrame
        Rows to aggregate; grouped by ``nuts_ids_column``.
    nuts_shapes : pd.DataFrame
        Table with a ``NUTS_ID`` column that is matched against the group keys.
    nuts_ids_column : str
        Column of ``data`` holding the region ids.
    color_column : str
        Column that must be present and non-missing in the result.

    Returns
    -------
    pd.DataFrame
        Inner join of the per-region sums with ``nuts_shapes``, without rows
        lacking ``NUTS_ID`` or ``color_column``.
    """
    # numeric_only=True: text columns cannot be meaningfully summed; without it
    # recent pandas versions concatenate strings or raise on mixed str/NaN columns.
    agg_data = data.groupby(nuts_ids_column).sum(numeric_only=True)
    merged_df = pd.merge(agg_data, nuts_shapes, left_on = nuts_ids_column, right_on = 'NUTS_ID')
    return merged_df.dropna(subset = ['NUTS_ID', color_column])
    

def plot_cbar(name, vmin = 0, vmax=1, logarithmic = False, n = 100):
    """Draw a tall, narrow vertical colorbar for the colormap ``name``.

    ``vmin``/``vmax`` bound the normalisation, which is logarithmic when
    ``logarithmic`` is true and linear otherwise. ``n`` is accepted but not
    used. Returns the created colorbar object.
    """
    _, ax = plt.subplots(figsize = (.3,14))
    if logarithmic:
        norm = matplotlib.colors.LogNorm(vmin, vmax)
    else:
        norm = matplotlib.colors.Normalize(vmin, vmax)
    return matplotlib.colorbar.ColorbarBase(ax, cmap=plt.get_cmap(name), norm=norm, orientation='vertical')
    
def plot_geo_data_shapes(data, nuts_shapes, date_range, nuts_ids_columns = ['origin', 'destination'], color_column = 'num_persons',
                 logarithmic = False, cmap='viridis', levels = [], timestamp_column = '_timestamp'):
    """Display a map with one heatmap layer per entry of ``nuts_ids_columns``.

    ``data`` is first restricted to rows whose date (from ``timestamp_column``)
    lies within ``date_range`` (inclusive on both ends). For each id column the
    rows are aggregated and joined with ``nuts_shapes`` via ``merge_df`` and
    turned into a layer via ``get_shapes_heatmap``. ``levels`` is accepted but
    not used by this function.
    """
    def date_filter(data, date_range):
        dates = pd.to_datetime(data[timestamp_column]).apply(pd.Timestamp.date)
        not_too_early = date_range[0] <= dates
        not_too_late = dates <= date_range[1]
        return data[not_too_early & not_too_late]

    def get_geo_data(data, nuts_ids_column, m):
        merged_df = merge_df(data=data, nuts_shapes = nuts_shapes, nuts_ids_column=nuts_ids_column, color_column=color_column)
        heatmap_options = dict(nuts_ids_column=nuts_ids_column, color_column=color_column,
                               logarithmic=logarithmic, cmap=cmap, m=m)
        return get_shapes_heatmap(merged_df, **heatmap_options)

    data = date_filter(data, date_range)

    m = ipyleaflet.Map(center=(51, 10), zoom=4)
    m.layout.height = '800px'
    for nuts_ids_column in nuts_ids_columns:
        m.add_layer(get_geo_data(data, nuts_ids_column, m))
    for control in (ipyleaflet.LayersControl(), ipyleaflet.FullScreenControl()):
        m.add_control(control)
    display(m)

In [10]:
def geo_vis_shapes_app(data):
    """Display the shape-based geo visualisation for ``data``.

    Shows controls (NUTS level dropdown, colormap, logarithmic checkbox, date
    range slider) above a map rendered by ``plot_geo_data_shapes`` with a
    colorbar next to it. Shapes are loaded with ``get_nuts_shapes``.
    """
    def get_cbar(name, logarithmic):
        # Collect the colour values of *every* id column. The former loop
        # always merged on 'origin', so 'destination' never influenced the range.
        # Concatenating also avoids np.min/np.max over Series of different lengths.
        color_vals = pd.concat([merge_df(data, nuts_shapes, nuts_ids_column, color_column)[color_column]
                                for nuts_ids_column in nuts_ids_columns])
        # Lower bound of at least 1 keeps the logarithmic norm valid.
        vmin, vmax = max(color_vals.min(), 1), color_vals.max()
        plot_cbar(name, vmin, vmax, logarithmic=logarithmic)

    nuts_ids_columns = ['origin', 'destination']
    color_column = 'num_persons'
    nuts_shapes = get_nuts_shapes(simplify=True, tol = 1e-2)
    avail_levels = sorted(nuts_shapes['LEVL_CODE'].unique())

    # First option selects all levels, the remaining options one level each.
    levels = widgets.Dropdown(options = [avail_levels, *[[l] for l in avail_levels]], description = 'NUTS levels')
    cmap = widgets.Dropdown(options = ['viridis', 'inferno', 'magma', 'winter', 'cool'], description = 'colormap')
    logarithmic = widgets.Checkbox(description='logarithmic')
    time_slider = get_time_slider(data)
    controls = widgets.VBox([widgets.HBox([levels, cmap, logarithmic]), time_slider])
    cbar = interactive_output(get_cbar, dict(name=cmap, logarithmic=logarithmic))

    geo_vis = interactive_output(plot_geo_data_shapes, dict(nuts_shapes = fixed(nuts_shapes), data = fixed(data), logarithmic=logarithmic,
                                                     levels=levels, cmap=cmap, nuts_ids_columns=fixed(nuts_ids_columns),
                                                    date_range=time_slider))

    geo_vis.layout.width = '90%'
    geo_vis_box = widgets.HBox([geo_vis, cbar])
    display(controls, geo_vis_box)

### GeoVis Cluster

In [11]:
def get_marker_cluster(data, geom_column, title_columns = ['text_translated', '_timestamp']):
    """Create a marker cluster with one marker per row of ``data`` that has a geometry.

    Parameters
    ----------
    data : pd.DataFrame
        Rows to show; rows with a missing ``geom_column`` are skipped.
    geom_column : str
        Column holding point geometries as hex-encoded WKB strings.
    title_columns : sequence of str
        Columns whose non-missing values form the marker title, one per line.

    Returns
    -------
    ipyleaflet.MarkerCluster
    """
    # Import the submodule explicitly instead of relying on it having been
    # loaded as a side effect of another import.
    from shapely import wkb

    def wkb_hex_to_point(s):
        # Reverse the first coordinate tuple so it is passed as (y, x) to the marker.
        return list(wkb.loads(s, hex=True).coords)[0][::-1]

    def is_missing(value):
        # `value in (np.nan, None)` misses NaN objects that are not the np.nan
        # singleton; pd.isna detects every NaN/None/NaT scalar.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def get_title(d):
        return '\n'.join([str(d[c]) for c in title_columns if not is_missing(d[c])])

    data = data.dropna(subset=[geom_column])
    locs = data[geom_column].apply(wkb_hex_to_point)
    # 'records' is the valid spelling; the abbreviation-style 'rows' is rejected by pandas 2.
    dicts = data.to_dict(orient='records')

    markers = [ipyleaflet.Marker(location = loc, title=get_title(d), draggable=False) for loc, d in zip(locs, dicts)]
    return ipyleaflet.MarkerCluster(markers = markers)


def plot_geo_data_cluster(data, geom_column, timestamp_column, date_range, title_columns):
    """Display a map with clustered markers for the rows of ``data`` whose date
    (from ``timestamp_column``) lies within ``date_range`` (inclusive).

    Markers are built by ``get_marker_cluster`` from ``geom_column`` and
    labelled with ``title_columns``.
    """
    def date_filter(data, date_range):
        dates = pd.to_datetime(data[timestamp_column]).apply(pd.Timestamp.date)
        not_too_early = date_range[0] <= dates
        not_too_late = dates <= date_range[1]
        return data[not_too_early & not_too_late]

    visible_rows = date_filter(data, date_range)

    m = ipyleaflet.Map(center=(51, 10), zoom=4)
    m.layout.height = '800px'
    cluster_layer = get_marker_cluster(visible_rows, geom_column, title_columns=title_columns)
    m.add_layer(cluster_layer)
    m.add_control(ipyleaflet.FullScreenControl())
    display(m)

In [12]:
def geo_vis_cluster_app(data, timestamp_column = '_timestamp', geom_column = 'geom_tweet'):
    """Display the marker-cluster geo visualisation for ``data``.

    Shows a tab widget (date range slider, selection of the columns shown in
    the marker titles) above a map rendered by ``plot_geo_data_cluster``.

    Parameters
    ----------
    data : pd.DataFrame
        Rows to visualise.
    timestamp_column : str
        Column used for the date range slider and the date filter.
    geom_column : str
        Column holding the marker geometries.
    """
    # Forward the timestamp column so the slider range matches the column
    # that plot_geo_data_cluster filters on (previously the default was always used).
    time_slider = get_time_slider(data, timestamp_column=timestamp_column)

    # Pre-select only columns that actually exist; selecting a value that is
    # not among the options would be rejected by the widget.
    default_titles = [c for c in ('text_translated', timestamp_column) if c in data.columns]
    title_columns = widgets.SelectMultiple(options = sorted(data.columns), description = 'Information to show',
                                         value = default_titles)
    title_columns_tip = widgets.HTML('Select multiple by dragging or ctrl + click <br> Deselect with ctrl + click')
    title_columns_controls = widgets.HBox([title_columns, title_columns_tip])

    geo_vis = interactive_output(plot_geo_data_cluster, dict(data = fixed(data), date_range=time_slider, geom_column = fixed(geom_column),
                                                    timestamp_column=fixed(timestamp_column), title_columns=title_columns))

    geo_vis.layout.width = '90%'
    controls = widgets.Tab([time_slider, title_columns_controls])
    controls.set_title(0, 'Date Range')
    controls.set_title(1, 'Information to Show')
    display(controls, geo_vis)

## Table App

App for exploring raw data.

In [13]:
# Launch the interactive table explorer on the loaded DataFrame.
table_app(df)

VBox(children=(Tab(children=(HBox(children=(Dropdown(description='order by', options=('_timestamp', 'confidenc…

## GeoVis App Shapes

Origin and destination are shown as two separate layers. Aggregation via NUTS levels is not implemented yet.

In [14]:
# Launch the shape-based map app on the loaded DataFrame.
geo_vis_shapes_app(df)

VBox(children=(HBox(children=(Dropdown(description='NUTS levels', options=([0, 1, 2, 3], [0], [1], [2], [3]), …

HBox(children=(Output(layout=Layout(width='90%')), Output()))

## GeoVis App Clusters

Individual tweets are shown as clustered markers; tooltips show information about each tweet (customizable below).

In [15]:
# Launch the marker-cluster map app on the loaded DataFrame.
geo_vis_cluster_app(df)

Tab(children=(SelectionRangeSlider(continuous_update=False, description='Date Range', index=(0, 244), layout=L…

Output(layout=Layout(width='90%'))