# Interactive Graphs and Dashboards

This notebook will give you a short introduction on how to interact with plotly. Plotly is a python package that can be used to generate interactive graphs, which then can be embedded on a website, or you can create dashboards for your projects. This can either be done offline in the notebook or displayed on a personal page on plotly. 

If you want to disply your dahsboard publicly you can sign up with plotly https://plot.ly/. From your account settings you can retrieve an API key and then plot content of the notebook directly to your plotly site and use the grpahs to create a dashboard. 

In [1]:
import sqlalchemy
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
import math

import plotly.plotly as py
## Need this to connect to plotly (username and API key, important: keep your key private!)
py.sign_in('insert your username', 'insert your API key')  
import plotly.graph_objs as go

In [2]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True) 

In [3]:
# create engine
engine = create_engine('postgresql://stuffed/cusp')

# create query 
QUERY = '''
SELECT project_id, organization_state, organization_city, organization_country, fy, fy_total_cost, contact_pi_project_leader, other_pis
FROM wagner_ada_2018.projects 
WHERE (organization_country = 'UNITED STATES') 
'''

# save query in dataframe
prj=pd.read_sql_query(QUERY,con=engine)

# Let's look at the dataframe
prj.head()

Unnamed: 0,project_id,organization_state,organization_city,organization_country,fy,fy_total_cost,contact_pi_project_leader,other_pis
0,104368,NJ,PRINCETON,UNITED STATES,2009,350000.0,"CELIA, MICHAEL A",
1,104369,AZ,TUCSON,UNITED STATES,2009,29895.0,"TIEP, PHAM HUU",
2,104370,CA,STANFORD,UNITED STATES,2009,112840.0,"MIRZAKHANI, MARYAM",
3,104371,TX,ELPASO,UNITED STATES,2009,143815.0,"JOHNSON, JERRY","TWEEDIE, CRAIG; LOUGHEED, VANESSA; MACKAY, WIL..."
4,104372,TX,GALVESTON,UNITED STATES,2009,140927.0,"NIESEL, DAVID",


In [4]:
# Get number of researchers, which is the PI plus other PIs. Entries are separated by ; 
# so we can just calculate the number of entries in the string other pis and then add one for primary
prj['pi_count'] = prj['other_pis'].str.split(';').str.len() + 1

# If the count is missing it means that there are no other pis. In this case we wnat to replace that missing value 
# with 1 becasue we have one main pi on each grant
prj['pi_count'] = prj['pi_count'].fillna(1)

# Only projects that get more that have valid values
prj = prj[prj["fy_total_cost"] > 0]

# Just projects under a million 
prj = prj[prj["fy_total_cost"] < 1000000]

# Let's look at the dataframe
prj.head()

Unnamed: 0,project_id,organization_state,organization_city,organization_country,fy,fy_total_cost,contact_pi_project_leader,other_pis,pi_count
0,104368,NJ,PRINCETON,UNITED STATES,2009,350000.0,"CELIA, MICHAEL A",,1.0
1,104369,AZ,TUCSON,UNITED STATES,2009,29895.0,"TIEP, PHAM HUU",,1.0
2,104370,CA,STANFORD,UNITED STATES,2009,112840.0,"MIRZAKHANI, MARYAM",,1.0
3,104371,TX,ELPASO,UNITED STATES,2009,143815.0,"JOHNSON, JERRY","TWEEDIE, CRAIG; LOUGHEED, VANESSA; MACKAY, WIL...",4.0
4,104372,TX,GALVESTON,UNITED STATES,2009,140927.0,"NIESEL, DAVID",,1.0


## Scatterplot

In [5]:
# Generate Scatter plot
x = prj['fy_total_cost']
y = prj['pi_count']

# Create a trace
trace = go.Scattergl(
    x = x,
    y = y,
    mode = 'markers'
)

data = [trace]

# Plot and embed in ipython notebook!
py.iplot(data, filename='basic-scatter')

## Boxplot

In [6]:
# Let's plot from 2010 onwards
y2010 = prj[prj["fy"] == 2010]
y2011 = prj[prj["fy"] == 2011]
y2012 = prj[prj["fy"] == 2012]
y2013 = prj[prj["fy"] == 2013]
y2014 = prj[prj["fy"] == 2014]
y2015 = prj[prj["fy"] == 2015]

trace10 = go.Box(
    y=y2010['fy_total_cost'],
    name = '2010',
    marker = dict(
        color = 'rgb(255,192,203)',
    )
)
trace11 = go.Box(
    y=y2011['fy_total_cost'],
    name = '2011',
    marker = dict(
        color = 'rgb(219,112,147)',
    )
)
trace12 = go.Box(
    y=y2012['fy_total_cost'],
    name = '2012',
    marker = dict(
        color = 'rgb(176,48,96)',
    )
)
trace13 = go.Box(
    y=y2013['fy_total_cost'],
    name = '2013',
    marker = dict(
        color = 'rgb(186,85,211)',
    )
)
trace14 = go.Box(
    y=y2014['fy_total_cost'],
    name = '2014',
    marker = dict(
        color = 'rgb(208,32,144)',
    )
)
trace15 = go.Box(
    y=y2015['fy_total_cost'],
    name = '2015',
    marker = dict(
        color = 'rgb(238,130,238)',
    )
)


data = [trace10, trace11, trace12, trace13, trace14, trace15]
py.iplot(data, filename='basic-boxplot')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




## United States Choropleth Map

In [7]:
subprj = prj[['organization_state','fy_total_cost', 'pi_count']]
subprj.head()

## Aggregate by state:
df = subprj.groupby('organization_state')['organization_state','fy_total_cost', 'pi_count'].mean().reset_index()
df.columns = ['state','funding', 'pi']
df.head()

Unnamed: 0,state,funding,pi
0,AK,270799.52263,1.339026
1,AL,279945.578196,1.269012
2,AR,284206.621364,1.383651
3,AS,210685.015267,1.068702
4,AZ,261432.844142,1.350157


In [8]:
for col in df.columns:
    df[col] = df[col].astype(str)

scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
            [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

df['text'] = df['state'] + '<br>' +\
    'Amount of Funding '+df['funding']+' Number of PIs '+df['pi']

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = df['state'],
        z = df['funding'].astype(float),
        locationmode = 'USA-states',
        text = df['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "USD")
        ) ]

layout = dict(
        title = '2010 US Federal Funding by State<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )

## Bubble Chart

In [9]:
data = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/gapminderDataFiveYear.csv")
data = data[data['year']==2007]
data.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
11,Afghanistan,2007,31889923.0,Asia,43.828,974.580338
23,Albania,2007,3600523.0,Europe,76.423,5937.029526
35,Algeria,2007,33333216.0,Africa,72.301,6223.367465
47,Angola,2007,12420476.0,Africa,42.731,4797.231267
59,Argentina,2007,40301927.0,Americas,75.32,12779.37964


In [10]:
subprj = prj[prj['fy']==2010]
subprj = subprj[['organization_city','organization_state','fy_total_cost', 'pi_count', 'fy']]
subprj.head()

## Aggregate by state:
df = subprj.groupby(['organization_state','organization_city'])['organization_state','organization_city','fy_total_cost', 'pi_count', 'fy'].mean().reset_index()
df.columns = ['state','city','funding', 'pi', 'year']
df.head()

Unnamed: 0,state,city,funding,pi,year
0,AK,ANCHORAGE,349893.608696,1.478261,2010
1,AK,BARROW,483816.0,1.0,2010
2,AK,BETHEL,615000.0,1.0,2010
3,AK,CORDOVA,427962.0,1.0,2010
4,AK,EAGLE RIVER,50000.0,1.0,2010


In [11]:
## Now select 5 States we want to compare: New York, California, Michigan, Texas, Montana
newyork = df['state'] == "NY"
california = df['state'] == "CA"
michigan = df['state'] == "MI"
texas = df['state'] == "TX"
montana = df['state'] == "MT"
df = df[newyork | california | michigan | texas | montana]
df.head()

Unnamed: 0,state,city,funding,pi,year
62,CA,ALAMEDA,586711.666667,1.666667,2010
63,CA,ALBANY,580043.428571,2.0,2010
64,CA,APTOS,33987.0,3.0,2010
65,CA,ARCADIA,386112.888889,1.888889,2010
66,CA,ARCATA,259680.75,1.5,2010


In [12]:
# Add population size
population = [('TX', 28304596) ,
              ('MT', 1042520), 
              ('MI', 9962311),
              ('NY', 19849399),
              ('CA', 39536653)]
labels = ['state','pop']
df_pop = pd.DataFrame.from_records(population, columns=labels)

## Merge to df
df = pd.merge(df, df_pop, on='state')
df.head()

Unnamed: 0,state,city,funding,pi,year,pop
0,CA,ALAMEDA,586711.666667,1.666667,2010,39536653
1,CA,ALBANY,580043.428571,2.0,2010,39536653
2,CA,APTOS,33987.0,3.0,2010,39536653
3,CA,ARCADIA,386112.888889,1.888889,2010,39536653
4,CA,ARCATA,259680.75,1.5,2010,39536653


In [13]:
## Generate graph
df_2010 = df.sort_values(['state','city'])
slope = 2.666051223553066e-05
hover_text = []
bubble_size = []

for index, row in df_2010.iterrows():
    hover_text.append(('City: {city}<br>'+
                      'Number of PIs: {pi}<br>'+
                      'Amount of Funding: {funding}<br>'+
                      'Population: {pop}<br>'+
                      'Year: {year}').format(city=row['city'],
                                            pi=row['pi'],
                                            funding=row['funding'],
                                            pop=row['pop'],
                                            year=row['year']))
    bubble_size.append(math.sqrt(row['pop']*slope))

df_2010['text'] = hover_text
df_2010['size'] = bubble_size
sizeref = 2.*max(df_2010['size'])/(100**2)

trace0 = go.Scatter(
    x=df_2010['pi'][df_2010['state'] == 'CA'],
    y=df_2010['funding'][df_2010['state'] == 'CA'],
    mode='markers',
    name='CA',
    text=df_2010['text'][df_2010['state'] == 'CA'],
    marker=dict(
        symbol='circle',
        sizemode='area',
        sizeref=sizeref,
        size=df_2010['size'][df_2010['state'] == 'CA'],
        line=dict(
            width=2
        ),
    )
)
trace1 = go.Scatter(
    x=df_2010['pi'][df_2010['state'] == 'MI'],
    y=df_2010['funding'][df_2010['state'] == 'MI'],
    mode='markers',
    name='MI',
    text=df_2010['text'][df_2010['state'] == 'MI'],
    marker=dict(
        symbol='circle',
        sizemode='area',
        sizeref=sizeref,
        size=df_2010['size'][df_2010['state'] == 'MI'],
        line=dict(
            width=2
        ),
    )
)
trace2 = go.Scatter(
    x=df_2010['pi'][df_2010['state'] == 'MT'],
    y=df_2010['funding'][df_2010['state'] == 'MT'],
    mode='markers',
    name='MT',
    text=df_2010['text'][df_2010['state'] == 'MT'],
    marker=dict(
        symbol='circle',
        sizemode='area',
        sizeref=sizeref,
        size=df_2010['size'][df_2010['state'] == 'MT'],
        line=dict(
            width=2
        ),
    )
)
trace3 = go.Scatter(
    x=df_2010['pi'][df_2010['state'] == 'NY'],
    y=df_2010['funding'][df_2010['state'] == 'NY'],
    mode='markers',
    name='NY',
    text=df_2010['text'][df_2010['state'] == 'NY'],
    marker=dict(
        symbol='circle',
        sizemode='area',
        sizeref=sizeref,
        size=df_2010['size'][df_2010['state'] == 'NY'],
        line=dict(
            width=2
        ),
    )
)
trace4 = go.Scatter(
    x=df_2010['pi'][df_2010['state'] == 'TX'],
    y=df_2010['funding'][df_2010['state'] == 'TX'],
    mode='markers',
    name='TX',
    text=df_2010['text'][df_2010['state'] == 'TX'],
    marker=dict(
        symbol='circle',
        sizemode='area',
        sizeref=sizeref,
        size=df_2010['size'][df_2010['state'] == 'TX'],
        line=dict(
            width=2
        ),
    )
)

data = [trace0, trace1, trace2, trace3, trace4]
layout = go.Layout(
    title='Number of Investigators v. Federal Funding',
    xaxis=dict(
        title='Number of Investigators',
        gridcolor='rgb(255, 255, 255)',
        range=[1000, 1000000],
        autorange=True,
        autotick=True,
        ticks='',
    ),
    yaxis=dict(
        title='Funding Amount in USD',
        gridcolor='rgb(255, 255, 255)',
        autorange=True,
        autotick=True,
        ticks='',
    ),
    paper_bgcolor='rgb(243, 243, 243)',
    plot_bgcolor='rgb(243, 243, 243)',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bubble-chart')