In [1]:
import datetime
import getpass
import json
import os
import time
import urllib
import urllib.request

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)

import mysql.connector
from mysql.connector import Error

from bokeh.plotting import *
from bokeh.layouts import gridplot
from bokeh.models import ColumnDataSource, LogColorMapper, ColorMapper, LogTicker, ColorBar, BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter
from bokeh.models.widgets import Tabs, Panel
# from .chart_constants import (PLOT_FORMATS, ORANGE, BLUE, DARK_GRAY, AXIS_FORMATS, ORANGE_SHADOW, 
#                              FONT_PROPS_SM, FONT_PROPS_MD, FONT_PROPS_LG, GREEN)
from bokeh.io import show, output_notebook
import bokeh.palettes as bp

In [2]:
# Download the Helioviewer data-source catalogue (nested dict keyed by
# observatory / instrument / measurement). The context manager closes the
# HTTP response as soon as the payload has been read.
with urllib.request.urlopen('https://api.helioviewer.org/?action=getDataSources') as json_url:
    hv_keys = json.loads(json_url.read())

In [3]:
# Pull (filepath, date, sourceid) for every SDO/AIA 1600 file from the local
# Helioviewer MySQL database into `records`.
start_time = time.time()

# Bind both handles up front so the cleanup in `finally` is safe even when
# connect() or cursor() itself fails.
connection = None
cursor = None
try:
    # Connection settings come from the environment; the password is never
    # stored in the notebook and is prompted for when HV_DB_PASSWORD is unset.
    connection = mysql.connector.connect(
        host=os.environ.get('HV_DB_HOST', 'localhost'),
        database=os.environ.get('HV_DB_NAME', 'hv'),
        user=os.environ.get('HV_DB_USER', 'hv_varun'),
        password=os.environ.get('HV_DB_PASSWORD') or getpass.getpass('MySQL password: '),
    )

    # The source id comes from a remote API, so let the driver bind it as a
    # query parameter instead of interpolating it into the SQL text.
    sql_select_Query = "SELECT filepath, date, sourceid FROM data WHERE sourceId=%s;"
    source_id = hv_keys['SDO']['AIA']['1600']['sourceId']

    cursor = connection.cursor()
    cursor.execute(sql_select_Query, (source_id,))
    records = cursor.fetchall()
    print("Total number of rows in data is: ", cursor.rowcount)

except Error as e:
    print("Error reading data from MySQL table", e)
finally:
    # Release the cursor before the connection that owns it.
    if cursor is not None:
        cursor.close()
    if connection is not None and connection.is_connected():
        connection.close()
        print("MySQL connection is closed", time.time()-start_time )

Total number of rows in data is:  7056352

Printing each laptop record
MySQL connection is closed 192.19297242164612


In [4]:
# Wrap the fetched rows in a DataFrame, order them chronologically and
# renumber the index from 0.
hv = (
    pd.DataFrame(records, columns=cursor.column_names)
    .sort_values('date')
    .reset_index(drop=True)
)
hv

Unnamed: 0,filepath,date,sourceid
0,/AIA/1600/2010/06/02,2010-06-02 00:05:30,15
1,/AIA/1600/2010/06/02,2010-06-02 00:05:54,15
2,/AIA/1600/2010/06/02,2010-06-02 00:06:18,15
3,/AIA/1600/2010/06/23,2010-06-23 00:00:17,15
4,/AIA/1600/2010/06/23,2010-06-23 00:00:41,15
...,...,...,...
7056347,/AIA/2020/08/03/1600,2020-08-03 13:59:26,15
7056348,/AIA/2020/08/03/1600,2020-08-03 14:00:14,15
7056349,/AIA/2020/08/03/1600,2020-08-03 14:01:02,15
7056350,/AIA/2020/08/03/1600,2020-08-03 14:01:50,15


In [5]:
# Daily bin edges spanning the whole data set: from one day before the first
# observation's month starts to the first day of the month after the last one.
first_edge = hv['date'].min().replace(day=1) - pd.DateOffset(days=1)
last_edge = hv['date'].max().replace(day=1) + pd.DateOffset(months=1)
day_bins = (
    pd.date_range(first_edge, last_edge, freq='D')
    .to_period('D')
    .to_timestamp()  # round-trip through daily periods moves each edge to the start of its day
)

In [6]:
# Tally the number of files falling inside each daily bin; `bins` holds the
# edges that np.histogram actually used.
counts, bins = np.histogram(hv['date'], bins=day_bins)

In [106]:
# Spot-check: number of files counted in the daily bin at index 510.
counts[510]

184

In [107]:
# Indices of the daily bins holding more than 100 but fewer than 300 files.
in_band = (counts > 100) & (counts < 300)
np.argwhere(in_band)

array([[510],
       [672]])

In [119]:
def interactive_histogram(df,col,n_bins,title,x_axis_label,x_tooltip,log=None):
    """Build a tabbed interactive bokeh histogram (linear and log count axes).

    df: pandas dataframe or array-like holding the values to bin.
    col: column of `df` to plot (eg. 'age'). Used only when `df` is a
        dataframe that contains this column; otherwise `df` is binned as is.
    n_bins: passed to np.histogram as `bins` - a bin count (e.g. 9) or a
        sequence of bin edges.
    title: title of plot. e.g. 'Airnb Users Age Distribution'
    x_axis_label: x axis label. e.g. 'Age (years)'.
    x_tooltip: label shown in front of the bin interval in the hover tooltip.
    log: unused; accepted only so existing calls keep working.

    Returns a bokeh Tabs widget with one panel per y-axis type
    ("linear", "log"); both panels share one data source.
    """
    # Only the bokeh names this function needs beyond the notebook-level imports.
    from bokeh.models import ColumnDataSource, HoverTool
    from bokeh.plotting import figure

    arr_df = _histogram_table(_select_values(df, col), n_bins)
    arr_src = ColumnDataSource(arr_df)

    panels = []
    for axis_type in ["linear", "log"]:
        p = figure(y_axis_type=axis_type,
                   title = title,
                   x_axis_label = x_axis_label,
                   y_axis_label = 'Count',
                   background_fill_color="#fafafa")

        # A log axis cannot start at 0, so bars there begin just below a
        # count of 1; on the linear axis they start at 0 so that bars for
        # small counts are not clipped.
        bar_bottom = 0 if axis_type == "linear" else 0.95
        p.quad(bottom=bar_bottom,
               top='count',
               left='left',
               right='right',
               source=arr_src,
               fill_color='navy',
               alpha=0.5,
               hover_fill_alpha=0.2,
               line_color='white')

        # Add style to the plot
        p.title.align = 'center'
        p.title.text_font_size = '18pt'
        p.xaxis.axis_label_text_font_size = '12pt'
        p.xaxis.major_label_text_font_size = '12pt'
        p.yaxis.axis_label_text_font_size = '12pt'
        p.yaxis.major_label_text_font_size = '12pt'
        p.grid.grid_line_color="white"

        # Hover tool showing the pre-formatted text columns of the table
        hover = HoverTool(tooltips = [(x_tooltip, '@f_interval'),
                                      ('Count', '@f_count'),
                                      ('Percent', '@f_percent')])
        p.add_tools(hover)

        panels.append(Panel(child=p, title=axis_type))

    return Tabs(tabs=panels)


def _select_values(df, col):
    """Return df[col] when df is a dataframe containing col, else df unchanged."""
    columns = getattr(df, 'columns', None)
    if col is not None and columns is not None and col in columns:
        return df[col]
    return df


def _histogram_table(values, n_bins):
    """Bin `values` and return one row per bin with counts, edges and tooltip text."""
    arr_hist, edges = np.histogram(values, bins=n_bins)

    table = pd.DataFrame({'count': arr_hist, 'left': edges[:-1], 'right': edges[1:]})
    total = table['count'].sum()
    table['f_count'] = ['%d' % count for count in table['count']]
    # Report 0.00% for every bin rather than dividing by zero when nothing was counted.
    table['f_percent'] = ['%.2f%%' % (count/total*100 if total else 0.0)
                          for count in table['count']]
    table['f_interval'] = ['%d to %d ' % (left, right)
                           for left, right in zip(table['left'], table['right'])]
    return table

# Render the distribution of files-per-day for AIA 1600 to a standalone HTML page.
output_file('AIA1600_histogram.html')

BIN_WIDTH = 100  # number of files covered by each histogram bin

df = counts
col = None  # `counts` is a plain array, so there is no column to pick
# np.histogram ignores values beyond the last edge, so the edges must reach at
# least counts.max(); stopping below it would silently drop the busiest days.
# The max(..., 1) guard keeps two edges even if every count is zero.
n_bins = np.arange(0, max(counts.max(), 1) + BIN_WIDTH, BIN_WIDTH)
title = 'Histogram for AIA 1600'
x_axis_label = 'No. of Data files'
x_tooltip = '#Data files'
btabs = interactive_histogram(df,col,n_bins,title,x_axis_label,x_tooltip,log='log')

# Show the plot
show(btabs)