In [2]:
from collections import Counter
from itertools import chain

import pandas as pd
import numpy as np
import sqlite3
import re

from bkcharts import Bar, cat
from bokeh.io import output_notebook, save
from bokeh.models import HoverTool
from bokeh.palettes import viridis
from bokeh.plotting import show
from scipy import stats

# import CleanData as cd
# import BokehPlots as bp
# import CoinagePlots as cp

In [3]:
# Base name (no extension) of the SQLite database file kept under ../Data/.
title = 'Roman_Imperial_Coinage'
# Relative path, resolved against the notebook's working directory.
fname = '../Data/' + title + '.sqlite'
# Single shared connection handed to every query/plot helper in this notebook.
conn = sqlite3.connect(fname)

# Functions

## Get Coin Counts Data

In [28]:
def countsDF(source, column, conn):
    '''
    Count the coins of table SOURCE per emperor and per value of COLUMN.

    A coin is joined to an emperor when its startDate falls inside that
    emperor's [start, end] interval (both bounds inclusive).

    source -- name of the coin table to query
    column -- name of the coin column to group and count by
    conn   -- database connection handed to pandas.read_sql

    Returns a DataFrame with the columns emperor, COLUMN and counts, ordered
    by start, end and then descending counts.

    SOURCE and COLUMN are spliced into the SQL text verbatim, so only pass
    trusted identifiers.
    '''
    template = '''
    SELECT
      emperor,
      {},
      count({}) AS counts
    FROM
      {} JOIN emperors ON
        startDate >= emperors.start
        AND startDate <= emperors.end
    GROUP BY emperor, {}
    ORDER BY start, end, counts DESC;
    '''
    # Placeholder order: selected column, counted column, table, group column.
    sql = template.format(column, column, source, column)
    return pd.read_sql(sql, conn)

In [29]:
def radiateCountsDF(source, column, conn):
    '''
    Count, per emperor and per value of COLUMN, the coins of table SOURCE
    whose description contains 'radiate'.

    A coin is joined to an emperor when its startDate falls inside that
    emperor's [start, end] interval (both bounds inclusive).

    source -- name of the coin table to query
    column -- name of the coin column to group and count by
    conn   -- database connection handed to pandas.read_sql

    Returns a DataFrame with the columns emperor, COLUMN and counts, ordered
    by start, end and then descending counts.

    SOURCE and COLUMN are spliced into the SQL text verbatim, so only pass
    trusted identifiers.
    '''
    template = '''
    SELECT
      emperor,
      {},
      count({}) AS counts
    FROM
      {} JOIN emperors ON
        startDate >= emperors.start
        AND startDate <= emperors.end
    WHERE description LIKE '%radiate%'
    GROUP BY emperor, {}
    ORDER BY start, end, counts DESC;
    '''
    # Placeholder order: selected column, counted column, table, group column.
    sql = template.format(column, column, source, column)
    return pd.read_sql(sql, conn)

In [30]:
def radiateRatiosDF(source, conn):
    '''
    For each emperor, compute the fraction of that emperor's coins in table
    SOURCE whose description contains 'radiate'.

    A coin is joined to an emperor when its startDate falls inside that
    emperor's [start, end] interval (both bounds inclusive). The result is
    grouped by emperor only; emperors without any matching coin produce no
    row because the filter is applied before grouping.

    source -- name of the coin table to query
    conn   -- database connection handed to pandas.read_sql

    Returns a DataFrame with the columns emperor and keywordRatio, ordered by
    start and end.

    SOURCE is spliced into the SQL text verbatim, so only pass a trusted
    table name.
    '''
    template = '''
    WITH emperorCoins AS (
      SELECT
        emperor,
        description,
        denomination,
        start,
        end
      FROM
        {} JOIN emperors ON
          startDate >= emperors.start
          AND startDate <= emperors.end
    ), emperorTotals AS (
      SELECT
        emperor,
        count(description)  AS emperorCounts
      FROM emperorCoins
      GROUP BY emperor
    )
    SELECT
      emperor,
      count(description) * 1.0 / emperorCounts   AS keywordRatio
    FROM
      emperorCoins JOIN emperorTotals USING (emperor)
    WHERE description LIKE '%radiate%'
    GROUP BY emperor
    ORDER BY start, end;
    '''
    sql = template.format(source)
    return pd.read_sql(sql, conn)

## Generate Coin Counts Plots

In [31]:
def countsPlot(source, column, conn):
    '''
    Build the stacked bar chart of coin counts per emperor for SOURCE, save
    it as HTML and return it.

    source -- table name passed to countsDF; also used in the chart title and
              in the output file name
    column -- column whose values form the stacked segments of each bar
    conn   -- database connection passed to countsDF

    Returns the chart object.
    Side effect: writes ../Plots/<source>_counts_plot.html.
    '''
    df = countsDF(source, column, conn)
    counts = Bar(df, label=cat(columns="emperor", sort=False),
                 palette=viridis(df[column].unique().size),
                 values='counts', stack=column, responsive=True,
                 active_scroll='wheel_zoom',
                 title="Count of coins by Emperor in {} Plot".format(source))

    # The tooltip has to follow COLUMN: the bars are stacked by it, so a
    # hard-coded "@denomination" names a field that is absent whenever another
    # column (e.g. "material") is requested.
    # NOTE(review): bkcharts may expose the aggregated bar value as "height"
    # rather than under the original "counts" name -- confirm "@counts"
    # resolves in the rendered tooltip.
    hover_counts = HoverTool(tooltips=[
        ("emperor", "@emperor"),
        (column, "@" + column),
        ("count", "@counts")
    ])

    counts.add_tools(hover_counts)
    counts.xaxis.axis_label = "Emperors"
    counts.yaxis.axis_label = "Counts"

    save(counts, "../Plots/{}_counts_plot.html".format(source))

    return counts

In [32]:
def radiateCountsPlot(source, column, conn):
    '''
    Build the stacked bar chart of 'radiate' coin counts per emperor for
    SOURCE, save it as HTML and return it.

    source -- table name passed to radiateCountsDF; also used in the chart
              title and in the output file name
    column -- column whose values form the stacked segments of each bar
    conn   -- database connection passed to radiateCountsDF

    Returns the chart object.
    Side effect: writes ../Plots/<source>_radiate_counts_plot.html.
    '''
    df = radiateCountsDF(source, column, conn)
    counts = Bar(df, label=cat(columns="emperor", sort=False),
                 palette=viridis(df[column].unique().size),
                 values='counts', stack=column, responsive=True,
                 active_scroll='wheel_zoom',
                 title="Count of 'Radiate Crowns' per Emperor in " + source + " Plot")

    # The tooltip has to follow COLUMN: the bars are stacked by it, so a
    # hard-coded "@denomination" names a field that is absent whenever another
    # column (e.g. "material") is requested.
    # NOTE(review): bkcharts may expose the aggregated bar value as "height"
    # rather than under the original "counts" name -- confirm "@counts"
    # resolves in the rendered tooltip.
    hover_counts = HoverTool(tooltips=[
        ("emperor", "@emperor"),
        (column, "@" + column),
        ("count", "@counts")
    ])

    counts.add_tools(hover_counts)
    counts.xaxis.axis_label = "Emperors"
    counts.yaxis.axis_label = "Counts"

    save(counts, "../Plots/" + source + "_radiate_counts_plot.html")

    return counts

In [33]:
def radiateRatiosPlot(source, conn):
    '''
    Build the bar chart of the per-emperor 'radiate' ratio (keywordRatio from
    radiateRatiosDF) for SOURCE, save it as HTML and return it.

    source -- table name passed to radiateRatiosDF; also used in the chart
              title and in the output file name
    conn   -- database connection passed to radiateRatiosDF

    Returns the chart object.
    Side effect: writes ../Plots/<source>_ratios_plot.html.
    '''
    df = radiateRatiosDF(source, conn)
    ratios = Bar(df, label=cat(columns="emperor", sort=False), values='keywordRatio',
                 responsive=True, active_scroll='wheel_zoom', legend=False,
                 title="Ratio of 'Radiate Crowns' by Emperor in " + source + " Plot")

    # The previous "@percentages" field does not exist anywhere: the query
    # yields emperor and keywordRatio only, so the tooltip could never resolve.
    # NOTE(review): "@height" assumes bkcharts stores each bar's aggregated
    # value in a "height" field of the glyph data -- confirm in the rendered
    # tooltip; fall back to "@keywordRatio" if the original name is kept.
    hover_counts = HoverTool(tooltips=[
        ("emperor", "@emperor"),
        ("Percentage", "@height")
    ])

    ratios.add_tools(hover_counts)
    ratios.xaxis.axis_label = "Emperors"
    ratios.yaxis.axis_label = "Percentages"

    save(ratios, "../Plots/" + source + "_ratios_plot.html")

    return ratios

## Create Word Counts Plots

In [11]:
def wordCountsPlot(source, conn, minCount=100):
    '''
    Count the words used in the obverse and reverse type texts of the
    "radiate" coins in SOURCE, print the frequent ones and show them as a
    grouped bar chart.

    source   -- name of the coin table to query (spliced into the SQL text
                verbatim, so only pass a trusted table name)
    conn     -- database connection handed to pandas.read_sql
    minCount -- a word is kept only when it occurs more than this many times
                on one side of the coin (default 100)

    Words are counted case-sensitively after every character that is neither
    an ASCII letter nor whitespace has been removed. Returns None; the table
    is printed and the chart is displayed with show().
    '''
    # Single-quoted SQL string literal, matching the other queries in this
    # notebook; double quotes denote identifiers in standard SQL.
    query = '''
    SELECT
      obverseType,
      reverseType
    FROM {}
    WHERE description LIKE '%radiate%'
    '''.format(source)
    coins = pd.read_sql(query, conn)

    def _sideWordCounts(texts, side):
        # Tally the words of one coin side and keep those above the threshold.
        words = chain.from_iterable(
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            re.sub(r'[^a-zA-Z\s]+', '', text).split()
            # NULL type texts come back as non-strings; skip them instead of
            # letting re.sub raise a TypeError.
            for text in texts if isinstance(text, str)
        )
        tally = pd.DataFrame(list(Counter(words).items()), columns=['Word', 'Count'])
        # copy() so that adding 'Side' does not write into a filtered view.
        tally = tally[tally['Count'] > minCount].copy()
        tally['Side'] = side
        return tally

    df = pd.concat([
        _sideWordCounts(coins['obverseType'], 'Obverse'),
        _sideWordCounts(coins['reverseType'], 'Reverse'),
    ])
    print(df)

    wordCounts = Bar(df, label=cat(columns=["Word", 'Side'], sort=True), values='Count',
                     responsive=True, active_scroll='wheel_zoom', legend=False, group='Side',
                     title="Count of words on 'Radiate' coins in " + source + " Plot")

    show(wordCounts)
    
# Print the frequent-word table and show the chart for the britishMuseum table.
wordCountsPlot('britishMuseum', conn)

          Word  Count     Side
19        bust    151  Obverse
55       right    725  Obverse
65     radiate    550  Obverse
71        Head    227  Obverse
73         and    396  Obverse
100         of    782  Obverse
101  cuirassed    365  Obverse
141     draped    335  Obverse
163       Bust    347  Obverse
205    Radiate    189  Obverse
208       left    108  Obverse
26     sceptre    139  Reverse
102   standing    442  Reverse
120         on    256  Reverse
324     seated    133  Reverse
329        and    567  Reverse
371       left    977  Reverse
527         in    727  Reverse
612      right    623  Reverse
657         of    129  Reverse
701       with    142  Reverse
714       hand    688  Reverse
734    holding    493  Reverse


# Create Counts and Ratios Plots

In [34]:
# Tables to chart, and the coin column used to stack the count bars.
sources = ['britishMuseum', 'americanNumismaticSociety', 'OCRE', 'allData']
column = "material"
for source in sources:
    # Every helper saves its chart as an HTML file under ../Plots/ and returns
    # it; uncomment the matching show(...) line to also render it inline.
    counts = countsPlot(source, column, conn)
    # show(counts)
    radiateCounts = radiateCountsPlot(source, column, conn)
    # show(radiateCounts)
    ratios = radiateRatiosPlot(source, conn)
    # show(ratios)



# Statistical Significance of Data

In [23]:
# Per-emperor 'radiate' ratios for the two collections being compared.
bm_ratio = radiateRatiosDF('britishMuseum', conn)
ans_ratio = radiateRatiosDF('americanNumismaticSociety', conn)
# merge() defaults to an inner join, so only emperors present in both frames
# remain. The shared keywordRatio column is suffixed: keywordRatio_x comes from
# britishMuseum (left frame), keywordRatio_y from americanNumismaticSociety.
merged = bm_ratio.merge(ans_ratio, on='emperor')

In [24]:
# Inspect the last rows of the merged per-emperor ratios.
merged.tail()

Unnamed: 0,emperor,keywordRatio_x,keywordRatio_y
49,Severus II,0.00813,0.008333
50,Maxentius,0.016327,0.011268
51,Constantine the Great,0.020356,0.037863
52,Licinius I,0.033259,0.067402
53,Maximinus II,0.032258,0.058824


In [25]:
# Linear regression with the britishMuseum ratios as x and the
# americanNumismaticSociety ratios as y.
stats.linregress(merged['keywordRatio_x'], merged['keywordRatio_y'])

LinregressResult(slope=0.90071854486013636, intercept=0.07711116939117077, rvalue=0.49712715812502589, pvalue=0.00013127840055810556, stderr=0.21801098570817246)