# Archived Holdings - HDFS Status

This notebook breaks down what is stored on HDFS (WARC and log files) by month, collection and stream, using the Solr tracking index.

In [2]:
import os
import sys
# Put the parent of the current working directory at the front of the module
# search path so that packages living there (presumably the local `solr`
# helper package imported further down) can be imported.
parent_dir = os.path.abspath(os.pardir)
sys.path.insert(0, parent_dir)

In [3]:
import json
import requests
import pandas as pd

from solr.solr_facet_helper import flatten_solr_buckets

# One decimal terabyte (10^12 bytes); used to convert the summed byte counts.
BYTES_PER_TERABYTE = 1000 * 1000 * 1000 * 1000
# Upper bound on how long to wait for Solr, so a stalled server cannot hang
# the notebook forever. Deliberately generous because the facet query is heavy.
SOLR_TIMEOUT_SECONDS = 300

headers = {'content-type': "application/json" }

json_facet = {
    # Primary facet is by date: one bucket per month, starting ten years
    # before the beginning of the current year and ending at the start of
    # next year.
    'facet': {
        'dates' : {
            'type' : 'range',
            'field' : 'timestamp_dt',
            'start' : "NOW/YEAR-10YEAR",
            'end' : "NOW/YEAR+1YEAR",
            'gap' : "+1MONTH",
            # Within each month, facet on the collection and then on the
            # stream. 'missing': True asks Solr to also return a bucket for
            # documents that have no value in the field.
            'facet': {
                'collection': {
                    'type': 'terms',
                    "field": "collection_s",
                    'missing': True,
                    'facet': {
                        'stream': {
                            'type': 'terms',
                            "field": "stream_s",
                            'missing': True,
                            'facet' : {
                                # Sum of file_size_l over each
                                # (month, collection, stream) bucket.
                                'bytes': 'sum(file_size_l)'
                            }
                        }
                    }
                }
            }
        }
    }
}


# Only WARC and log files are counted; rows=0 because only the facet
# aggregations are needed, not the matching documents themselves.
params = {
  'q': '(kind_s:"warcs" OR kind_s:"logs")',
  'rows': 0
}

r = requests.post(
    "http://solr8.api.wa.bl.uk/solr/tracking/select",
    params=params,
    data=json.dumps(json_facet),
    headers=headers,
    timeout=SOLR_TIMEOUT_SECONDS,
)

if r.status_code != 200:
    # Show Solr's own error body, then stop on HTTP errors here rather than
    # failing later with a confusing KeyError/JSON error while parsing facets.
    print(r.text)
    r.raise_for_status()

df = pd.DataFrame(flatten_solr_buckets(r.json()['facets']))
# Filter empty rows. The copy makes the result an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning.
df = df[df['count'] != 0].copy()

# Add compound column ("<collection>, <stream>"), built column-wise rather
# than with a row-by-row apply; this also works when the frame is empty.
df['status'] = [
    "%s, %s" % (collection, stream)
    for collection, stream in zip(df['collection'], df['stream'])
]
# Convert the summed byte counts to decimal terabytes.
df['terabytes'] = df['bytes'] / BYTES_PER_TERABYTE

df

Unnamed: 0,dates,collection,stream,count,bytes,status,terabytes
0,2013-01-01T00:00:00Z,selective,selective,7377,5.015786e+11,"selective, selective",0.501579
3,2013-02-01T00:00:00Z,selective,selective,8969,6.175074e+11,"selective, selective",0.617507
6,2013-03-01T00:00:00Z,selective,selective,16106,1.641558e+12,"selective, selective",1.641558
9,2013-04-01T00:00:00Z,npld,domain,20949,2.149665e+13,"npld, domain",21.496645
10,2013-04-01T00:00:00Z,npld,frequent,1301,1.233494e+12,"npld, frequent",1.233494
...,...,...,...,...,...,...,...
465,2022-09-01T00:00:00Z,npld,frequent,9133,9.049949e+12,"npld, frequent",9.049949
468,2022-10-01T00:00:00Z,npld,frequent,10162,9.986703e+12,"npld, frequent",9.986703
471,2022-11-01T00:00:00Z,npld,frequent,9596,9.518520e+12,"npld, frequent",9.518520
474,2022-12-01T00:00:00Z,npld,frequent,9862,9.730586e+12,"npld, frequent",9.730586


In [4]:
import altair as alt

# Stacked bars of stored volume per month, one colour per
# "collection, stream" status.
monthly_volume_tooltip = [
    alt.Tooltip('dates:T', format='%A, %e %B %Y'),
    'status:N',
    'count',
    'terabytes',
]

(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('dates:T', axis=alt.Axis(title='Date', format='%b %Y')),
        y=alt.Y('terabytes', axis=alt.Axis(title='Data volume (TB)')),
        color='status:N',
        tooltip=monthly_volume_tooltip,
    )
    .properties(width=600)
)

In [5]:
# Total volume (TB) held under each "collection, stream" status, summed over
# all months. The previous extra `.groupby(level=0).cumsum()` step was a no-op
# (each status has exactly one row after the sum) and wrongly suggested the
# figures were cumulative, so it has been dropped; the result is unchanged.
df2 = df.groupby('status')['terabytes'].sum().reset_index()
df2

Unnamed: 0,status,terabytes
0,"npld, domain",476.085521
1,"npld, frequent",590.879437
2,"npld, webrecorder",0.003367
3,"selective, selective",10.736427


In [6]:
import altair as alt

# Running total of stored volume over time, across all collections/streams.
# The window is sorted by date so the running sum is accumulated in
# chronological order regardless of the row order of df. Rows sharing a date
# are peers under that sort, so (with Vega-Lite's default ignorePeers=false)
# every row of a month carries the same value: the total up to and including
# that month.
alt.Chart(df).transform_window(
    cumulative_terabytes="sum(terabytes)",
    sort=[{'field': 'dates'}],
).mark_area().encode(
    x=alt.X('dates:T', axis=alt.Axis(title='Date', format=("%b %Y"))),
    y=alt.Y('cumulative_terabytes:Q', axis=alt.Axis(title='Cumulative total data volume (TB)')),
    tooltip=[alt.Tooltip('dates:T', format='%A, %e %B %Y'), 'cumulative_terabytes:Q']
).properties(width=600)

In [7]:
# Share of files per month by "collection, stream" status: the bars are
# normalised so that every month stacks up to 100%.
file_share_tooltip = [
    alt.Tooltip('dates:T', format='%A, %e %B %Y'),
    'status:N',
    'count',
    'bytes',
]

(
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('dates:T', axis=alt.Axis(title='Date', format='%b %Y')),
        y=alt.Y(
            'count',
            stack="normalize",
            axis=alt.Axis(title='Percentage of files', format='%'),
        ),
        color='status:N',
        tooltip=file_share_tooltip,
    )
    .properties(width=600)
)