# Solr Query Stats

In [1]:
import requests
import pandas as pd

# Collections:
#SOLR_URL = "http://solr8.bapi.wa.bl.uk/solr/fc-2020-test/sql"
#COLLECTION = 'fc-2020-test'
SOLR_URL = "http://solr.api.wa.bl.uk/solr/all/sql"
COLLECTION = 'selective-20190221'

# Form parameters for the Solr Parallel SQL endpoint.
s = {
    # See https://lucene.apache.org/solr/guide/8_5/parallel-sql-interface.html#aggregation-modes
    # NB: the parameter name must be exactly 'aggregationMode'; a stray trailing
    # space in the key posts it under a different name than the one documented.
    'aggregationMode': 'facet',
    # See https://lucene.apache.org/solr/guide/8_5/parallel-sql-interface.html
    # Count records per (content type, crawl year), keeping groups with more than 2 records.
    'stmt': "SELECT content_type_norm, crawl_year, count(*) as total FROM `%s` GROUP BY content_type_norm, crawl_year HAVING count(*) > 2" % COLLECTION
}

r = requests.post(SOLR_URL, data=s)

# Peek at the start of the raw response body before parsing it.
print(r.text[0:1000])

data = r.json()
# The last entry of 'docs' is dropped: it is a trailer record rather than a
# result row (elsewhere in this notebook RESPONSE_TIME is read from it).
df = pd.DataFrame(data['result-set']['docs'][:-1])
df

{"result-set":{"docs":[{"content_type_norm":"audio","crawl_year":1995,"total":6},{"content_type_norm":"audio","crawl_year":2001,"total":26},{"content_type_norm":"audio","crawl_year":2004,"total":595},{"content_type_norm":"audio","crawl_year":2005,"total":5743},{"content_type_norm":"audio","crawl_year":2006,"total":8638},{"content_type_norm":"audio","crawl_year":2007,"total":11123},{"content_type_norm":"audio","crawl_year":2008,"total":43226},{"content_type_norm":"audio","crawl_year":2009,"total":65301},{"content_type_norm":"audio","crawl_year":2010,"total":67908},{"content_type_norm":"audio","crawl_year":2011,"total":73974},{"content_type_norm":"audio","crawl_year":2012,"total":106091},{"content_type_norm":"audio","crawl_year":2013,"total":59333},{"content_type_norm":"audio","crawl_year":2014,"total":53606},{"content_type_norm":"audio","crawl_year":2015,"total":34898},{"content_type_norm":"audio","crawl_year":2016,"total":9798},{"content_type_norm":"excel","crawl_year":2004,"total":8},

Unnamed: 0,content_type_norm,crawl_year,total
0,audio,1995,6
1,audio,2001,26
2,audio,2004,595
3,audio,2005,5743
4,audio,2006,8638
...,...,...,...
151,word,2012,450
152,word,2013,1929
153,word,2014,1058
154,word,2015,189


In [2]:
import altair as alt

# Stacked bars per crawl year, normalised so each bar shows the share of every
# content type; hovering a segment reveals the underlying values.
alt.Chart(df).mark_bar().encode(
    x=alt.X('crawl_year:N'),
    y=alt.Y('total:Q', stack="normalize"),
    color=alt.Color('content_type_norm:N'),
    tooltip=['content_type_norm', 'total', 'crawl_year'],
)

In [101]:
import requests
import pandas as pd

# See https://lucene.apache.org/solr/guide/8_5/parallel-sql-interface.html
def run_solr_sql(stmt, 
                 solr_url = "http://solr.api.wa.bl.uk/solr/all/sql",
                 # See https://lucene.apache.org/solr/guide/8_5/parallel-sql-interface.html#aggregation-modes
                 aggregationMode = 'facet',
                 numWorkers = 8
                ):
    """Run a SQL statement against a Solr Parallel SQL endpoint.

    Parameters
    ----------
    stmt : str
        The SQL statement to execute.
    solr_url : str
        URL of the Solr ``/sql`` handler the statement is POSTed to.
    aggregationMode : str
        Value sent as the ``aggregationMode`` request parameter
        (e.g. ``'facet'`` or ``'map_reduce'``).
    numWorkers : int
        Value sent as the ``numWorkers`` request parameter.

    Returns
    -------
    pandas.DataFrame or None
        A frame built from every entry of ``result-set.docs`` except the last
        (the trailer record), or ``None`` when the HTTP status is not 200.
        An empty frame is returned when the response carries no docs at all.

    Notes
    -----
    Diagnostics are printed rather than raised: the response body on a
    non-200 status, the first doc when it contains an ``EXCEPTION`` key, and
    the ``RESPONSE_TIME`` reported by the trailer record when present.
    """
    params = {
        # The key must be exactly 'aggregationMode' (no trailing space),
        # matching the documented parameter name.
        'aggregationMode': aggregationMode,
        'numWorkers': numWorkers,
        'stmt': stmt
    }

    # Post to the URL the caller asked for, not to a notebook-level global.
    r = requests.post(solr_url, data=params)
    if r.status_code != 200:
        print(r.text)
        return None

    docs = r.json()['result-set']['docs']
    if not docs:
        # Nothing to report and no trailer to strip.
        return pd.DataFrame()

    if "EXCEPTION" in docs[0]:
        print(docs[0])

    trailer = docs[-1]
    if 'RESPONSE_TIME' in trailer:
        print("RESPONSE_TIME [ms] = %i" % trailer['RESPONSE_TIME'])

    # Everything before the trailer record is a result row.
    return pd.DataFrame(docs[:-1])


# Collections:
#SOLR_URL = "http://solr8.bapi.wa.bl.uk/solr/fc-2020-test/sql"
#COLLECTION = 'fc-2020-test'

collection = 'selective-20190221'

# SQL template (collection name substituted via %s): per crawl year and domain,
# count the records whose url_path is '/', restricted to 0 < crawl_year < 3000.
stmt_yrs_dmns_count = (
    "SELECT crawl_year, domain, count(*) as url_count "
    "FROM `%s` "
    "WHERE crawl_year > 0 AND crawl_year < 3000 AND url_path = '/' "
    "GROUP BY crawl_year, domain "
    "ORDER BY crawl_year, domain"
)

# Run with the function's default aggregation mode.
dff = run_solr_sql(stmt_yrs_dmns_count % collection)
dff

RESPONSE_TIME [ms] = 545


Unnamed: 0,crawl_year,domain,url_count
0,1995,portico.bl.uk,1
1,2001,bl.uk,2
2,2001,portico.bl.uk,1
3,2001,rcpch.ac.uk,1
4,2002,bl.uk,3
...,...,...,...
1245,2008,balance-riteosteopathy.co.uk,1
1246,2008,balancedscorecard.org,1
1247,2008,baliinstitute.org,1
1248,2008,balticmill.com,1


In [102]:
# Number of rows returned for each crawl year.
dff.groupby('crawl_year').count()

Unnamed: 0_level_0,domain,url_count
crawl_year,Unnamed: 1_level_1,Unnamed: 2_level_1
1995,1,1
2001,3,3
2002,1,1
2003,1,1
2004,1,1
2008,1243,1243


In [98]:
# Same statement again, this time explicitly requesting the map_reduce aggregation mode.
dfs = run_solr_sql(
    stmt_yrs_dmns_count % collection,
    aggregationMode='map_reduce',
)
dfs

RESPONSE_TIME [ms] = 6994


Unnamed: 0,crawl_year,domain,url_count
0,1995,portico.bl.uk,1
1,2001,bl.uk,2
2,2001,portico.bl.uk,1
3,2001,rcpch.ac.uk,1
4,2002,bl.uk,3
...,...,...,...
78399,2016,yfc-ceredigion.org.uk,1
78400,2016,yfc-wales.org.uk,1
78401,2016,ylolfa.com,1
78402,2016,ymddiriedolaethjamespantyfedwen.org.uk,1


In [104]:
# Count of domain rows per crawl year, with crawl_year restored as a regular column.
dfs.groupby('crawl_year')['domain'].count().reset_index()

Unnamed: 0,crawl_year,domain
0,1995,1
1,2001,3
2,2002,1
3,2003,1
4,2004,1
5,2008,17105
6,2009,23095
7,2010,18776
8,2011,4574
9,2012,5614


In [70]:
        #'stmt': "SELECT DISTINCT crawl_year, domain, count(*) as url_count FROM `%s` GROUP BY crawl_year, domain ORDER BY crawl_year" % COLLECTION
        #'stmt': "SELECT domain, count(*) FROM `%s` GROUP BY domain ORDER BY domain" % COLLECTION


In [100]:
import os
import json
import requests

# Explicitly unset per-scheme proxies for this request
# (presumably to bypass any proxy configured in the environment -- confirm).
proxies = dict(http=None, https=None)
# Collection names: all, NPLD-DC2015-20190311 NPLD-DC2016-20190315

# Streaming Expression equivalent of the SQL query: search home pages
# (url_path "/") with crawl_year in [0, 3000], de-duplicate on
# (crawl_year, domain), then roll up per crawl_year counting domains.
# The author expects this raw expression to be the most accurate approach;
# the home-page restriction keeps the run time reasonable.
q = {
    'expr': 'parallel(all, \
        rollup( \
            unique( \
                search(%s, \
                    q="crawl_year:[0 TO 3000] AND url_path:\"/\"", \
                    fl="domain,crawl_year", \
                    sort="crawl_year asc, domain asc", \
                    qt="/export", \
                    partitionKeys="crawl_year"), \
                over="crawl_year, domain"\
                ), \
            over="crawl_year",count(domain) \
        ), \
        workers="10", \
        sort="crawl_year asc")' % collection
}

r = requests.post(
    url='http://solr.api.wa.bl.uk/solr/all/stream',
    data=q,
    proxies=proxies,
)

# Turn the response into a DataFrame (stays None if the request failed)
df = None
if r.status_code == 200:
    data = r.json()
    # An error is reported as an EXCEPTION key on the first doc.
    if "EXCEPTION" in data['result-set']['docs'][0]:
        print(data['result-set']['docs'][0])
    # The final doc carries the timing; all preceding docs are result rows.
    print("RESPONSE_TIME [ms] = %i" % data['result-set']['docs'][-1]['RESPONSE_TIME'])
    df = pd.DataFrame(data['result-set']['docs'][:-1])
else:
    print(r.text)
df

RESPONSE_TIME [ms] = 1836


Unnamed: 0,count(domain),crawl_year
0,1,1995
1,3,2001
2,1,2002
3,1,2003
4,1,2004
5,17105,2008
6,23095,2009
7,18776,2010
8,4574,2011
9,5614,2012
