Experimenting with Solr SQL
===========================

This notebook is for experimenting with Solr's Parallel SQL interface, especially via an SQLAlchemy plugin so that it's exactly like normal SQL.

Unfortunately, the current state of play is that Solr 6 does not cope with SQL queries on _aliases_, so it's not much use for text analysis right now.

The SQL system does work okay for Solr 8, but the `SELECT *` logic seems to be a bit brittle (at least via the SQLAlchemy module).  It works pretty reliably if the fields are explicitly enumerated.

In [1]:
!pip install sqlalchemy-solr



In [42]:
from sqlalchemy import create_engine

# Solr collection to query; the same name is used in the engine URL and as
# the SQL table name.
fc = 'NPLD-FC2017-20190228'

engine = create_engine('solr://solr.api.wa.bl.uk:80/solr/%s' % fc)

# Unfortunately, for Solr 6, we can't query aliases, and selecting all
# fields (*) leads to class cast exceptions
# (java.lang.Long cannot be cast to java.lang.String),
# so the wanted columns are listed explicitly.
rows = engine.execute("SELECT id,url,wayback_date FROM `%s` LIMIT 1" % fc)

# Print each column of each returned row as "<column> <value>".
for row in rows:
    for name, cell in row.items():
        print(name, cell)


************************************
Query: SELECT id,url,wayback_date FROM `NPLD-FC2017-20190228` LIMIT 1
************************************
id 20171225120530/r1upsuMttEfpjRI2R4rN7Q==
url http://www.newquayvoice.co.uk/news/5/article/2920/
wayback_date 20171225120530
************************************
Catched StopIteration in fetchone
************************************


In [21]:
# Disabled variant that wrote the predicates as field:'value' pairs:
#rows = engine.execute("SELECT id,url FROM `NPLD-FC2017-20190228` WHERE (host:'theguardian.com' OR host:'independent.co.uk' OR host:'dailymail.co.uk' OR host:'express.co.uk' OR host:'thesun.co.uk' OR host:'mirror.co.uk' OR host:'dailystar.co.uk') AND ((title:meghan AND title:harry) OR (title:meghan AND title:markle)) LIMIT 1")

# Active variant: every predicate is written as `field = '...'`, and the host
# value is a parenthesised, space-separated list of host names.
rows = engine.execute(
    "SELECT id,url,title,host FROM `NPLD-FC2017-20190228` "
    "WHERE host = '(theguardian.com independent.co.uk dailymail.co.uk express.co.uk thesun.co.uk mirror.co.uk dailystar.co.uk)' "
    "AND ((title = 'meghan' AND title = 'markle') OR (title = 'meghan' AND title = 'harry')) "
    "ORDER BY crawl_date LIMIT 1"
)

# Print each column of each returned row as "<column> <value>".
for row in rows:
    for name, cell in row.items():
        print(name, cell)


************************************
Query: SELECT id,url,title,host FROM `NPLD-FC2017-20190228` WHERE host = '(theguardian.com independent.co.uk dailymail.co.uk express.co.uk thesun.co.uk mirror.co.uk dailystar.co.uk)' AND ((title = 'meghan' AND title = 'markle') OR (title = 'meghan' AND title = 'harry')) ORDER BY crawl_date LIMIT 1
************************************
id 20170101102033/lhGw4C4wmG+0iFG0t7JrFw==
url http://www.dailymail.co.uk/tvshowbiz/article-3976578/amp/Meghan-Markle-enjoys-cocktails-Quantico-star-Priyanka-Chopra-Prince-Harry-tours-Caribbean.html
title Meghan Markle enjoys cocktails with Quantico's Priyanka Chopra while Prince Harry tours
host dailymail.co.uk
************************************
Catched StopIteration in fetchone
************************************


In [27]:
# pandas is imported here because `pd` is otherwise only imported in a later
# cell; without this a fresh "Restart Kernel -> Run All" fails with a
# NameError on `pd` when it reaches this cell.
import pandas as pd

# Same host/title filter as the single-row query, but loaded straight into a
# DataFrame (with crawl_date included) and with a much larger LIMIT.
sql_df = pd.read_sql_query(
    "SELECT id,url,title,host,crawl_date FROM `NPLD-FC2017-20190228` WHERE host = '(theguardian.com independent.co.uk dailymail.co.uk express.co.uk thesun.co.uk mirror.co.uk dailystar.co.uk)' AND ((title = 'meghan' AND title = 'markle') OR (title = 'meghan' AND title = 'harry')) ORDER BY crawl_date LIMIT 100000",
    con=engine
)

sql_df

************************************
Query: SELECT id,url,title,host,crawl_date FROM `NPLD-FC2017-20190228` WHERE host = '(theguardian.com independent.co.uk dailymail.co.uk express.co.uk thesun.co.uk mirror.co.uk dailystar.co.uk)' AND ((title = 'meghan' AND title = 'markle') OR (title = 'meghan' AND title = 'harry')) ORDER BY crawl_date LIMIT 100000
************************************


Unnamed: 0,id,url,title,host,crawl_date
0,20170101102033/lhGw4C4wmG+0iFG0t7JrFw==,http://www.dailymail.co.uk/tvshowbiz/article-3...,Meghan Markle enjoys cocktails with Quantico's...,dailymail.co.uk,2017-01-01 10:20:33
1,20170101102049//M3a8PfNIu5zNH26wfStXA==,http://www.dailymail.co.uk/news/article-392542...,Prince Harry's girlfriend Meghan Markle spotte...,dailymail.co.uk,2017-01-01 10:20:49
2,20170101102057/Erluxdjjqr/jNL4hNzha4A==,http://www.dailymail.co.uk/news/article-402914...,Prince Harry and girlfriend Meghan Markle 'buy...,dailymail.co.uk,2017-01-01 10:20:57
3,20170101102101/z6CUHQC6PS+SWkzOnV6hLw==,http://www.dailymail.co.uk/news/article-402914...,Prince Harry and girlfriend Meghan Markle 'buy...,dailymail.co.uk,2017-01-01 10:21:01
4,20170101102123/Vs9UbyshtSk1D9ZNIyRFCA==,http://www.dailymail.co.uk/news/article-396227...,Prince Harry's girlfriend Meghan Markle says s...,dailymail.co.uk,2017-01-01 10:21:23
...,...,...,...,...,...
21051,20171227215813/rt9ttovXyXjBwGOzBKCWgg==,http://www.mirror.co.uk/news/uk-news/prince-ha...,Prince Harry WON'T take part in the traditiona...,mirror.co.uk,2017-12-27 21:58:13
21052,20171227215854/ZxKG1KVJAy95739ySnIbxg==,http://www.mirror.co.uk/3am/style/celebrity-fa...,"Meghan Markle is Hollywood perfection in £56,0...",mirror.co.uk,2017-12-27 21:58:54
21053,20171227231231/V2VEliskxTDJbIqlCcsF7w==,http://www.mirror.co.uk/3am/celebrity-news/por...,Porn searches for Meghan Markle go through the...,mirror.co.uk,2017-12-27 23:12:31
21054,20171228022515/n/RUSr7G6FbUZuSljxsDew==,http://www.mirror.co.uk/news/uk-news/meghan-ma...,Prince Harry reveals Meghan Markle's first Chr...,mirror.co.uk,2017-12-28 02:25:15


In [1]:
import sys
from sqlalchemy import create_engine
from sqlalchemy import inspect
import pandas as pd

# Show full cell contents instead of truncating long strings in displayed
# DataFrames. `None` is the supported "no limit" value; the old `-1` sentinel
# has been deprecated since pandas 1.0 and is no longer accepted by pandas 2.
pd.set_option('display.max_colwidth', None)

def crawl_log_sql(sql_url='solr://dev1.n45.wa.bl.uk:8913/solr/crawl_log_fc'):
    """Create a SolrSql client for the given Solr SQL URL.

    When called without arguments, the default URL (which points at the
    `crawl_log_fc` collection) is used.
    """
    client = SolrSql(sql_url)
    return client


class SolrSql():
    """Small helper that runs SQL against a Solr endpoint via SQLAlchemy.

    Parameters
    ----------
    sql_url : str
        URL handed to ``create_engine`` (e.g. ``solr://host:port/solr/name``).
    fillna : optional
        Value used to replace missing entries in query results. ``None`` and
        the empty string (the default) mean "leave missing values untouched".
        Any other value, including falsy ones such as ``0`` or ``False``, is
        passed to ``DataFrame.fillna``.
    """

    def __init__(self, sql_url, fillna=''):
        self.sql_url = sql_url
        self.engine = create_engine(self.sql_url)
        self.fillna = fillna

    def columns(self):
        """Return the column names of every table reported by the engine.

        Names are collected table by table, in the order the inspector
        returns them, into one flat list.
        """
        inspector = inspect(self.engine)
        return [
            column['name']
            for table_name in inspector.get_table_names()
            for column in inspector.get_columns(table_name)
        ]

    def _fill_requested(self):
        """Tell whether a fill value was configured.

        A plain truthiness test would wrongly ignore legitimate fill values
        such as ``0`` or ``False``, so only ``None`` and ``''`` count as
        "no fill".
        """
        fill = self.fillna
        if fill is None:
            return False
        if isinstance(fill, str):
            return fill != ''
        return True

    def query(self, sql):
        """Run ``sql`` and return the result as a pandas DataFrame.

        Returns ``None`` if the query raises: the error is reported on
        stderr as ``<ExceptionClass>: <message>`` instead of propagating.
        """
        try:
            sql_df = pd.read_sql_query(sql, con=self.engine)
        except Exception as e:
            # Deliberate best-effort: report the failure and hand back None.
            print("%s: %s" % (e.__class__.__name__, e), file=sys.stderr)
            return None

        if self._fill_requested():
            # Non-inplace fill; the returned frame is the filled copy.
            sql_df = sql_df.fillna(self.fillna)

        return sql_df

# Disabled example query (SQL goes to `.query()`; `crawl_log_sql()` itself
# takes the Solr URL, not a SQL statement):
#logs.query("SELECT * FROM crawl_log_fc WHERE annotations = 'Q:serverMaxSuccessKb' ORDER BY log_timestamp DESC LIMIT 10")
logs = crawl_log_sql()

# NOTE(review): the output recorded for this cell contains rows whose mimetype
# is `unknown`, `text/css` or `text/html`, so the LIKE 'image/*' predicate does
# not appear to restrict the results - confirm which wildcard syntax Solr SQL
# expects here.
image_log_sql = "SELECT * FROM crawl_log_fc WHERE mimetype LIKE 'image/*' ORDER BY `log_timestamp` DESC LIMIT 100"
logs.query(image_log_sql)


************************************
Query: SELECT * FROM crawl_log_fc WHERE mimetype LIKE 'image/*' ORDER BY `log_timestamp` DESC LIMIT 100
************************************


Unnamed: 0,dol,hop_path,log_timestamp,status_code,warc_filename,warc_length,annotations,content_digest,source,via,...,start_time,crawler,size,_version_,launch_timestamp,warc_type,mimetype,wire_bytes,_query_,score
0,,RLLLLLLL,2021-02-04 13:57:49.531,-5003,,,Q:serverMaxSuccessKb,,tid:65152:https://londongreenleft.blogspot.co.uk/,https://londongreenleft.blogspot.com/2015/03/photos-peoples-climate-march-london.html?m=0,...,NaT,Heritrix,,1691350466971041795,NaT,,unknown,,,1
1,,LLLEL,2021-02-04 13:57:49.529,-5003,,,Q:serverMaxSuccessKb,,tid:129958:https://podcasts.ox.ac.uk/mark-carney-climate-change,https://podcasts.ox.ac.uk/node/62668,...,NaT,Heritrix,,1691350466971041794,NaT,,unknown,,,1
2,,,2021-02-04 13:57:49.433,200,BL-NPLD-WEBRENDER-frequent-npld-20210119131039-20210204135046812-03619-n4o6ljbu.warc.gz,2295.0,,sha1:LCJKSW7MZUANWXCSTVZWFNEQUZUQG6G6,,,...,2021-02-04 13:56:19.836,WebRender,,1691350466971041793,NaT,response,,1930.0,,1
3,,RLLLLLLE,2021-02-04 13:57:49.400,-5003,,,Q:serverMaxSuccessKb,,tid:65152:https://londongreenleft.blogspot.co.uk/,https://londongreenleft.blogspot.com/2015/03/photos-peoples-climate-march-london.html?m=0,...,NaT,Heritrix,,1691350466971041792,NaT,,unknown,,,1
4,,LE,2021-02-04 13:57:49.399,200,,,,sha1:IUPDL5ISAUDVS6CHS6OB2RNVMB5QIXMX,tid:117325:https://blogs.glowscotland.org.uk/gc/hillheadhigh/,https://blogs.glowscotland.org.uk/gc/hillheadhigh/parent-zone/advice-and-support/substance-misuse/,...,2021-02-04 13:57:49.336,Heritrix,27154.0,1691350466969993220,NaT,,text/css,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,LLLLEIIIX,2021-02-04 13:57:40.838,-5003,,,Q:serverMaxSuccessKb,,tid:109300:https://www.express.co.uk/latest/coronavirus/,https://www.express.co.uk/videos/201911.xml,...,NaT,Heritrix,,1691350466951118850,NaT,,unknown,,,1
96,,LLLLEIIIX,2021-02-04 13:57:40.729,-5003,,,Q:serverMaxSuccessKb,,tid:109300:https://www.express.co.uk/latest/coronavirus/,https://www.express.co.uk/videos/201911.xml,...,NaT,Heritrix,,1691350466951118849,NaT,,unknown,,,1
97,,RIII,2021-02-04 13:57:40.697,200,,,,sha1:QMCIHKXWBR4IOP3ER3HF5U6HGSAP2NM7,tid:62987:http://www.celebsnow.co.uk/,https://www.celebsnow.co.uk/sitemap.xml?yyyy=2016&mm=03,...,2021-02-04 13:57:40.483,Heritrix,64904.0,1691350466951118848,2016-03-15 15:30:01.200,,text/html,,,1
98,,LLLLLLR,2021-02-04 13:57:40.694,200,,,,sha1:5FPEUN3T5XNNHYIGEJPXWZDU2OV2KZE6,tid:109600:https://acvo.org.uk/about/covid-19/,https://www.nhs.uk/service-search/other-services/Disambiguation/ResultView?locationId=1632&locationName=Bentworth&entityId=341&serviceName=Eating-disorder-support&latitude=51.158&longitude=-1.051,...,2021-02-04 13:57:40.205,Heritrix,49678.0,1691350466950070276,NaT,,text/html,,,1


In [2]:
# Count log entries annotated 'Q:serverMaxSuccessKb' per (host, source) pair
# and show the pairs with the highest counts first.
# The fragments below are joined by implicit string-literal concatenation.
top_capped_sql = (
    "SELECT host, source, count(*) as urls "
    "    FROM crawl_log_fc "
    "    WHERE annotations = 'Q:serverMaxSuccessKb' "
    "    GROUP BY host, source "
    "    ORDER BY urls DESC "
    "    LIMIT 20"
)
logs.query(top_capped_sql)

************************************
Query: SELECT host, source, count(*) as urls     FROM crawl_log_fc     WHERE annotations = 'Q:serverMaxSuccessKb'     GROUP BY host, source     ORDER BY urls DESC     LIMIT 20
************************************


Unnamed: 0,host,source,urls
0,storage.googleapis.com,tid:95604:http://www.ercc.scot/,207396
1,cdn11.bigcommerce.com,tid:129806:https://www.scottsofstow.co.uk/,192127
2,www.legislation.gov.uk,tid:96456:http://ukscblog.com/,124498
3,i.dailymail.co.uk,tid:109175:https://www.dailymail.co.uk/news/coronavirus/index.html,123462
4,i2-prod.grimsbytelegraph.co.uk,tid:31421:http://www.grimsbytelegraph.co.uk/,122552
5,medium.com,tid:100481:https://medium.com/@ncb1947,121702
6,deriv.nls.uk,tid:34936:https://orhighlights.wordpress.com/,114637
7,thumbnailer.mixcloud.com,tid:97162:https://www.mixcloud.com/,92439
8,www.almos.org.uk,tid:16876:http://www.almos.org.uk/,85113
9,res.cloudinary.com,tid:95238:https://www.waitrose.com/,84687


In [5]:
# Client for the `tracking` collection on the Solr 8 endpoint.
tracking = SolrSql('solr://solr8.api.wa.bl.uk:80/solr/tracking')

# List the available column names before querying.
tracking_columns = tracking.columns()
print(tracking_columns)

# Fetch two sample rows whose kind_s is 'warcs'.
sql_df = tracking.query("SELECT * FROM tracking WHERE kind_s = 'warcs' LIMIT 2")

sql_df

['_root_', '_version_', 'cdx_index_ss', 'cdx_records_checked_i', 'cdx_records_found_i', 'collection_s', 'file_ext_s', 'file_name_s', 'file_path_s', 'file_size_l', 'hdfs_group_s', 'hdfs_replicas_i', 'hdfs_user_s', 'id', 'job_s', 'kind_s', 'layout_s', 'modified_at_dt', 'permissions_s', 'recognised_b', 'refresh_date_dt', 'stream_s', 'timestamp_dt', 'year_i']
************************************
Query: SELECT * FROM tracking WHERE kind_s = 'warcs' LIMIT 2
************************************


Unnamed: 0,file_name_s,_root_,hdfs_user_s,stream_s,file_path_s,cdx_records_found_i,year_i,modified_at_dt,recognised_b,file_size_l,...,job_s,kind_s,cdx_index_ss,id,timestamp_dt,file_ext_s,permissions_s,refresh_date_dt,_query_,score
0,www.bl.uk-20150814093821.warc.gz,,hdfs,webrecorder,/1_data/npld/webrecorder/bl-your_stories/warcs/www.bl.uk-20150814093821.warc.gz,,2016,2016-12-30 11:59:00,True,12729735,...,bl-your_stories,warcs,[data-heritrix],hdfs://hdfs:54310/1_data/npld/webrecorder/bl-your_stories/warcs/www.bl.uk-20150814093821.warc.gz,2016-12-30 11:59:00,.bl.uk-20150814093821.warc.gz,-rw-r--r--,2021-02-11 08:17:08.836,,0
1,www.bl.uk-20150814094134.warc.gz,,hdfs,webrecorder,/1_data/npld/webrecorder/bl-your_stories/warcs/www.bl.uk-20150814094134.warc.gz,,2016,2016-12-30 11:59:00,True,95733865,...,bl-your_stories,warcs,[data-heritrix],hdfs://hdfs:54310/1_data/npld/webrecorder/bl-your_stories/warcs/www.bl.uk-20150814094134.warc.gz,2016-12-30 11:59:00,.bl.uk-20150814094134.warc.gz,-rw-r--r--,2021-02-11 08:17:08.836,,0
