In [1]:
# enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell is executed, so changes to local .py files are picked up
%load_ext autoreload
%autoreload 2

In [2]:
# dependencies
import pymongo as pm

In [3]:
# connection settings: MongoDB server URI and the database to work in
conn = 'mongodb://localhost:27017'
DB_NAME = 'scrape'

# open the PyMongo client and take a handle to the database
client = pm.MongoClient(conn)
db = client.get_database(DB_NAME)

In [4]:
# collection that holds the scraped HTML pages
COL_NAME = 'msnbc'
htmlCol = db.get_collection(COL_NAME)

In [6]:
# import db into local mongo instance
# PATH + source is the JSON dump that gets loaded into {DB_NAME}.{COL_NAME}
PATH = 'db/json/by_media/left/'
source = 'msnbc.json'

# '!' runs the command in a shell; IPython substitutes the {...} expressions
# with the values of the Python variables defined above.
# NOTE(review): --batchSize 1 presumably keeps every insert small because each
# document carries a whole HTML page - confirm before changing it
!mongoimport --db {DB_NAME} --collection {COL_NAME} --file {PATH+source} --batchSize 1

2018-06-16T21:33:37.706-0400	connected to: localhost
2018-06-16T21:33:40.690-0400	[################........] scrape.msnbc	338MB/485MB (69.8%)
2018-06-16T21:33:41.910-0400	[########################] scrape.msnbc	485MB/485MB (100.0%)
2018-06-16T21:33:41.911-0400	imported 1586 documents


In [46]:
def feature_extractor(html):
    '''
    Parse an article's raw html using newspaper and collect its features
    
    Parameters:
    --------
    html   : 'string', raw html of a single article page
    
    Returns:
    --------
    result : dictionary with a single 'meta' key that holds
             'date'     : publish date found by newspaper; if newspaper found
                          none, the date parsed from the 'dek___3AQpw'
                          paragraph; '' if that fallback fails as well
             'title'    : article.title
             'text'     : article.text
             'authors'  : article.authors
             'keywords' : article.keywords
    
    '''
    import newspaper
    from bs4 import BeautifulSoup as bs
    from datetime import datetime
    
    # the html is supplied directly, so the Article is created with an empty url
    article = newspaper.Article('')
    article.set_html(html)
    # NOTE(review): build() is relied on to parse the supplied html and fill
    # the fields read below - confirm it does not try to download the empty url
    article.build()
    
    publish_date = article.publish_date
    
    # parse date manually if it wasn't found by newspaper
    if not publish_date:
        sp = bs(html, 'lxml')
        # 'dek___3AQpw' class appears on 30% of msnbc websites
        try:
            date_text = sp.find('p', class_='dek___3AQpw').span.text
            publish_date = datetime.strptime(date_text, '%b.%d.%Y')
        except (AttributeError, ValueError):
            # AttributeError: the paragraph or its <span> is missing (None)
            # ValueError    : the text does not match the '%b.%d.%Y' format
            # anything else (e.g. a kernel interrupt) is left to propagate
            publish_date = ''

    return {
        'meta':{
            'date'    :publish_date,
            'title'   :article.title,
            'text'    :article.text,
            'authors' :article.authors,
            'keywords':article.keywords
        }
    }

In [71]:
def docs_parser(htmlCol):
    '''
    Parse mongo docs, extract features and update the doc with the features
    
    Documents that already have a 'meta' key are left untouched, so the
    function can be re-run after an interruption. Progress is printed to the
    console for every document.
    
    Parameters:
    --------
    htmlCol : mongodb collection, has to have documents with 'html' key
    
    Returns:
    --------
    None; sets 'meta' on every document that has 'html' but no 'meta' yet
    '''

    for doc in htmlCol.find():
        print(f"{doc['_id']}:")
        if 'meta' in doc:
            print("has meta")
        elif 'html' in doc:
            # filter on the '_id' exactly as find() returned it: no ObjectId
            # import is needed and the filter matches whatever type is stored
            # NOTE(review): feature_extractor() returns {'meta': {...}}, so the
            # features end up under 'meta.meta'; kept as is for compatibility
            # with documents that were already processed
            htmlCol.update_one(
                {'_id': doc['_id']},
                {'$set': {'meta': feature_extractor(doc['html'])}}
            )
            print("added meta")
        else:
            print("does not have html")
        print('----------')

In [85]:
def show_doc(id):
    '''
    Finds a document by 'id' in the global 'htmlCol' collection and prints
    its contents to the console
    
    Parameters:
    --------
    id : mongodb document id (anything ObjectId() accepts, e.g. a 24-character
         hex string)
    
    Returns:
    --------
    None; prints the first 100 characters of each field's value, or a
    'no document' message when the id is not in the collection
    '''
    from bson.objectid import ObjectId

    # look up the id that was passed in
    doc = htmlCol.find_one({'_id': ObjectId(id)})
    if doc is None:
        # find_one returns None when nothing matches
        print(f"no document with _id {id}")
        return
    for k in doc:
        print(f"{k} : {str(doc[k])[:100]}")

In [72]:
# add 'meta' to every document in the collection that has 'html' but no 'meta' yet
docs_parser(htmlCol)

5b243fb2897e82028b8ffe50:
has meta
----------
5b243fb7897e82028b8ffe51:
has meta
----------
5b243fbc897e82028b8ffe52:
has meta
----------
5b243fc1897e82028b8ffe53:
added meta
----------
5b243fc6897e82028b8ffe54:
added meta
----------
5b243fd0897e82028b8ffe55:
added meta
----------
5b243fd5897e82028b8ffe56:
added meta
----------
5b243fda897e82028b8ffe57:
added meta
----------
5b243ff1897e82028b8ffe5b:
added meta
----------
5b243fec897e82028b8ffe5a:
added meta
----------
5b243fe0897e82028b8ffe58:
added meta
----------
5b243fe7897e82028b8ffe59:
added meta
----------
5b244001897e82028b8ffe5e:
added meta
----------
5b243ff6897e82028b8ffe5c:
added meta
----------
5b243ffc897e82028b8ffe5d:
added meta
----------
5b244007897e82028b8ffe5f:
added meta
----------
5b244011897e82028b8ffe61:
added meta
----------
5b24401b897e82028b8ffe63:
added meta
----------
5b24400c897e82028b8ffe60:
added meta
----------
5b244025897e82028b8ffe65:
added meta
----------
5b244016897e82028b8ffe62:
added meta
---------

added meta
----------
5b2443ab897e82028b8ffef8:
added meta
----------
5b2443d0897e82028b8ffefe:
added meta
----------
5b2443ca897e82028b8ffefd:
added meta
----------
5b2443e7897e82028b8fff01:
added meta
----------
5b2443d5897e82028b8ffeff:
added meta
----------
5b2443e2897e82028b8fff00:
added meta
----------
5b2443ec897e82028b8fff02:
added meta
----------
5b2443fd897e82028b8fff05:
added meta
----------
5b244403897e82028b8fff06:
added meta
----------
5b2443f1897e82028b8fff03:
added meta
----------
5b2443f8897e82028b8fff04:
added meta
----------
5b244409897e82028b8fff07:
added meta
----------
5b244433897e82028b8fff0c:
added meta
----------
5b244438897e82028b8fff0d:
added meta
----------
5b244449897e82028b8fff10:
added meta
----------
5b24444e897e82028b8fff11:
added meta
----------
5b24440e897e82028b8fff08:
added meta
----------
5b24443d897e82028b8fff0e:
added meta
----------
5b24442d897e82028b8fff0b:
added meta
----------
5b244423897e82028b8fff09:
added meta
----------
5b244454897e82028b

added meta
----------
5b2447e2897e82028b8fffab:
added meta
----------
5b2447e7897e82028b8fffac:
added meta
----------
5b2447d6897e82028b8fffa9:
added meta
----------
5b2447ca897e82028b8fffa7:
added meta
----------
5b2447ed897e82028b8fffad:
added meta
----------
5b2447ff897e82028b8fffb0:
added meta
----------
5b2447f3897e82028b8fffae:
added meta
----------
5b2447f9897e82028b8fffaf:
added meta
----------
5b244804897e82028b8fffb1:
added meta
----------
5b244815897e82028b8fffb4:
added meta
----------
5b24481c897e82028b8fffb5:
added meta
----------
5b24480b897e82028b8fffb2:
added meta
----------
5b244810897e82028b8fffb3:
added meta
----------
5b24482e897e82028b8fffb8:
added meta
----------
5b244822897e82028b8fffb6:
added meta
----------
5b244848897e82028b8fffbc:
added meta
----------
5b244842897e82028b8fffbb:
added meta
----------
5b244828897e82028b8fffb7:
added meta
----------
5b24483b897e82028b8fffba:
added meta
----------
5b24484e897e82028b8fffbd:
added meta
----------
5b244834897e82028b

added meta
----------
5b244be5897e82028b900051:
added meta
----------
5b244c02897e82028b900056:
added meta
----------
5b244c14897e82028b900059:
added meta
----------
5b244c0e897e82028b900058:
added meta
----------
5b244bfc897e82028b900055:
added meta
----------
5b244c08897e82028b900057:
added meta
----------
5b244c1a897e82028b90005a:
added meta
----------
5b244c25897e82028b90005c:
added meta
----------
5b244c37897e82028b90005e:
added meta
----------
5b244c1f897e82028b90005b:
added meta
----------
5b244c2c897e82028b90005d:
added meta
----------
5b244c3d897e82028b90005f:
added meta
----------
5b244c43897e82028b900060:
added meta
----------
5b244c49897e82028b900061:
added meta
----------
5b244c5a897e82028b900064:
added meta
----------
5b244c55897e82028b900063:
added meta
----------
5b244c4e897e82028b900062:
added meta
----------
5b244c78897e82028b900069:
added meta
----------
5b244c66897e82028b900066:
added meta
----------
5b244c61897e82028b900065:
added meta
----------
5b244c72897e82028b

added meta
----------
5b244ff2897e82028b9000f9:
added meta
----------
5b245034897e82028b900103:
added meta
----------
5b24503a897e82028b900104:
added meta
----------
5b245040897e82028b900105:
added meta
----------
5b245046897e82028b900106:
added meta
----------
5b245028897e82028b900101:
added meta
----------
5b24504c897e82028b900107:
added meta
----------
5b245052897e82028b900108:
added meta
----------
5b24502f897e82028b900102:
added meta
----------
5b24505d897e82028b90010a:
added meta
----------
5b245069897e82028b90010c:
added meta
----------
5b245058897e82028b900109:
added meta
----------
5b245064897e82028b90010b:
added meta
----------
5b245070897e82028b90010d:
added meta
----------
5b245076897e82028b90010e:
added meta
----------
5b24507d897e82028b90010f:
added meta
----------
5b24508e897e82028b900112:
added meta
----------
5b245083897e82028b900110:
added meta
----------
5b245088897e82028b900111:
added meta
----------
5b24509b897e82028b900114:
added meta
----------
5b245096897e82028b

added meta
----------
5b245456897e82028b9001ad:
added meta
----------
5b245439897e82028b9001a8:
added meta
----------
5b24543e897e82028b9001a9:
added meta
----------
5b24546d897e82028b9001b1:
added meta
----------
5b245463897e82028b9001af:
added meta
----------
5b245468897e82028b9001b0:
added meta
----------
5b245473897e82028b9001b2:
added meta
----------
5b245478897e82028b9001b3:
added meta
----------
5b24545c897e82028b9001ae:
added meta
----------
5b24548b897e82028b9001b6:
added meta
----------
5b245490897e82028b9001b7:
added meta
----------
5b245496897e82028b9001b8:
added meta
----------
5b24549c897e82028b9001b9:
added meta
----------
5b24547f897e82028b9001b4:
added meta
----------
5b245485897e82028b9001b5:
added meta
----------
5b2454af897e82028b9001bc:
added meta
----------
5b2454a2897e82028b9001ba:
added meta
----------
5b2454a8897e82028b9001bb:
added meta
----------
5b2454c1897e82028b9001bf:
added meta
----------
5b2454b5897e82028b9001bd:
added meta
----------
5b2454cb897e82028b

added meta
----------
5b245893897e82028b900257:
added meta
----------
5b2458a5897e82028b90025a:
added meta
----------
5b2458ab897e82028b90025b:
added meta
----------
5b245899897e82028b900258:
added meta
----------
5b24589f897e82028b900259:
added meta
----------
5b2458c0897e82028b90025f:
added meta
----------
5b2458b0897e82028b90025c:
added meta
----------
5b2458c5897e82028b900260:
added meta
----------
5b2458db897e82028b900263:
added meta
----------
5b2458b6897e82028b90025d:
added meta
----------
5b2458bb897e82028b90025e:
added meta
----------
5b2458d1897e82028b900262:
added meta
----------
5b2458e1897e82028b900264:
added meta
----------
5b2458e6897e82028b900265:
added meta
----------
5b2458cb897e82028b900261:
added meta
----------
5b2458f4897e82028b900267:
added meta
----------
5b2458fa897e82028b900268:
added meta
----------
5b2458ee897e82028b900266:
added meta
----------
5b245901897e82028b900269:
added meta
----------
5b245908897e82028b90026a:
added meta
----------
5b245913897e82028b

added meta
----------
5b245cd2897e82028b900303:
added meta
----------
5b245cbd897e82028b900300:
added meta
----------
5b245cdf897e82028b900305:
added meta
----------
5b245ce5897e82028b900306:
added meta
----------
5b245cea897e82028b900307:
added meta
----------
5b245cd9897e82028b900304:
added meta
----------
5b245d2e897e82028b90030a:
added meta
----------
5b245d22897e82028b900308:
added meta
----------
5b245d28897e82028b900309:
added meta
----------
5b245d35897e82028b90030b:
added meta
----------
5b245d46897e82028b90030e:
added meta
----------
5b245d3a897e82028b90030c:
added meta
----------
5b245d54897e82028b900310:
added meta
----------
5b245d5a897e82028b900311:
added meta
----------
5b245d40897e82028b90030d:
added meta
----------
5b245d4d897e82028b90030f:
added meta
----------
5b245d5f897e82028b900312:
added meta
----------
5b245d71897e82028b900315:
added meta
----------
5b245d65897e82028b900313:
added meta
----------
5b245d7d897e82028b900317:
added meta
----------
5b245d83897e82028b

added meta
----------
5b246143897e82028b9003ad:
added meta
----------
5b24614e897e82028b9003af:
added meta
----------
5b246162897e82028b9003b2:
added meta
----------
5b246155897e82028b9003b0:
added meta
----------
5b24616d897e82028b9003b4:
added meta
----------
5b24615c897e82028b9003b1:
added meta
----------
5b24617b897e82028b9003b6:
added meta
----------
5b246180897e82028b9003b7:
added meta
----------
5b246168897e82028b9003b3:
added meta
----------
5b246186897e82028b9003b8:
added meta
----------
5b246175897e82028b9003b5:
added meta
----------
5b2461a2897e82028b9003bb:
added meta
----------
5b24618c897e82028b9003b9:
added meta
----------
5b24619c897e82028b9003ba:
added meta
----------
5b2461b6897e82028b9003be:
added meta
----------
5b2461bb897e82028b9003bf:
added meta
----------
5b2461aa897e82028b9003bc:
added meta
----------
5b2461c1897e82028b9003c0:
added meta
----------
5b2461cc897e82028b9003c2:
added meta
----------
5b2461d7897e82028b9003c4:
added meta
----------
5b2461d1897e82028b

added meta
----------
5b2465a0897e82028b90045a:
added meta
----------
5b2465a5897e82028b90045b:
added meta
----------
5b2465b1897e82028b90045d:
added meta
----------
5b2465b7897e82028b90045e:
added meta
----------
5b2465bc897e82028b90045f:
added meta
----------
5b2465ab897e82028b90045c:
added meta
----------
5b2465cc897e82028b900461:
added meta
----------
5b2465c6897e82028b900460:
added meta
----------
5b2465ea897e82028b900466:
added meta
----------
5b2465de897e82028b900464:
added meta
----------
5b2465f0897e82028b900467:
added meta
----------
5b2465e4897e82028b900465:
added meta
----------
5b2465d2897e82028b900462:
added meta
----------
5b2465fc897e82028b900469:
added meta
----------
5b2465d9897e82028b900463:
added meta
----------
5b246602897e82028b90046a:
added meta
----------
5b246608897e82028b90046b:
added meta
----------
5b2465f5897e82028b900468:
added meta
----------
5b24661e897e82028b90046e:
added meta
----------
5b246619897e82028b90046d:
added meta
----------
5b246624897e82028b

In [86]:
# spot-check one document by its id
show_doc('5b243fbc897e82028b8ffe52')

_id : 5b243fbc897e82028b8ffe52
url : https://www.msnbc.com/all-in/watch/white-house-can-t-fill-open-positions-turns-to-job-fair-125617056
html : <!DOCTYPE html><html lang="en" data-reactroot=""><head><title data-rh="true">White House can&#x27;t 
meta : {'meta': {'date': datetime.datetime(2018, 6, 15, 0, 0), 'title': "White House can't fill open positi


In [None]:
# find() hands back a cursor over the matching documents:
#   [0]       takes the first match as a dictionary
#   ['html']  reads the page source stored under that key
# the projection keeps only the 'html' field in the returned document
html = htmlCol.find(
    {'url': 'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},
    projection={'html': True, '_id': False},
)[0]['html']