In [156]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [157]:
# dependencies
import pymongo as pm

In [181]:
def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to create
    PATH    : Path to folder with the file
    FILE  : Filename
    
    Returns:
    --------
    Collection COL_NAME in DB_NAME database
    '''
    !mongoimport --db {DB_NAME} --collection {COL_NAME} --file {PATH+FILE} --batchSize 1
    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [160]:
def feature_extractor(html):
    '''
    Parse html using newspaper
    
    Parameters:
    --------
    html   : 'string'
    
    Returns:
    --------
    result : dictionary
    
    '''
    import newspaper
    from bs4 import BeautifulSoup as bs
    from datetime import datetime
    
    article = newspaper.Article('')
    article.set_html(html)
    article.build()
    
    # parse date manually if it wasn't found by newspaper
    if not article.publish_date:
        sp = bs(html, 'lxml')
        # 'dek___3AQpw' class appears on 30% of msnbc websites
        try:
            publish_date = datetime.strptime(\
                               (sp.find('p', class_='dek___3AQpw').span.text),\
                               '%b.%d.%Y'\
                            )
        except:
            publish_date = ''
    else:
        publish_date = article.publish_date


    return {
            'date'    :publish_date,
            'title'   :article.title,
            'text'    :article.text,
            'authors' :article.authors,
            'keywords':article.keywords
    }

In [175]:
def docs_parser(htmlCol):
    '''
    Parse mongo docs, extract features and update the doc with the features
    
    Parameters:
    --------
    htmlCol : mongodb collection, has to have documents with 'html' key
    
    Returns:
    --------
    updates all documents in the collection
    '''

    for doc in htmlCol.find():
        print(f"{doc['_id']}:")
            
        if 'html' in doc:
            # extract metadata from html
            meta = feature_extractor(doc['html'])
            
            try:
                if meta == doc['meta']:
                    print(f"has same meta")
            except:
                # if there is newer meta data or meta key is not existing
                htmlCol.update_one(
                    {'_id':ObjectId(doc['_id'])},
                    {'$set' : {
                              'meta' : meta
                              }
                    }
                )
                print(f"updated meta")
        else:
            print(f"does not have html")
        print('----------')

In [162]:
def show_doc(id):
    '''
    Finds a document by 'id' and prints contents to the console
    
    Parameters:
    --------
    id : mongodb document id
    
    Returns:
    --------
    Prints first 100 symbols of each document's key to console
    '''
    from bson.objectid import ObjectId
    doc = htmlCol.find_one({'_id':ObjectId('5b243fbc897e82028b8ffe52')})
    for k in doc:
        print(f"{k} : {str(doc[k])[:100]}")

Some comands to keep dbs clean

In [98]:
# deletes all 'meta' fields from all docs
# htmlCol.update({}, {$unset: {meta:1}}, false, true); # mongo shell comand
htmlCol.update({}, {'$unset': {'meta':1}}, multi=True) # pymongo way


  This is separate from the ipykernel package so we can avoid doing imports until


{'n': 1586, 'nModified': 1586, 'ok': 1.0, 'updatedExisting': True}

In [105]:
# leaves only unique documents by 'url' field

htmlCol.create_index(
    "url",
    unique=True
)

'url_1'

In [None]:
# pymongo 'find' returns cursor that allows iterating through results
# calling first object [0] allows accessing the dictionary with results
# the ['html'] is the key in the dictionary
html = htmlCol.find({'url':'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},\
            projection={'html':True, '_id':False})[0]['html']

In [127]:
# find documents NOT containing a 'tag': regex expression
import re
tag = re.compile('dek___3AQpw.')
docs = htmlCol.find({"html" : {'$not': tag}})
for d in docs[:20]: print(d['url'])


http://www.msnbc.com/
http://www.msnbc.com/{{path.prefix}}/transcripts
http://www.msnbc.com/home2?page=1
http://www.msnbc.com/rachel-maddow-show/trump-jr-offers-the-wrong-response-concerns-the-gop-cult
http://www.msnbc.com/explore
http://www.msnbc.com/rachel-maddow-show/president-tries-brush-lying-about-infamous-trump-tower-meeting
http://www.msnbc.com/#main-menu
http://www.msnbc.com/{{path.prefix}}/transcripts#main-menu
https://www.facebook.com/sharer/sharer.php?u=https://www.msnbc.com/the-last-word/watch/lawrence-trump-tries-to-steal-the-grief-of-fallen-soldiers-parents-1256235075972
http://www.msnbc.com/rachel-maddow-show/the-good-question-team-trump-considers-ridiculous-and-ludicrous
http://www.msnbc.com/guns
https://twitter.com/intent/tweet?text=Trump%20calls%20IG%20report%20'horror%20show'%2C%20blames%20Dems%20for%20separations&via=msnbc&url=https://www.msnbc.com/morning-joe/watch/trump-calls-ig-report-horror-show-blames-dems-for-separations-1256477251672&original_referer=https:/

In [155]:
??test

In [None]:
test = newspaper.Article(url='http://www.msnbc.com/rachel-maddow-show/defamation-lawsuit-grows-more-serious-trump')
test.build()

In [146]:
print(test.publish_date)

None


In [150]:
test.date

AttributeError: 'Article' object has no attribute 'date'

In [147]:
test.meta_data

defaultdict(dict,
            {'og': {'url': 'http://www.msnbc.com/rachel-maddow-show/defamation-lawsuit-grows-more-serious-trump',
              'type': 'article',
              'site_name': 'MSNBC',
              'title': 'Defamation lawsuit grows more serious for Trump',
              'description': "As if Donald Trump's lawyers weren't already busy, Summer Zervos' civil suit creates a real threat -- which isn't going away.",
              'image': 'http://www.msnbc.com/sites/msnbc/files/styles/ratio--1_91-1--1200x630/public/06525535.jpg?itok=lrFK_ych'},
             'twitter': {'card': 'summary_large_image',
              'site': '@msnbc',
              'creator': {'identifier': '@msnbc', 'id': 2836421},
              'url': 'http://www.msnbc.com/rachel-maddow-show/defamation-lawsuit-grows-more-serious-trump',
              'account_id': 2836421,
              'domain': 'msnbc.com',
              'title': 'Defamation lawsuit grows more serious for Trump',
              'description

# **Production code**

In [193]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn)

# define db 
DB_NAME = 'scrape'
db = client[DB_NAME]

In [196]:
# import json files into db
left_path = 'db/json/by_media/left/' 
lefts = !ls {left_path}

for file in lefts: 
    print(file)
    COL_NAME='left'
    addToDB(DB_NAME, COL_NAME, left_path, file)

NR_DailyB_Intercept.json
2018-06-26T21:40:06.483-0400	connected to: localhost
2018-06-26T21:40:09.466-0400	[#########...............] scrape.left	317MB/828MB (38.3%)
2018-06-26T21:40:12.466-0400	[###################.....] scrape.left	664MB/828MB (80.2%)
2018-06-26T21:40:13.946-0400	[########################] scrape.left	828MB/828MB (100.0%)
2018-06-26T21:40:13.946-0400	imported 4768 documents
Collection left in scrape database created
bbc.json
2018-06-26T21:40:14.123-0400	connected to: localhost
2018-06-26T21:40:17.112-0400	[#####...................] scrape.left	358MB/1.66GB (21.1%)
2018-06-26T21:40:20.113-0400	[#########...............] scrape.left	685MB/1.66GB (40.4%)
2018-06-26T21:40:23.110-0400	[##############..........] scrape.left	1.01GB/1.66GB (61.0%)
2018-06-26T21:40:26.107-0400	[###################.....] scrape.left	1.33GB/1.66GB (80.3%)
2018-06-26T21:40:29.062-0400	[########################] scrape.left	1.66GB/1.66GB (100.0%)
2018-06-26T21:40:29.062-0400	imported 7921 documen

In [None]:
# import json files into db
right_path = 'db/json/by_media/left/' 
rights = !ls {right_path}

for file in rights: 
    print(file)
    COL_NAME='right'
    addToDB(DB_NAME, COL_NAME, right_path, file)

NR_DailyB_Intercept.json
2018-06-26T21:42:07.096-0400	connected to: localhost
2018-06-26T21:42:10.083-0400	[########................] scrape.right	292MB/828MB (35.2%)
2018-06-26T21:42:13.079-0400	[###############.........] scrape.right	548MB/828MB (66.2%)
2018-06-26T21:42:16.079-0400	[######################..] scrape.right	774MB/828MB (93.5%)
2018-06-26T21:42:16.807-0400	[########################] scrape.right	828MB/828MB (100.0%)
2018-06-26T21:42:16.807-0400	imported 4768 documents
Collection right in scrape database created
bbc.json
2018-06-26T21:42:17.022-0400	connected to: localhost
2018-06-26T21:42:20.001-0400	[####....................] scrape.right	296MB/1.66GB (17.5%)
^C
2018-06-26T21:42:20.737-0400	signal 'interrupt' received; forcefully terminating
Collection right in scrape database created
guardian.json
2018-06-26T21:42:20.914-0400	connected to: localhost


In [180]:
FILE = 'msnbc.json'

addToDB(DB_NAME,COL_NAME,PATH,FILE)

In [176]:
docs_parser(htmlCol)

5b243fb2897e82028b8ffe50:
has same meta
----------
5b243fb7897e82028b8ffe51:
has same meta
----------
5b243fbc897e82028b8ffe52:
has same meta
----------
5b243fc1897e82028b8ffe53:
has same meta
----------
5b243fc6897e82028b8ffe54:
has same meta
----------
5b243fd0897e82028b8ffe55:
has same meta
----------
5b243fd5897e82028b8ffe56:
has same meta
----------
5b243fda897e82028b8ffe57:
has same meta
----------
5b243ff1897e82028b8ffe5b:
has same meta
----------
5b243fec897e82028b8ffe5a:
has same meta
----------
5b243fe0897e82028b8ffe58:
has same meta
----------
5b243fe7897e82028b8ffe59:
has same meta
----------
5b244001897e82028b8ffe5e:
has same meta
----------
5b243ff6897e82028b8ffe5c:
has same meta
----------
5b243ffc897e82028b8ffe5d:
has same meta
----------
5b244007897e82028b8ffe5f:
has same meta
----------
5b244011897e82028b8ffe61:
has same meta
----------
5b24401b897e82028b8ffe63:
has same meta
----------
5b24400c897e82028b8ffe60:
has same meta
----------
5b244025897e82028b8ffe65:
has s

KeyboardInterrupt: 

In [122]:
show_doc('5b243fb2897e82028b8ffe50')

_id : 5b243fbc897e82028b8ffe52
url : https://www.msnbc.com/all-in/watch/white-house-can-t-fill-open-positions-turns-to-job-fair-125617056
html : <!DOCTYPE html><html lang="en" data-reactroot=""><head><title data-rh="true">White House can&#x27;t 
