In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# dependencies
import os
import subprocess

import pymongo as pm

In [3]:
def addToDB(DB_NAME,COL_NAME,PATH,FILE):
    '''
    Imports a file into mongoDB with the `mongoimport` command line tool
    
    Parameters:
    --------
    DB_NAME : Name of the database to connect to
    COL_NAME: Name of the collection to import into
    PATH    : Path to folder with the file; it is concatenated with FILE
              as-is, so it has to end with a path separator
    FILE    : Filename
    
    Returns:
    --------
    None. Prints the output of mongoimport, followed by a confirmation
    when the import succeeded or by the exit code when it failed.
    '''
    # Pass the arguments as a list (no shell involved) so that spaces or
    # shell metacharacters in any of the values cannot split or alter the
    # command.
    command = [
        'mongoimport',
        '--db', str(DB_NAME),
        '--collection', str(COL_NAME),
        '--file', str(PATH + FILE),
        '--batchSize', '1',
    ]

    try:
        # stderr is merged into stdout so that everything the tool reports
        # ends up in the cell output.
        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
    except FileNotFoundError:
        print('addToDB: mongoimport executable was not found')
        return

    if result.stdout:
        print(result.stdout, end='')

    # Only announce success when the tool actually reported success.
    if result.returncode != 0:
        print(f'addToDB: mongoimport exited with code {result.returncode} for {PATH + FILE}')
        return

    print(f'Collection {COL_NAME} in {DB_NAME} database created')

In [4]:
def feature_extractor(html):
    '''
    Parse html using newspaper
    
    Parameters:
    --------
    html   : 'string' with the html of one article page
    
    Returns:
    --------
    result : dictionary with the keys 'date', 'title', 'text', 'authors'
             and 'keywords'. 'date' is the publish date found by newspaper,
             else the date parsed from the 'dek___3AQpw' paragraph, else ''.
             When newspaper fails to process the html, only {'text': ''}
             is returned.
    
    '''
    import newspaper
    from bs4 import BeautifulSoup as bs
    from datetime import datetime
    
    article = newspaper.Article('')
    article.set_html(html)
    
    try:
        # build() is download() + parse() + nlp(); its download() step would
        # fetch article.url (the empty string here) instead of using the
        # html supplied through set_html(). Run only the remaining two steps
        # so the supplied html is what gets parsed.
        article.parse()
        article.nlp()
    except Exception as e:
        print(f'feature_extractor: None features found. Exception: {e}')
        return {'text':''}

    publish_date = article.publish_date

    # parse date manually if it wasn't found by newspaper
    if not publish_date:
        sp = bs(html, 'lxml')
        # 'dek___3AQpw' class appears on 30% of msnbc websites
        try:
            publish_date = datetime.strptime(
                sp.find('p', class_='dek___3AQpw').span.text,
                '%b.%d.%Y'
            )
        except (AttributeError, ValueError):
            # AttributeError: the paragraph or its span is missing (None)
            # ValueError    : the span text is not in '%b.%d.%Y' format
            publish_date = ''

    return {
            'date'    :publish_date,
            'title'   :article.title,
            'text'    :article.text,
            'authors' :article.authors,
            'keywords':article.keywords
    }

In [22]:
def docs_parser(htmlCol, skip=True):
    '''
    Parse mongo docs, extract features and update the doc with the features
    
    Parameters:
    --------
    htmlCol : mongodb collection, has to have documents with 'html' key
    skip    : skip html processing if meta key exists in a record, default "True"
    
    Returns:
    --------
    None. For every document with an 'html' key the extracted features are
    stored under the 'meta' key, unless the document is skipped or already
    holds identical meta. Progress is printed per document.
    '''
    
    try: 
        for doc in htmlCol.find():
            print(f"{htmlCol.name}:{doc['_id']}:")

            if 'html' not in doc:
                print(f"does not have html")
            elif 'meta' in doc and skip:
                print('Meta exists, skipping')
            else:
                # extract metadata from html
                meta = feature_extractor(doc['html'])

                if 'meta' in doc and doc['meta'] == meta:
                    print(f"has same meta")
                else:
                    # meta key is missing or the extracted meta differs:
                    # store the new one. doc['_id'] is used as returned by
                    # find(), so no ObjectId conversion (and import) is needed.
                    htmlCol.update_one(
                        {'_id': doc['_id']},
                        {'$set': {'meta': meta}}
                    )
                    print(f"saved meta")
            print('----------')
    except Exception as e:
        # best effort: report what went wrong instead of hiding it behind a
        # generic "no docs" message
        print(f"docs_parser: failed while processing collection {htmlCol.name}: {e!r}")

In [6]:
def show_doc(db, collection, id):
    '''
    Finds a document by 'id' and prints contents to the console
    
    Parameters:
    --------
    db         : database object that returns a collection when indexed
                 with the collection name
    collection : name of the mongodb collection (a collection object is
                 accepted as well and used directly)
    id         : mongodb document id
    
    Returns:
    --------
    Prints first 100 symbols of each document's key to console, or a
    notice when no document with that id exists
    '''
    from bson.objectid import ObjectId

    # look the collection up by the given name, not by the literal
    # string 'collection'
    col = db[collection] if isinstance(collection, str) else collection

    doc = col.find_one({'_id':ObjectId(id)})
    if doc is None:
        print(f"show_doc: no document with id {id} found")
        return

    for k in doc:
        print(f"{k} : {str(doc[k])[:100]}")

Some commands to keep the databases clean

In [None]:
# deletes the 'meta' field from all docs
# htmlCol.update({}, {$unset: {meta:1}}, false, true); # mongo shell command
# NOTE(review): htmlCol is not assigned in any visible cell; point it at the
# target collection before running this.
# update_many replaces the deprecated Collection.update(..., multi=True)
htmlCol.update_many({}, {'$unset': {'meta': 1}}) # pymongo way


In [None]:
# enforce uniqueness of the 'url' field through a unique index
# NOTE(review): a unique index does not remove duplicates that are already
# stored; the index build is presumably rejected while duplicates exist, so
# they have to be cleaned up beforehand - confirm against the MongoDB docs.
htmlCol.create_index("url", unique=True)

In [None]:
# `find` hands back a cursor; indexing the cursor with [0] yields the first
# matching document as a dictionary, and ['html'] reads the value stored
# under that key. The projection limits the returned fields to 'html'.
html = htmlCol.find(
    {'url':'http://www.msnbc.com/velshi-ruhle/watch/jeff-sessions-is-justifying-harsh-immigration-policy-with-the-bible-1256689731629'},
    projection={'html':True, '_id':False},
)[0]['html']

In [None]:
# print the urls of the first 20 documents whose 'html' does NOT match
# the 'tag' regular expression
import re
tag = re.compile('dek___3AQpw.')
docs = htmlCol.find({"html" : {'$not': tag}})
for d in docs[:20]:
    print(d['url'])


In [None]:
# NOTE(review): leftover IPython source introspection. `test` is first
# assigned in the following cell, so on a fresh kernel there is nothing to
# inspect yet - consider removing this cell.
??test

In [None]:
# newspaper is otherwise only imported inside feature_extractor, so this
# cell needs its own import to run on a fresh kernel
import newspaper

# build an article directly from its url to inspect what newspaper extracts
test = newspaper.Article(url='http://www.msnbc.com/rachel-maddow-show/defamation-lawsuit-grows-more-serious-trump')
test.build()

In [None]:
# publish date of the article built in the previous cell
print(test.publish_date)

In [None]:
# NOTE(review): the other cells read the date through `publish_date`;
# `date` does not appear to be an Article attribute - confirm, and drop
# this cell if it raises AttributeError.
test.date

In [None]:
# show the meta_data attribute of the article built above
test.meta_data

# **Production code**

In [16]:
# name of the database the cells below work on
DB_NAME = 'scrape'

# connect PyMongo to the MongoDB instance on localhost
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(conn, maxPoolSize=200)

# handle to the DB_NAME database
db = client[DB_NAME]

In [None]:
# import the json files of the 'left/' folder into the 'left' collection
path = 'db/json/by_media/'
left = 'left/'
# os.listdir instead of `!ls`: a missing folder raises here instead of the
# error text of ls being treated as file names. Hidden files are skipped
# and the names are sorted to keep the listing that ls would give.
lefts = sorted(f for f in os.listdir(path + left) if not f.startswith('.'))

for file in lefts:
    print(file)
    # the collection is named after the folder, without its trailing slash
    addToDB(DB_NAME, left.rstrip('/'), path + left, file)

In [None]:
# import the json files of the 'right/' folder into the 'right' collection
path = 'db/json/by_media/'
right = 'right/'
# os.listdir instead of `!ls`: a missing folder raises here instead of the
# error text of ls being treated as file names. Hidden files are skipped
# and the names are sorted to keep the listing that ls would give.
rights = sorted(f for f in os.listdir(path + right) if not f.startswith('.'))

for file in rights:
    print(file)
    # the collection is named after the folder, without its trailing slash
    addToDB(DB_NAME, right.rstrip('/'), path + right, file)

In [17]:
# extract features for the documents of both collections
for collection in ('left', 'right'):
    docs_parser(db[collection])

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2921ade9541e01584077b3:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2921a7e9541e01584077b2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2921b3e9541e01584077b4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2921a1e9541e01584077b1:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2921b9e9541e01584077b5:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29240ee9541e0158407817:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292408e9541e0158407816:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292414e9541e0158407818:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29242ce9541e015840781c:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292420e9541e015840781a:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2925c3e9541e0158407860:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2925f4e9541e0158407868:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2925cfe9541e0158407862:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2925eee9541e0158407867:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2925fae9541e0158407869:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29279ce9541e01584078ad:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292783e9541e01584078a9:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927b3e9541e01584078b1:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927b9e9541e01584078b2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927c5e9541e01584078b4:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2929bbe9541e0158407907:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2929c1e9541e0158407908:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2929d2e9541e015840790b:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2929d7e9541e015840790c:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2929b5e9541e0158407906:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d62e9541e015840799c:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d75e9541e015840799f:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d88e9541e01584079a2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d6fe9541e015840799e:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d5ce9541e015840799b:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292faae9541e01584079fa:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292fb7e9541e01584079fc:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292fcee9541e0158407a00:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292fd4e9541e0158407a01:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292fe6e9541e0158407a04:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293412e9541e0158407ab2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29341fe9541e0158407ab4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293436e9541e0158407ab8:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29344de9541e0158407abc:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293442e9541e0158407aba:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292760e9541e29f82c77a2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927cfe9541e29f82c77a4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927d5e9541e29f82c77a5:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927e6e9541e29f82c77a8:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2927dbe9541e29f82c77a6:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d3ee9541e29f82c7867:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d44e9541e29f82c7868:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d49e9541e29f82c7869:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d4fe9541e29f82c786a:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b292d55e9541e29f82c786b:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293463e9541e29f82c7972:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293469e9541e29f82c7973:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293474e9541e29f82c7974:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29347ae9541e29f82c7975:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293480e9541e29f82c7976:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29360ae9541e29f82c7998:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293bb7e9541e29f82c7a59:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293bbde9541e29f82c7a5a:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293bc8e9541e29f82c7a5b:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b293bcee9541e29f82c7a5c:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294186e9541e29f82c7b2b:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29418ce9541e29f82c7b2c:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294193e9541e29f82c7b2d:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294199e9541e29f82c7b2e:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29419fe9541e29f82c7b2f:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2946d0e9541e29f82c7bea:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2946d5e9541e29f82c7beb:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2946dbe9541e29f82c7bec:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2946e1e9541e29f82c7bed:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2946e7e9541e29f82c7bee:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294c50e9541e29f82c7cab:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294c5fe9541e29f82c7cad:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294c65e9541e29f82c7cae:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294c6be9541e29f82c7caf:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294c56e9541e29f82c7cac:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294fa0e9541e29f82c7d1e:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294fabe9541e29f82c7d20:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294fb1e9541e29f82c7d21:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294fbae9541e29f82c7d22:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b294fc1e9541e29f82c7d23:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295560e9541e29f82c7de5:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295566e9541e29f82c7de6:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29556ce9541e29f82c7de7:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29557fe9541e29f82c7de9:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295572e9541e29f82c7de8:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295b21e9541e29f82c7ea6:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295b28e9541e29f82c7ea7:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295b2ee9541e29f82c7ea8:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295b36e9541e29f82c7ea9:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295b3ce9541e29f82c7eaa:
Meta exi

----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295f9de9541e29f82c7f46:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295fd1e9541e29f82c7f4e:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295fd6e9541e29f82c7f4f:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295fdce9541e29f82c7f50:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b295fefe9541e29f82c7f5

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296480e9541e29f82c7ff3:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296487e9541e29f82c7ff4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29648ee9541e29f82c7ff5:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296495e9541e29f82c7ff6:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29649be9541e29f82c7ff7:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296bf8e9541e29f82c80f4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296c03e9541e29f82c80f5:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296c0ae9541e29f82c80f6:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296c10e9541e29f82c80f7:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b296c17e9541e29f82c80f8:
Meta exi

Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2971eae9541e29f82c81c0:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2971efe9541e29f82c81c1:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b2971f5e9541e29f82c81c2:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297207e9541e29f82c81c4:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297637e9541e29f82c824f:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297a9fe9541e29f82c82dd:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297aa6e9541e29f82c82de:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297ab6e9541e29f82c82df:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297abce9541e29f82c82e0:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b297e4ce9541e29f82c8357:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298021e9541e29f82c8399:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298013e9541e29f82c8398:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298032e9541e29f82c839a:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298038e9541e29f82c839b:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29842fe9541e29f82c840d:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298788e9541e29f82c8479:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29878ee9541e29f82c847a:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298782e9541e29f82c8478:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298796e9541e29f82c847b:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298d4de9541e29f82c8537:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298d53e9541e29f82c8538:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298d59e9541e29f82c8539:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298d64e9541e29f82c853a:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b298d6ae9541e29f82c853b:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29967ee9541e29f82c8670:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299684e9541e29f82c8671:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299692e9541e29f82c8673:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299698e9541e29f82c8674:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b29968be9541e29f82c8672:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299b7ae9541e29f82c871b:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299b80e9541e29f82c871c:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299b86e9541e29f82c871d:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299b8ce9541e29f82c871e:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'left'):5b299b91e9541e29f82c871f:
Meta exi

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'right'):5b256deaa4b51f412016c2fc:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'right'):5b256de5a4b51f412016c2fb:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'right'):5b256defa4b51f412016c2fd:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'right'):5b256df4a4b51f412016c2fe:
Meta exists, skipping
----------
Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, maxpoolsize=200), 'scrape'), 'right'):5b256df9a4b51f412016c2ff:
Met

In [24]:
from multiprocessing import Process

# use multiprocessing to extract features
def func():
    '''
    Run docs_parser on the 'left' and 'right' collections of the 'scrape'
    database.

    Opens its own MongoClient, because the function is meant to be the
    target of a separate process, and closes it once both collections
    are done or an error occurred.
    '''
    DB_NAME = 'scrape'
    client = pm.MongoClient(host='localhost', port=27017, maxPoolSize=500)

    try:
        db = client[DB_NAME]
        for collection in ['left','right']:
            docs_parser(db[collection])
    finally:
        # release the connections held by this client
        client.close()

# run func in a separate process; the process is started here but not
# joined in this cell
proc = Process(target=func)
proc.start()

left:5b2921ade9541e01584077b3:
Meta exists, skipping
----------
left:5b2921a7e9541e01584077b2:
Meta exists, skipping
----------
left:5b2921b3e9541e01584077b4:
Meta exists, skipping
----------
left:5b2921a1e9541e01584077b1:
Meta exists, skipping
----------
left:5b2921b9e9541e01584077b5:
Meta exists, skipping
----------
left:5b2921c5e9541e01584077b7:
Meta exists, skipping
----------
left:5b2921bfe9541e01584077b6:
Meta exists, skipping
----------
left:5b2921cbe9541e01584077b8:
Meta exists, skipping
----------
left:5b2921d6e9541e01584077ba:
Meta exists, skipping
----------
left:5b2921d0e9541e01584077b9:
Meta exists, skipping
----------
left:5b2921dce9541e01584077bb:
Meta exists, skipping
----------
left:5b2921e2e9541e01584077bc:
Meta exists, skipping
----------
left:5b2921eee9541e01584077be:
Meta exists, skipping
----------
left:5b2921e8e9541e01584077bd:
Meta exists, skipping
----------
left:5b2921f4e9541e01584077bf:
Meta exists, skipping
----------
left:5b2921fae9541e01584077c0:
Meta exis